In [1]:
import os
import sys
sys.path.append("../")

import pandas as pd
from tqdm import tqdm

from langchain.document_loaders import PyPDFLoader

from utils.arxiv_utils import get_inspire_hep_papers, extract_arxiv_ids, download_arxiv_source, remove_latex_preamble
from utils.db_utils import update_dataframe, delete_files_except_extensions, get_filenames_with_extensions, scrape_website_text

%load_ext autoreload
%autoreload 2

In [2]:
victim = "Oppie"
victim_inspire_ID = "J.Robert.Oppenheimer.1"
cutoff_year = None

pdf_dir = f'../data/{victim}/papers/'  # directory to store PDFs
db_dir = f'../data/{victim}/db/'  # directory to store database CSVs
txt_dir = f'../data/{victim}/interviews/'  # directory to store interview transcripts

try:
    os.makedirs(pdf_dir)
except FileExistsError:
    print(f"Directory '{pdf_dir}' already exists")

try:
    os.makedirs(db_dir)
except FileExistsError:
    print(f"Directory '{db_dir}' already exists")

try:
    os.makedirs(txt_dir)
except FileExistsError:
    print(f"Directory '{txt_dir}' already exists")

Directory '../data/Oppie/papers/' already exists
Directory '../data/Oppie/db/' already exists
Directory '../data/Oppie/interviews/' already exists


## 1. Papers

In [3]:



# Get papers from INSPIRE-HEP, with a year cutoff
papers = get_inspire_hep_papers(victim_inspire_ID,year_cutoff= 1930) 
print(papers)

# Extract arXiv IDs from papers
arxiv_ids = extract_arxiv_ids(papers) 

# Download papers (sources if available, otherwise PDFs)
[download_arxiv_source(arxiv_id, output_dir=pdf_dir) for arxiv_id in tqdm(arxiv_ids)];  

[{'links': {'bibtex': 'https://inspirehep.net/api/literature/47479?format=bibtex', 'latex-eu': 'https://inspirehep.net/api/literature/47479?format=latex-eu', 'latex-us': 'https://inspirehep.net/api/literature/47479?format=latex-us', 'json': 'https://inspirehep.net/api/literature/47479?format=json', 'cv': 'https://inspirehep.net/api/literature/47479?format=cv', 'citations': 'https://inspirehep.net/api/literature/?q=refersto%3Arecid%3A47479'}, 'id': '47479', 'created': '2008-09-22T00:00:00+00:00', 'metadata': {'control_number': 47479}, 'updated': '2023-03-07T06:53:03.817337+00:00'}, {'links': {'bibtex': 'https://inspirehep.net/api/literature/46695?format=bibtex', 'latex-eu': 'https://inspirehep.net/api/literature/46695?format=latex-eu', 'latex-us': 'https://inspirehep.net/api/literature/46695?format=latex-us', 'json': 'https://inspirehep.net/api/literature/46695?format=json', 'cv': 'https://inspirehep.net/api/literature/46695?format=cv', 'citations': 'https://inspirehep.net/api/literatur

0it [00:00, ?it/s]


In [4]:
# Delete all files except PDFs and TeX files; load relevant files list
delete_files_except_extensions(pdf_dir, ['.pdf', '.tex'])

In [5]:
filenames = get_filenames_with_extensions(pdf_dir, ['.tex', '.pdf'])

In [6]:
# Get text
source_type = []
text = []

for i, file in enumerate(tqdm(filenames)):
    try:
        if os.path.splitext(file)[-1] == '.pdf':
            loader = PyPDFLoader("{}/{}".format(pdf_dir, file))
            pages = loader.load_and_split()
            text.append(''.join([page.page_content for page in pages]))
            source_type.append("paper")
        elif os.path.splitext(file)[-1] == '.tex':
            with open("{}/{}".format(pdf_dir, file), 'r', encoding='iso-8859-1') as f:
                text.append(remove_latex_preamble(f.read()))
                source_type.append("paper")
    except:
        print("Error with file {}".format(file))

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:01<00:00,  2.23it/s]


In [7]:
data = [source_type, text]

# Transpose the data to have the inner lists as rows
transposed_data = list(map(list, zip(*data)))

# Column names for the DataFrame
columns = ['source_type','text']

# Create the DataFrame
df = pd.DataFrame(transposed_data, columns=columns)

In [8]:
# Update dataframe
df = update_dataframe('{}/df_text.csv'.format(db_dir), df)
df.to_csv('{}/df_text.csv'.format(db_dir), index=False)

## 2. YouTube videos

In [9]:
# from tqdm import tqdm
# import whisper
# import pytube
# from pathlib import Path
# import subprocess
# import numpy as np

In [10]:
# videos_dir = "../data/videos/"

# try:
#     os.makedirs(videos_dir)
# except FileExistsError:
#     print(f"Directory '{videos_dir}' already exists")

In [11]:
# # Get whisper model; download weights if necessary
# whisper_model = whisper.load_model("tiny.en").to('cpu')
# options = whisper.DecodingOptions(language="en", without_timestamps=True)

# url = "https://www.youtube.com/watch?v=dqxdPNzBY0I"
# pytube_vid = pytube.YouTube(url)
# video_path_local = Path(videos_dir).resolve() / (pytube_vid.video_id+".mp4")
# pytube_vid.streams.filter(type="audio", mime_type="audio/mp4", abr="48kbps").first().download(output_path=video_path_local.parent, filename=video_path_local.name)
# video_path_local = video_path_local.with_suffix(".wav")
# result  = subprocess.run(["ffmpeg", "-i", str(video_path_local.with_suffix(".mp4")), "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1", str(video_path_local)])
# transcription = whisper.transcribe(whisper_model, str(video_path_local))

## 3. Interviews

In [12]:
filenames = get_filenames_with_extensions(txt_dir, ['.txt'])

In [13]:
source_type = []
text = []

for i, file in enumerate(tqdm(filenames)):
    try:
        with open("{}/{}".format(txt_dir, file), 'r') as f:
            text.append(f.read())
            source_type.append("interview")
    except:
        print("Error with file {}".format(file))

0it [00:00, ?it/s]


In [14]:
data = [source_type, text]

# Transpose the data to have the inner lists as rows
transposed_data = list(map(list, zip(*data)))

# Column names for the DataFrame
columns = ['source_type','text']

# Create the DataFrame
df = pd.DataFrame(transposed_data, columns=columns)

# Update dataframe
df = update_dataframe('{}/df_text.csv'.format(db_dir), df)
df.to_csv('{}/df_text.csv'.format(db_dir), index=False)

## 4. Website/CV

In [15]:
websites = ["https://en.wikipedia.org/wiki/J._Robert_Oppenheimer", 
            "https://en.wikipedia.org/wiki/Manhattan_Project", 
            "https://en.wikipedia.org/wiki/Oppenheimer_security_hearing", 
            "https://en.wikipedia.org/wiki/American_Prometheus", 
            "https://www.ias.edu/oppenheimer-legacy", 
            "https://www.goodreads.com/author/quotes/308544.J_Robert_Oppenheimer",
            "https://en.wikipedia.org/wiki/Oppenheimer_(film)"]
text_website = [scrape_website_text(website) for website in tqdm(websites)]
text_website = [", ".join(filter(None, text.replace("\n", ",").split(","))) for text in text_website]

  0%|          | 0/7 [00:00<?, ?it/s]

100%|██████████| 7/7 [00:01<00:00,  4.01it/s]


In [16]:
data = [len(text_website) * ["website"], text_website]

# Transpose the data to have the inner lists as rows
transposed_data = list(map(list, zip(*data)))

# Column names for the DataFrame
columns = ['source_type','text']

# Create the DataFrame
df = pd.DataFrame(transposed_data, columns=columns)

# Update dataframe
df = update_dataframe('{}/df_text.csv'.format(db_dir), df)
df.to_csv('{}/df_text.csv'.format(db_dir), index=False)

In [17]:
len(df)  # Number of context objects (papers, sites, etc)
print(df)

   source_type                                               text
0        paper  (Wednesday Moraine: Ele mentary Particles; J.R...
1      website  J. Robert Oppenheimer - Wikipedia, Jump to con...
2      website  Manhattan Project - Wikipedia, Jump to content...
3      website  Oppenheimer security hearing - Wikipedia, Jump...
4      website  American Prometheus - Wikipedia, Jump to conte...
5      website  J. Robert Oppenheimer: Life,  Work,  and Legac...
6      website  J. Robert Oppenheimer Quotes  (Author of The O...
7        paper  Mev  and experimentally it was found to be 37 ...
8        paper  (Tuesday Afternoon: Theoretical Session , J. S...
9      website  J. Robert Oppenheimer - Wikipedia, Jump to con...
10     website  Manhattan Project - Wikipedia, Jump to content...
11     website  Oppenheimer security hearing - Wikipedia, Jump...
12     website  American Prometheus - Wikipedia, Jump to conte...
13     website  J. Robert Oppenheimer: Life,  Work,  and Legac...
14     web