<a href="https://colab.research.google.com/github/JayThibs/ai-safety-scrape/blob/main/scrape_ai_alignment_content.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting Contents from AI Alignment Resources

This notebook is used to scrape the contents of AI Alignment resources.

# Installations

In [1]:
!pip install tika arxiv pandoc -q

[?25l[K     |████                            | 10 kB 31.8 MB/s eta 0:00:01[K     |████████                        | 20 kB 43.0 MB/s eta 0:00:01[K     |████████████▏                   | 30 kB 54.2 MB/s eta 0:00:01[K     |████████████████▏               | 40 kB 60.0 MB/s eta 0:00:01[K     |████████████████████▏           | 51 kB 18.1 MB/s eta 0:00:01[K     |████████████████████████▎       | 61 kB 20.9 MB/s eta 0:00:01[K     |████████████████████████████▎   | 71 kB 23.4 MB/s eta 0:00:01[K     |████████████████████████████████| 81 kB 7.3 MB/s 
[K     |████████████████████████████████| 117 kB 84.2 MB/s 
[K     |████████████████████████████████| 49 kB 5.9 MB/s 
[?25h  Building wheel for tika (setup.py) ... [?25l[?25hdone
  Building wheel for pandoc (setup.py) ... [?25l[?25hdone
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


# Imports

In [41]:
import os
import pandas as pd
from tika import parser
from google.colab import drive
from pathlib import Path
from urllib import request
from bs4 import BeautifulSoup
import arxiv
import pandoc
import requests
import tarfile
import pickle

# Setting up Environment

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd drive/MyDrive/

/content/drive/MyDrive


In [5]:
CODE_DIR = Path('.') / 'code-projects/gpt-ai-safety'
RAW_DIR = Path('.') / 'data/raw'
TARS_DIR = RAW_DIR / 'tars'
LATEX_DIR = RAW_DIR / 'latex_files'
PDFS_DIR = RAW_DIR / 'pdfs'

In [6]:
%cd {CODE_DIR}

/content/drive/MyDrive/code-projects/gpt-ai-safety


In [7]:
# !git clone https://github.com/JayThibs/ai-safety-scrape
# !mv ai-safety-scrape/* .

# Load and Explore Data

In [8]:
df = pd.read_csv('ai-alignment-papers.csv')
df.head()

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,Abstract Note,Date,Date Added,Date Modified,Access Date,Pages,Num Pages,Issue,Volume,Number Of Volumes,Journal Abbreviation,Short Title,Series,Series Number,Series Text,Series Title,Publisher,Place,Language,Rights,Type,Archive,Archive Location,Library Catalog,Call Number,Extra,Notes,File Attachments,Link Attachments,Manual Tags,...,Cast Member,Commenter,Composer,Cosponsor,Counsel,Interviewer,Producer,Recipient,Reviewed Author,Scriptwriter,Words By,Guest,Number,Edition,Running Time,Scale,Medium,Artwork Size,Filing Date,Application Number,Assignee,Issuing Authority,Country,Meeting Name,Conference Name,Court,References,Reporter,Legal Status,Priority Numbers,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
0,XBZAPQFK,blogPost,2020.0,"Kokotajlo, Daniel",Three kinds of competitiveness,AI Impacts,,,,https://aiimpacts.org/three-kinds-of-competiti...,"By Daniel Kokotajlo In this post, I distinguis...",2020-03-30,2022-01-30 01:53:10,2022-01-30 01:53:10,2021-11-20 18:55:39,,,,,,,,,,,,,,en-US,,,,,,,ZSCC: NoCitationData[s0] ACC: N/A Section: Blog,,/Users/jacquesthibodeau/Zotero/storage/PU9A2KS...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,HX9UZ5JP,journalArticle,2020.0,"Cihon, Peter; Maas, Matthijs M.; Kemp, Luke",Fragmentation and the Future: Investigating Ar...,Global Policy,,1758-5899,10.1111/1758-5899.12890,https://onlinelibrary.wiley.com/doi/abs/10.111...,The international governance of artificial int...,2020,2022-01-30 04:47:43,2022-01-30 04:47:43,2021-11-13 15:58:24,545-556,,5.0,11.0,,,Fragmentation and the Future,,,,,,,en,,,,,Wiley Online Library,,ZSCC: 0000010 _eprint: https://onlinelibrary....,,/Users/jacquesthibodeau/Zotero/storage/2TZBI3F...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,BQCZM53S,blogPost,2021.0,"Clarke, Sam; Martin, Samuel Dylan",Distinguishing AI takeover scenarios,AI Alignment Forum,,,,https://www.alignmentforum.org/posts/qYzqDtoQa...,Epistemic status: lots of this involves interp...,2021-09-08,2022-01-30 04:47:42,2022-01-30 04:47:42,2021-11-18 23:45:23,,,,,,,,,,,,,,,,,,,,,ZSCC: NoCitationData[s0] ACC: N/A,,/Users/jacquesthibodeau/Zotero/storage/ENAMQXC...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,JVMJ4RMM,journalArticle,2020.0,"Stray, Jonathan",Aligning AI Optimization to Community Well-Being,International Journal of Community Well-Being,,"2524-5295, 2524-5309",10.1007/s42413-020-00086-3,http://link.springer.com/10.1007/s42413-020-00...,,2020-12,2022-01-30 04:47:36,2022-01-30 04:47:36,2021-11-13 22:47:54,443-463,,4.0,3.0,,Int. Journal of Com. WB,,,,,,,,en,,,,,DOI.org (Crossref),,ZSCC: 0000010,,/Users/jacquesthibodeau/Zotero/storage/V3BEV7X...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,W8F6VI9I,thesis,2020.0,"Shah, Rohin Monish",Extracting and Using Preference Information fr...,,,,,https://www.proquest.com/openview/da8bf63ef343...,Typically when learning about what people want...,2020-12-17,2022-01-30 04:47:35,2022-01-30 04:47:35,,,24.0,,,,,,,,,,"University of California, Berkeley","Berkeley, CA",en,,,,,Zotero,,ZSCC: 0000000,,/Users/jacquesthibodeau/Zotero/storage/S96M3KT...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:
df['Item Type'].unique()

array(['blogPost', 'journalArticle', 'thesis', 'conferencePaper',
       'manuscript', 'report', 'bookSection', 'magazineArticle', 'book'],
      dtype=object)

In [10]:
item_nums = []
for item in df['Item Type'].unique():
    item_nums.append([item, len(df[df['Item Type'] == item])])

item_nums.sort(key=lambda x:x[1])
item_nums

[['magazineArticle', 2],
 ['thesis', 3],
 ['book', 13],
 ['bookSection', 52],
 ['report', 87],
 ['manuscript', 154],
 ['journalArticle', 170],
 ['conferencePaper', 262],
 ['blogPost', 421]]

In [11]:
df[df['Item Type'] == 'journalArticle'].head()

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,Abstract Note,Date,Date Added,Date Modified,Access Date,Pages,Num Pages,Issue,Volume,Number Of Volumes,Journal Abbreviation,Short Title,Series,Series Number,Series Text,Series Title,Publisher,Place,Language,Rights,Type,Archive,Archive Location,Library Catalog,Call Number,Extra,Notes,File Attachments,Link Attachments,Manual Tags,...,Cast Member,Commenter,Composer,Cosponsor,Counsel,Interviewer,Producer,Recipient,Reviewed Author,Scriptwriter,Words By,Guest,Number,Edition,Running Time,Scale,Medium,Artwork Size,Filing Date,Application Number,Assignee,Issuing Authority,Country,Meeting Name,Conference Name,Court,References,Reporter,Legal Status,Priority Numbers,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
1,HX9UZ5JP,journalArticle,2020.0,"Cihon, Peter; Maas, Matthijs M.; Kemp, Luke",Fragmentation and the Future: Investigating Ar...,Global Policy,,1758-5899,10.1111/1758-5899.12890,https://onlinelibrary.wiley.com/doi/abs/10.111...,The international governance of artificial int...,2020,2022-01-30 04:47:43,2022-01-30 04:47:43,2021-11-13 15:58:24,545-556,,5,11.0,,,Fragmentation and the Future,,,,,,,en,,,,,Wiley Online Library,,ZSCC: 0000010 _eprint: https://onlinelibrary....,,/Users/jacquesthibodeau/Zotero/storage/2TZBI3F...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,JVMJ4RMM,journalArticle,2020.0,"Stray, Jonathan",Aligning AI Optimization to Community Well-Being,International Journal of Community Well-Being,,"2524-5295, 2524-5309",10.1007/s42413-020-00086-3,http://link.springer.com/10.1007/s42413-020-00...,,2020-12,2022-01-30 04:47:36,2022-01-30 04:47:36,2021-11-13 22:47:54,443-463,,4,3.0,,Int. Journal of Com. WB,,,,,,,,en,,,,,DOI.org (Crossref),,ZSCC: 0000010,,/Users/jacquesthibodeau/Zotero/storage/V3BEV7X...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11,TK5F29IU,journalArticle,2021.0,"Hayden, Benjamin; Niv, Yael",The case against economic values in the orbito...,Behavioral Neuroscience,,,10.1037/bne0000448,https://osf.io/7hgup,Much of traditional neuroeconomics proceeds fr...,2021,2022-01-30 04:48:47,2022-01-30 04:48:47,2021-11-08 23:41:47,192-201,,2,135.0,,,,,,,,,,,,,,,DOI.org (Crossref),,ZSCC: 0000026 DOI: 10.31234/osf.io/7hgup,,,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25,NHWZIKZ2,journalArticle,2020.0,"Fernandes, Pedro; Santos, Francisco C.; Lopes,...",Norms for Beneficial A.I.: A Computational Ana...,AI Communications,,"18758452, 09217126",10.3233/AIC-201502,http://arxiv.org/abs/1907.03843,The rise of artificial intelligence (A.I.) bas...,2020-12-18,2022-01-30 04:48:46,2022-01-30 04:48:46,2021-11-13 22:40:37,155-171,,3-6,33.0,,AIC,Norms for Beneficial A.I.,,,,,,,,,,,,arXiv.org,,ZSCC: 0000004 arXiv: 1907.03843,,/Users/jacquesthibodeau/Zotero/storage/JAVXSVN...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
30,HDWGJGAP,journalArticle,2021.0,"Mingard, Chris; Valle-Pérez, Guillermo; Skalse...","Is SGD a Bayesian sampler? Well, almost",Journal of Machine Learning Research,,,,http://arxiv.org/abs/2006.15191,Overparameterised deep neural networks (DNNs) ...,2021-02,2022-01-30 04:48:46,2022-01-30 04:48:46,2021-11-13 22:56:31,,,,22.0,,,Is SGD a Bayesian sampler?,,,,,,,,,,,,arXiv.org,,ZSCC: 0000009 arXiv: 2006.15191,,/Users/jacquesthibodeau/Zotero/storage/ACV9IXE...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Extract text from AI Alignment Resources

In [34]:
df_arxiv = df[df['Url'].str.contains('arxiv') == True]
arxiv_paper_list = df_arxiv['Url'].values
arxiv_paper_list[0:10]

array(['http://arxiv.org/abs/2002.11328',
       'http://arxiv.org/abs/2010.11645',
       'http://arxiv.org/abs/2002.11708',
       'http://arxiv.org/abs/2012.10800',
       'http://arxiv.org/abs/2011.08512',
       'http://arxiv.org/abs/2002.11089',
       'http://arxiv.org/abs/2012.05876',
       'http://arxiv.org/abs/2010.14603',
       'http://arxiv.org/abs/2005.01643',
       'http://arxiv.org/abs/1907.03843'], dtype=object)

In [35]:
# # Grabbing files from ar5iv in case source (LaTeX doesn't work)

# def grab_text_from_webpage(url):
#     with request.urlopen(url) as response:
#         html = response.read()
#     soup = BeautifulSoup(html, "html.parser")
#     return soup.get_text(separator=" ")

# df_arxiv['Url'].str.replace('arxiv', 'ar5iv')
# arxiv_paper = grab_text_from_webpage('http://ar5iv.org/abs/2002.11328')
# arxiv_paper_list = arxiv_paper.split('\n\n')
# list_text = BeautifulSoup(arxiv_paper, "html.parser").get_text(separator=" ").split('\n')
# ''.join(list_text)

## Extracting PDFs with Tika

In [13]:
def tikaTextExtractor(file_path):
    """Extracts text from a PDF using tika."""
    print("Extracting text from file: " + file_path)
    parsed_tika = parser.from_file(file_path)
    return parsed_tika["content"]

In [14]:
txt = tikaTextExtractor(str(RAW_DIR / 'pdfs' / 'Superintelligence.pdf'))

Extracting text from file: data/raw/pdfs/Superintelligence.pdf


2022-02-19 02:51:47,757 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.24/tika-server-1.24.jar to /tmp/tika-server.jar.
2022-02-19 02:51:48,259 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.24/tika-server-1.24.jar.md5 to /tmp/tika-server.jar.md5.
2022-02-19 02:51:48,654 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


In [None]:
with open(RAW_DIR / 'txts' / 'Superintelligence.txt', 'w') as f:
    f.write(txt)

In [None]:
with request.urlopen('https://arxiv.org/e-print/2010.11645') as response:
    html = response.read()

In [None]:
for i,link in enumerate(['https://arxiv.org/e-print/2010.11645']):
    r = requests.get(link,allow_redirects=True)
    with open(f"a{i}.","wb") as code:
        code.write(r.content)

## Extract LaTeX papers from arXiv

In [15]:
paper = next(arxiv.Search(id_list=["2101.00027v1"]).results())

In [20]:
# paper.download_source(dirpath=TARS_DIR, filename="2101.00027v1.tar.gz")

'data/raw/tars/2101.00027v1.tar.gz'

In [19]:
paper.entry_id

'http://arxiv.org/abs/2101.00027v1'

In [36]:
if not os.path.exists(TARS_DIR / "211.00027v1.tar.gz"):
    print('does not exists')

does not exists


In [39]:
def download_arxiv_paper_tar(paper_id, tar_filename):
    paper = next(arxiv.Search(id_list=[paper_id]).results())
    if not os.path.exists(str(TARS_DIR / tar_filename)):
        paper.download_source(dirpath=TARS_DIR, filename=tar_filename)

def extract_tex(tar_filename):
    tar = tarfile.open(str(TARS_DIR / tar_filename))
    for member in tar.getmembers():
        if ".tex" in member.name:
            tex_filename = member.name
            tar.extract(member, LATEX_DIR)
    tar.close()
    return tex_filename

In [44]:
pickled_arxiv_list = str(RAW_DIR / "arxiv_ids_and_names.pkl")

if not os.path.exists(pickled_arxiv_list):
    arxiv_ids_and_names = []
else:
    pickle.load(pickled_arxiv_list)

In [43]:
for paper_id in arxiv_paper_list:
    paper_id = paper_id.split('/')[-1] + "v1"
    tar_filename = paper_id + ".tar.gz"
    download_arxiv_paper_tar(paper_id, tar_filename)
    if not os.path.exists(str(LATEX_DIR / tar_filename)):
        tex_filename = extract_tex(tar_filename)
    arxiv_ids_and_names.append([paper_id, tex_filename])


editor.tex
ms.tex
TexFiles/Theory.tex
TexFiles/Introduction.tex
TexFiles/Experiments.tex
TexFiles/Appendix.tex


NameError: ignored

In [None]:
with open('arxiv_ids_and_names.pkl', 'wb') as b:
    pickle.dump(arxiv_ids_and_names, b)

## Convert LaTeX or EPUB with Pandoc

In [None]:
pandoc.read(file='/content/drive/MyDrive/code-projects/gpt-ai-safety/2006.15191')

ProcessExecutionError: ignored