<a href="https://colab.research.google.com/github/JayThibs/ai-safety-scrape/blob/main/scrape_ai_alignment_content.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting Contents from AI Alignment Resources


# Installations

In [68]:
!pip install tika



# Imports

In [85]:
import pandas as pd
from tika import parser
from google.colab import drive
from pathlib import Path
from urllib import request
from bs4 import BeautifulSoup

# Setting up Environment

In [70]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
%cd drive/MyDrive/

[Errno 2] No such file or directory: 'drive/MyDrive/'
/content/drive/MyDrive/code-projects/gpt-ai-safety


In [93]:
CODE_DIR = Path('.') / 'code-projects/gpt-ai-safety'
RAW_DIR = Path('.') / 'data/raw'

In [6]:
%cd {CODE_DIR}

/content/drive/MyDrive/code-projects/gpt-ai-safety


In [8]:
# !git clone https://github.com/JayThibs/ai-safety-scrape
# !mv ai-safety-scrape/* .

Cloning into 'ai-safety-scrape'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 17 (delta 5), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (17/17), done.


# Load and Explore Data

In [73]:
df = pd.read_csv('ai-alignment-papers.csv')
df.head()

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,Abstract Note,Date,Date Added,Date Modified,Access Date,Pages,Num Pages,Issue,Volume,Number Of Volumes,Journal Abbreviation,Short Title,Series,Series Number,Series Text,Series Title,Publisher,Place,Language,Rights,Type,Archive,Archive Location,Library Catalog,Call Number,Extra,Notes,File Attachments,Link Attachments,Manual Tags,...,Cast Member,Commenter,Composer,Cosponsor,Counsel,Interviewer,Producer,Recipient,Reviewed Author,Scriptwriter,Words By,Guest,Number,Edition,Running Time,Scale,Medium,Artwork Size,Filing Date,Application Number,Assignee,Issuing Authority,Country,Meeting Name,Conference Name,Court,References,Reporter,Legal Status,Priority Numbers,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
0,XBZAPQFK,blogPost,2020.0,"Kokotajlo, Daniel",Three kinds of competitiveness,AI Impacts,,,,https://aiimpacts.org/three-kinds-of-competiti...,"By Daniel Kokotajlo In this post, I distinguis...",2020-03-30,2022-01-30 01:53:10,2022-01-30 01:53:10,2021-11-20 18:55:39,,,,,,,,,,,,,,en-US,,,,,,,ZSCC: NoCitationData[s0] ACC: N/A Section: Blog,,/Users/jacquesthibodeau/Zotero/storage/PU9A2KS...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,HX9UZ5JP,journalArticle,2020.0,"Cihon, Peter; Maas, Matthijs M.; Kemp, Luke",Fragmentation and the Future: Investigating Ar...,Global Policy,,1758-5899,10.1111/1758-5899.12890,https://onlinelibrary.wiley.com/doi/abs/10.111...,The international governance of artificial int...,2020,2022-01-30 04:47:43,2022-01-30 04:47:43,2021-11-13 15:58:24,545-556,,5.0,11.0,,,Fragmentation and the Future,,,,,,,en,,,,,Wiley Online Library,,ZSCC: 0000010 _eprint: https://onlinelibrary....,,/Users/jacquesthibodeau/Zotero/storage/2TZBI3F...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,BQCZM53S,blogPost,2021.0,"Clarke, Sam; Martin, Samuel Dylan",Distinguishing AI takeover scenarios,AI Alignment Forum,,,,https://www.alignmentforum.org/posts/qYzqDtoQa...,Epistemic status: lots of this involves interp...,2021-09-08,2022-01-30 04:47:42,2022-01-30 04:47:42,2021-11-18 23:45:23,,,,,,,,,,,,,,,,,,,,,ZSCC: NoCitationData[s0] ACC: N/A,,/Users/jacquesthibodeau/Zotero/storage/ENAMQXC...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,JVMJ4RMM,journalArticle,2020.0,"Stray, Jonathan",Aligning AI Optimization to Community Well-Being,International Journal of Community Well-Being,,"2524-5295, 2524-5309",10.1007/s42413-020-00086-3,http://link.springer.com/10.1007/s42413-020-00...,,2020-12,2022-01-30 04:47:36,2022-01-30 04:47:36,2021-11-13 22:47:54,443-463,,4.0,3.0,,Int. Journal of Com. WB,,,,,,,,en,,,,,DOI.org (Crossref),,ZSCC: 0000010,,/Users/jacquesthibodeau/Zotero/storage/V3BEV7X...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,W8F6VI9I,thesis,2020.0,"Shah, Rohin Monish",Extracting and Using Preference Information fr...,,,,,https://www.proquest.com/openview/da8bf63ef343...,Typically when learning about what people want...,2020-12-17,2022-01-30 04:47:35,2022-01-30 04:47:35,,,24.0,,,,,,,,,,"University of California, Berkeley","Berkeley, CA",en,,,,,Zotero,,ZSCC: 0000000,,/Users/jacquesthibodeau/Zotero/storage/S96M3KT...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [74]:
df['Item Type'].unique()

array(['blogPost', 'journalArticle', 'thesis', 'conferencePaper',
       'manuscript', 'report', 'bookSection', 'magazineArticle', 'book'],
      dtype=object)

In [75]:
item_nums = []
for item in df['Item Type'].unique():
    item_nums.append([item, len(df[df['Item Type'] == item])])

item_nums.sort(key=lambda x:x[1])
item_nums

[['magazineArticle', 2],
 ['thesis', 3],
 ['book', 13],
 ['bookSection', 52],
 ['report', 87],
 ['manuscript', 154],
 ['journalArticle', 170],
 ['conferencePaper', 262],
 ['blogPost', 421]]

In [78]:
df[df['Item Type'] == 'journalArticle'].head()

Unnamed: 0,Key,Item Type,Publication Year,Author,Title,Publication Title,ISBN,ISSN,DOI,Url,Abstract Note,Date,Date Added,Date Modified,Access Date,Pages,Num Pages,Issue,Volume,Number Of Volumes,Journal Abbreviation,Short Title,Series,Series Number,Series Text,Series Title,Publisher,Place,Language,Rights,Type,Archive,Archive Location,Library Catalog,Call Number,Extra,Notes,File Attachments,Link Attachments,Manual Tags,...,Cast Member,Commenter,Composer,Cosponsor,Counsel,Interviewer,Producer,Recipient,Reviewed Author,Scriptwriter,Words By,Guest,Number,Edition,Running Time,Scale,Medium,Artwork Size,Filing Date,Application Number,Assignee,Issuing Authority,Country,Meeting Name,Conference Name,Court,References,Reporter,Legal Status,Priority Numbers,Programming Language,Version,System,Code,Code Number,Section,Session,Committee,History,Legislative Body
1,HX9UZ5JP,journalArticle,2020.0,"Cihon, Peter; Maas, Matthijs M.; Kemp, Luke",Fragmentation and the Future: Investigating Ar...,Global Policy,,1758-5899,10.1111/1758-5899.12890,https://onlinelibrary.wiley.com/doi/abs/10.111...,The international governance of artificial int...,2020,2022-01-30 04:47:43,2022-01-30 04:47:43,2021-11-13 15:58:24,545-556,,5,11.0,,,Fragmentation and the Future,,,,,,,en,,,,,Wiley Online Library,,ZSCC: 0000010 _eprint: https://onlinelibrary....,,/Users/jacquesthibodeau/Zotero/storage/2TZBI3F...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,JVMJ4RMM,journalArticle,2020.0,"Stray, Jonathan",Aligning AI Optimization to Community Well-Being,International Journal of Community Well-Being,,"2524-5295, 2524-5309",10.1007/s42413-020-00086-3,http://link.springer.com/10.1007/s42413-020-00...,,2020-12,2022-01-30 04:47:36,2022-01-30 04:47:36,2021-11-13 22:47:54,443-463,,4,3.0,,Int. Journal of Com. WB,,,,,,,,en,,,,,DOI.org (Crossref),,ZSCC: 0000010,,/Users/jacquesthibodeau/Zotero/storage/V3BEV7X...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
11,TK5F29IU,journalArticle,2021.0,"Hayden, Benjamin; Niv, Yael",The case against economic values in the orbito...,Behavioral Neuroscience,,,10.1037/bne0000448,https://osf.io/7hgup,Much of traditional neuroeconomics proceeds fr...,2021,2022-01-30 04:48:47,2022-01-30 04:48:47,2021-11-08 23:41:47,192-201,,2,135.0,,,,,,,,,,,,,,,DOI.org (Crossref),,ZSCC: 0000026 DOI: 10.31234/osf.io/7hgup,,,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25,NHWZIKZ2,journalArticle,2020.0,"Fernandes, Pedro; Santos, Francisco C.; Lopes,...",Norms for Beneficial A.I.: A Computational Ana...,AI Communications,,"18758452, 09217126",10.3233/AIC-201502,http://arxiv.org/abs/1907.03843,The rise of artificial intelligence (A.I.) bas...,2020-12-18,2022-01-30 04:48:46,2022-01-30 04:48:46,2021-11-13 22:40:37,155-171,,3-6,33.0,,AIC,Norms for Beneficial A.I.,,,,,,,,,,,,arXiv.org,,ZSCC: 0000004 arXiv: 1907.03843,,/Users/jacquesthibodeau/Zotero/storage/JAVXSVN...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
30,HDWGJGAP,journalArticle,2021.0,"Mingard, Chris; Valle-P√©rez, Guillermo; Skalse...","Is SGD a Bayesian sampler? Well, almost",Journal of Machine Learning Research,,,,http://arxiv.org/abs/2006.15191,Overparameterised deep neural networks (DNNs) ...,2021-02,2022-01-30 04:48:46,2022-01-30 04:48:46,2021-11-13 22:56:31,,,,22.0,,,Is SGD a Bayesian sampler?,,,,,,,,,,,,arXiv.org,,ZSCC: 0000009 arXiv: 2006.15191,,/Users/jacquesthibodeau/Zotero/storage/ACV9IXE...,,UnsortedSafety,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [80]:
df_arxiv = df[df['Url'].str.contains('arxiv') == True]

In [83]:
df_arxiv['Url'].str.replace('arxiv', 'ar5iv')

5       http://ar5iv.org/abs/2002.11328
7       http://ar5iv.org/abs/2010.11645
8       http://ar5iv.org/abs/2002.11708
9       http://ar5iv.org/abs/2012.10800
14      http://ar5iv.org/abs/2011.08512
                     ...               
1150    http://ar5iv.org/abs/1811.05590
1154    http://ar5iv.org/abs/1807.08364
1156    http://ar5iv.org/abs/1704.02882
1159    http://ar5iv.org/abs/1808.04096
1160    http://ar5iv.org/abs/1709.06166
Name: Url, Length: 317, dtype: object

In [86]:
def grab_text_from_webpage(url):
    with request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text()

In [89]:
arxiv_paper = grab_text_from_webpage('http://ar5iv.org/abs/2002.11328')

In [92]:
arxiv_paper.split('\n\n')

['',
 '\n[2002.11328] Rethinking Bias-Variance Trade-off for Generalization of Neural Networks',
 '',
 '',
 '',
 '',
 '',
 '',
 '\n  function detectColorScheme(){\n    var theme="light";\n    var current_theme = localStorage.getItem("ar5iv_theme");\n    if(current_theme){\n      if(current_theme == "dark"){\n        theme = "dark";\n      } }\n    else if(!window.matchMedia) { return false; }\n    else if(window.matchMedia("(prefers-color-scheme: dark)").matches) {\n      theme = "dark"; }\n    if (theme=="dark") {\n      document.documentElement.setAttribute("data-theme", "dark");\n    } else {\n      document.documentElement.setAttribute("data-theme", "light"); } }\n  \n  detectColorScheme();\n  \n  function toggleColorScheme(){\n    var current_theme = localStorage.getItem("ar5iv_theme");\n    if (current_theme) {\n      if (current_theme == "light") {\n        localStorage.setItem("ar5iv_theme", "dark"); }\n      else {\n        localStorage.setItem("ar5iv_theme", "light"); } }\n  

# Extract text from AI Alignment Resources

In [94]:
def tikaTextExtractor(file_path):
    """Extracts text from a PDF using tika."""
    print("Extracting text from file: " + file_path)
    parsed_tika = parser.from_file(file_path)
    return parsed_tika["content"]

In [95]:
tikaTextExtractor(str(RAW_DIR / 'pdfs' / '2112.00861.pdf'))

Extracting text from file: data/raw/pdfs/2112.00861.pdf


'\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nA General Language Assistant\nas a Laboratory for Alignment\n\nAmanda Askell‚àó Yuntao Bai‚àó Anna Chen‚àó Dawn Drain‚àó Deep Ganguli‚àó Tom Henighan‚Ä†\n\nAndy Jones‚Ä† Nicholas Joseph‚Ä† Ben Mann‚àó Nova DasSarma Nelson Elhage\n\nZac Hatfield-Dodds Danny Hernandez Jackson Kernion Kamal Ndousse\n\nCatherine Olsson Dario Amodei Tom Brown Jack Clark Sam McCandlish Chris Olah\n\nJared Kaplan‚Ä°\n\nAnthropic\n\nAbstract\n\nGiven the broad capabilities of large language models, it should be possible to work towards\na general-purpose, text-based assistant that is aligned with human values, meaning that it is\nhelpful, honest, and harmless. As an initial foray in this direction we study simple baseline\ntechniques and evaluations, such as prompting. We find that the benefits from modest\ninterventions increase with model size, generalize to a variety of alignment evaluations,