## Analysis of AI documents and papers citation graph

In [23]:
import pandas as pd
citation_matches_df = pd.read_csv("../data/citation_graph/documents_references.csv", index_col=0, dtype={"id_source": str})

In [24]:
citation_matches_df.head(3)

Unnamed: 0,id_dest,id_source,doc_type,dest_title,source_title,dest_path,source_arxiv_url,match_type,matched_info,matched_text
0,24220,9512004.0,oecd,LITHUANIAN ARTIFICIAL INTELLIGENCE STRATEGY: A...,Natural language processing: she needs somethi...,24220.txt,http://arxiv.org/abs/cmp-lg/9512004,by_title,naturallanguageprocessingsheneedssomethingolda...,sconcernedtheareasoffocusaredeeplearningdatami...
1,24220,9709012.0,oecd,LITHUANIAN ARTIFICIAL INTELLIGENCE STRATEGY: A...,"Using Single Layer Networks for Discrete, Sequ...",24220.txt,http://arxiv.org/abs/cmp-lg/9709012,by_title,usingsinglelayernetworksfordiscretesequentiald...,sconcernedtheareasoffocusaredeeplearningdatami...
2,26722,1804.0166,oecd,AI TASK FORCE,The structure of evolved representations acros...,26722.txt,http://arxiv.org/abs/1804.01660,by_title,thestructureofevolvedrepresentationsacrossdiff...,sedinthisareaforexampletheunitedkingdomhasanof...


### Questions to answer
* Number of documents
* Number of papers
* Number of edges in the graph
* Number of matches by type
* Number of documents/papers with any link
* Most frequently cited papers
* What are the entries matched by arxiv id but not the title

In [25]:
import os
import json
OECD_DOCUMENT_COUNT = len(os.listdir("../data/oecd_docs/texts"))
NESTA_DOCUMENT_COUNT = len(os.listdir("../data/nesta_ai_governance_docs/texts"))
ARXIV_PAPERS_COUNT = len(json.load(open("../data/citation_graph/arxiv-ai-metadata.json", "r")))

print("OECD documents: {}".format(OECD_DOCUMENT_COUNT))
print("NESTA documents: {}".format(NESTA_DOCUMENT_COUNT))
print("ARXIV papers: {}".format(ARXIV_PAPERS_COUNT))

OECD documents: 84
NESTA documents: 102
ARXIV papers: 164105


In [26]:
citation_graph_df = citation_matches_df[['id_dest', 'id_source', 'doc_type']].drop_duplicates()
LINKS_COUNT = citation_graph_df.shape[0]
print("Number of links: {}".format(LINKS_COUNT))

Number of links: 502


In [27]:
# Matches by type
citation_matches_df['match_type'].value_counts()

by_title       487
by_arxiv_id    115
Name: match_type, dtype: int64

In [28]:
citation_graph_df[['id_dest', 'doc_type']].drop_duplicates()[['doc_type']].value_counts()

doc_type
nesta       48
oecd        32
dtype: int64

In [44]:
citation_graph_df[['id_dest', 'doc_type']].value_counts()

id_dest                                                            doc_type
the-malicious-use-of-ai-forecasting-prevention-and-mitigation      nesta       85
26955                                                              oecd        67
26961                                                              oecd        43
26746                                                              oecd        34
ai-and-national-security                                           nesta       24
                                                                               ..
national-strategy-for-artificial-intelligence                      nesta        1
opinion-of-the-german-data-ethics-commission                       nesta        1
discussion-paper-on-ai-and-personal-data                           nesta        1
26881                                                              oecd         1
2016-2019-progress-report-advancing-artificial-intelligence-randd  nesta        1
Length: 80, dtype: int

In [46]:
paper_counts = citation_graph_df[['id_source']].value_counts()

In [47]:
print("Number of cited papers: {}".format(len(paper_counts)))

Number of cited papers: 340


In [67]:
pd.DataFrame(paper_counts)

Unnamed: 0_level_0,0
id_source,Unnamed: 1_level_1
9512004,11
9709012,11
1810.07339,6
2005.11072,5
1711.00399,5
...,...
1810.00069,1
1810.03292,1
1810.08810,1
1811.00116,1


In [74]:
top_cited_papers_df = pd.merge(pd.DataFrame(paper_counts, columns=['count']).reset_index(), citation_matches_df[['id_source', 'source_title']].drop_duplicates(), 
         on='id_source', how='left')[['id_source', 'count', 'source_title']]

In [91]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None, 'display.width', 300):  # more options can be specified also
    print(top_cited_papers_df[top_cited_papers_df['count'] > 2])

     id_source  count                                                                                                                     source_title
0      9512004     11     Natural language processing: she needs something old and something new\n  (maybe something borrowed and something blue, too)
1      9709012     11                        Using Single Layer Networks for Discrete, Sequential Data: An Example\n  from Natural Language Processing
2   1810.07339      6                                                                       Security Matters: A Survey on Adversarial Machine Learning
3   2005.11072      5                                                               Regulating Artificial Intelligence: Proposal for a Global Solution
4   1711.00399      5                                   Counterfactual Explanations without Opening the Black Box: Automated\n  Decisions and the GDPR
5   2002.12620      5                                     TextBrewer: An Open-Source Knowledge

In [94]:
citation_graph_df.filter("doc_type == oecd")

0
1
2
3
5
...
596
597
599
600
601
