# Test `semantic_api()`

In [70]:
import pandas as pd
import numpy as np
import requests
from datetime import date
from collections import Counter

from startupjh import utils
from startupjh import plots
from startupjh.data_collection import consolidated_df
from startupjh.data_preprocessing import data_preprocess
from startupjh.data_preprocessing import data_cleaning, data_enrichment
from startupjh.data_collection import semantic_api

import plotly.graph_objs as go
import plotly.express as px

%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data collection for Semantic scholar

In [4]:
results_df, all_references_df, total_results, query = semantic_api.get_all_results_from_semantic_scholar()

Enter key words: mars biosignature


In [5]:
results_df.head()

Unnamed: 0,paperId,url,title,abstract,venue,year,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,authors,reference,first_author_id,key_words
0,1935abaebef93ceb4b18eaf67c12618a001b26bb,https://www.semanticscholar.org/paper/1935abae...,TRACING HOT-SPRING FACIES AND THEIR GEOTHERMAL...,TEXTURES INTO THE TERRESTRIAL GEOLOGIC RECORD:...,,2016,0,1,1,False,,"[{'authorId': '4662741', 'name': 'K. Campbell'...",K. Campbell et al. (2016),4662741,"[tracing, hotspring, facies, geothermally, sil..."
1,958b808c4e32488bf9f484c7de0b928ddfe94cff,https://www.semanticscholar.org/paper/958b808c...,Preservation of martian organic and environmen...,The Mars Science Laboratory (MSL) has an instr...,Astrobiology,2011,117,215,7,True,"[Environmental Science, Medicine, Geology]","[{'authorId': '4945458', 'name': 'R. Summons'}...",R. Summons et al. (2011),4945458,"[preservation, martian, organic, environmental..."
2,61d144bd68693abd0ceafd1275969e8596b8c34b,https://www.semanticscholar.org/paper/61d144bd...,Mars Biosignature - Detection Capabilities: A ...,A Mars sample-return mission has been proposed...,,2013,7,3,0,True,[Environmental Science],"[{'authorId': '1751448', 'name': 'C. Weisbin'}...",C. Weisbin et al. (2013),1751448,"[mars, biosignature, detection, capabilities, ..."
3,fc7a04d99ce2f9bc8a9c177e6a6cdbebe842641f,https://www.semanticscholar.org/paper/fc7a04d9...,Tracing Hot-Spring Facies and thier Geothermal...,,,2016,0,0,0,False,[Geology],"[{'authorId': '4662741', 'name': 'K. Campbell'...",K. Campbell et al. (2016),4662741,"[tracing, hotspring, facies, thier, geothermal..."
4,f1376f87575a361568df4b3d891b5e6388519cfd,https://www.semanticscholar.org/paper/f1376f87...,Biosignature Analysis of Mars Soil Analogs fro...,The detection of biosignatures on Mars is of o...,Astrobiology,2020,90,6,0,True,"[Medicine, Environmental Science]","[{'authorId': '32793236', 'name': 'Joost W. Ae...",Joost W. Aerts et al. (2020),32793236,"[biosignature, analysis, mars, soil, analogs, ..."


## Data visualization for Semantic scholar

### Basic plots

In [64]:
plots.make_access_pie(results_df, 'semantic_scholar')

In [65]:
plots.make_pub_per_year(results_df, 'semantic_scholar')

In [66]:
plots.make_citations_per_year(results_df, 'semantic_scholar')

In [67]:
plots.make_top_key_words(results_df, query)

#### Fields of Study

In [16]:
test_list = results_df.fieldsOfStudy.tolist()
res = [i for i in test_list if i]
flat_list_fields = utils.flatten_list(res)

In [18]:
most_common_fields = Counter(flat_list_fields).most_common()
most_common_fields_df = pd.DataFrame(most_common_fields, columns=["field", "occurence"])
most_common_fields_df

Unnamed: 0,field,occurence
0,Geology,19
1,Medicine,6
2,Environmental Science,5
3,Chemistry,3
4,Materials Science,1


In [21]:
def make_fields_pie(df):
    test_list = df.fieldsOfStudy.tolist()
    res = [i for i in test_list if i]
    flat_list_fields = utils.flatten_list(res)
    
    most_common_fields = Counter(flat_list_fields).most_common()
    most_common_fields_df = pd.DataFrame(most_common_fields, columns=["field", "occurence"])
    
    fig = px.pie(most_common_fields_df, values='occurence', names= 'field')

    fig.update_layout(
    title = "<span style='font-size: 22px;'><b>Fields of Study<b></span>", title_x=0.5,
    font=dict(
        family="Courier New, monospace",
        size=14,
        color="white"
    ),
    paper_bgcolor = "#101126",
    plot_bgcolor = "#101126")
    return fig

In [23]:
make_fields_pie(all_references_df)

#### Authors

In [27]:
authors_list = []
for index, row in results_df.iterrows():
    for dict_ in row.authors:
        authors_list.append(dict_['name'])
authors_list

['K. Campbell',
 'D. Guido',
 'J. Farmer',
 'M. V. Kranendonk',
 'S. Ruff',
 'F. Westall',
 'R. Summons',
 'J. Amend',
 'D. Bish',
 'R. Buick',
 'G. Cody',
 'D. D. Des Marais',
 'G. Dromart',
 'J. Eigenbrode',
 'A. Knoll',
 'D. Sumner',
 'C. Weisbin',
 'W. Lincoln',
 'D. Papanastassiou',
 'M. Coleman',
 'K. Campbell',
 'D. Guido',
 'J. Farmer',
 'M. V. Kranendonk',
 'S. Ruff',
 'F. Westall',
 'Joost W. Aerts',
 'A. Riedo',
 'Daniel J. Melton',
 'Simone Martini',
 'J. Flahaut',
 'U. Meierhenrich',
 'C. Meinert',
 'I. Myrgorodska',
 'R. Lindner',
 'P. Ehrenfreund',
 'A. Gangidine',
 'J. Havig',
 'A. Czaja',
 'F. Da Pieve',
 'G. Gronoff',
 'J. Guo',
 'C. Mertens',
 'L. Neary',
 'B. Gu',
 'N. Koval',
 'J. Kohanoff',
 'A. Vandaele',
 'F. Cleri',
 'M. Nachon',
 'R. Ewing',
 'M. Tice',
 'J. Stromberg',
 'A. Parkinson',
 'M. Morison',
 'E. Cloutis',
 'N. Casson',
 'D. Applin',
 'J. Poitras',
 'A. M. Martí',
 'C. Maggiori',
 'C. Cousins',
 'L. Whyte',
 'R. Kruzelecky',
 'D. Das',
 'R. Léveillé'

In [30]:
most_active_authors = Counter(authors_list).most_common()
most_active_authors_df = pd.DataFrame(most_active_authors, columns=["author", "occurence"])

In [31]:
most_active_authors_df

Unnamed: 0,author,occurence
0,J. Farmer,6
1,A. Czaja,4
2,S. Shkolyar,4
3,A. Gangidine,3
4,K. Campbell,2
...,...,...
157,A. Santo,1
158,J. Ferdosi,1
159,M. Konstantindis,1
160,K. Cote,1


In [36]:
def make_active_authors(df):
    authors_list = []
    for index, row in df.iterrows():
        for dict_ in row.authors:
            authors_list.append(dict_['name'])
    most_active_authors = Counter(authors_list).most_common()
    most_active_authors_df = pd.DataFrame(most_active_authors, columns=["author", "occurence"])
    fig = go.Figure(data=[go.Bar(x=most_active_authors_df[0:10].author,
                              y= most_active_authors_df[0:10].occurence,
                              texttemplate="%{y}",
                              textposition="outside",
                              textangle=0)])
    fig.update_layout(title = "<span style='font-size: 22px;'><b>Most active authors<b></span>", title_x=0.5,
                    font=dict(
                              family="Courier New, monospace",
                              size=12,
                              color="white"
    ),
    paper_bgcolor = "#101126",
    plot_bgcolor = "#101126")

    fig.update_traces(marker_color='#eda109')
    fig.update_xaxes(title="Authors")
    fig.update_yaxes(title="Number of Publications", range= [0, 1.1* most_active_authors_df.occurence.max()])
    return fig

In [37]:
make_active_authors(results_df)

#### Line charts

In [42]:
#results_df.groupby('year').count()['citationCount']
fig = px.line(results_df, x=results_df.groupby('year').count()['citationCount'].index,
              y=results_df.groupby('year').count()['citationCount'], title='Publications per year')
fig.update_layout(title = "<span style='font-size: 22px;'><b>Publications per Year<b></span>", title_x=0.5,
                    font=dict(
                              family="Courier New, monospace",
                              size=12,
                              color="white"
    ),
    paper_bgcolor = "#101126",
    plot_bgcolor = "#101126")
  
fig.update_traces(marker_color='#eda109')
fig.update_xaxes(title="Year", range= [results_df.year.min() - 5, date.today().year + 5])
fig.update_yaxes(title="Number of Publications", range= [0, 1.1* results_df.groupby('year').count()['citationCount'].max()])
fig.show()

### Network plots

In [54]:
def generate_collab_network_df(df):
    authors_list_of_list = []
    ids_list_of_list = []
    for index, row in df.iterrows():
        authors_list = []
        ids_list = []
        for dict_ in row.authors:
            authors_list.append(dict_['name'])
            ids_list.append(dict_['authorId'])
        authors_list_of_list.append(authors_list)
        ids_list_of_list.append(ids_list)
    authors_combinations = []
    ids_combinations = []
    for authors in authors_list_of_list:
        res = [(a, b) for idx, a in enumerate(authors) for b in authors[idx + 1:]]
        authors_combinations.append(res)
    for ids in ids_list_of_list:
        rex = [(a, b) for idx, a in enumerate(ids) for b in ids[idx + 1:]]
        ids_combinations.append(rex)
    flat_authors_combinations = utils.flatten_list(authors_combinations)
    flat_ids_combinations = utils.flatten_list(ids_combinations)
    most_common_collab = Counter(flat_authors_combinations).most_common(50)
    most_common_collab_ids = Counter(flat_ids_combinations).most_common(50)
    unpacked_most_collab = [(a, b, c) for (a, b ), c in most_common_collab]
    unpacked_most_collab_ids = [(a, b, c) for (a, b ), c in most_common_collab_ids]
    #print(unpacked_most_collab_ids)
    nx_df = pd.DataFrame(unpacked_most_collab, columns=['author1', 'author2', 'weight'])
    nx_id_df = pd.DataFrame(unpacked_most_collab_ids, columns=['id1', 'id2', 'weight1'])
    collabs_df = pd.concat([nx_df, nx_id_df], axis=1)
    collabs_df['author1'] = list(zip(collabs_df.author1, collabs_df.id1))
    collabs_df['author2'] = list(zip(collabs_df.author2, collabs_df.id2))
    collabs_df.drop(['id1', 'id2', 'weight1'], axis = 1, inplace = True)
    return collabs_df

In [63]:
def generate_graph_elements_collab(df):
    nx_df = generate_collab_network_df(df)
    unique_top_authors = list(set(nx_df.author1.unique().tolist() + nx_df.author2.unique().tolist()))
    nodes_list = [{'data': {'id': unique_top_authors[0][1], 'label': unique_top_authors[0][0]}, 'classes': 'author'}]
    for element in unique_top_authors[1:]:
        nodes_list.append({'data': {'id': element[1], 'label': element[0]}, 'classes': 'author'})
    edges_list = [{'data': {'source': nx_df['author1'][0][1], 'target': nx_df['author2'][0][1]}, 'classes': 'collaboration'}]
    for index, row in nx_df.iterrows():
        edges_list.append({'data': {'source': row.author1[1], 'target': row.author2[1]}, 'classes': 'collaboration'})
    elements = nodes_list + edges_list
    #print(elements)
    return elements

In [118]:
def generate_ref_network_df(df1, df2):
    """df1 = all_references_df
     df2 = results_df"""
    ref1 = []
    ref2 = []
    for index, row in df1.iterrows():
        ref1.append((row.reference,row['paperId']))
        ref2.append(("".join(df2.reference[df2.paperId == row['citedBy']]), row['citedBy']))
    ref_network_df = pd.DataFrame(
    {'ref1': ref1,
     'ref2': ref2
    })
    return ref_network_df

In [133]:
def generate_graph_elements_network(df1, df2):
    ref_network_df = generate_ref_network_df(df1, df2)
    unique_refs = list(set(ref_network_df.ref1.unique().tolist()))
    unique_results = list(set(ref_network_df.ref2.unique().tolist()))
    print(unique_results)
    nodes_refs = [{'data': {'id': unique_refs[0][1], 'label': unique_refs[0][0], 'classes': 'ref'}}]
    nodes_results = [{'data': {'id': unique_results[0][1], 'label': unique_results[0][0], 'classes': 'res'}}]
    nodes_list = nodes_refs + nodes_results
    for element in unique_refs[1:]:
        nodes_list.append({'data': {'id': element[1], 'label': element[0], 'classes': 'ref'}})
    for element in unique_results[1:]:
        nodes_list.append({'data': {'id': element[1], 'label': element[0], 'classes': 'res'}})
    edges_list = [{'data': {'source': ref_network_df['ref1'][0][1], 'target': ref_network_df['ref2'][0][1]}, 'classes': 'citation'}]
    for index, row in ref_network_df.iterrows():
        edges_list.append({'data': {'source': row.ref1[1], 'target': row.ref2[1]}, 'classes': 'citation'})
    elements = nodes_list + edges_list
    return elements

In [134]:
generate_graph_elements_network(all_references_df, results_df)

[('L. Hays et al. (2017)', '0e5ab9a0d7e84b33e627f655777abc5bd3eda272'), ('L. Hays, D. Beaty (2017)', 'be0a6eb06af6231edf9378247ac9e154add68770'), ('S. Shkolyar, J. Farmer (2018)', '9a40cc604ec5720a627103a982df6089fa6298f6'), ('F. Da Pieve et al. (2020)', '15f56931cbfc6a0e2296be1cb9fd4d8fe4217781'), ('Jason E. French, D. Blake (2016)', '8bd506fadaa7c5b5fdfa66d1c61f408395a08067'), ('S. Shkolyar et al. (2021)', 'a491bc56c596f169dedd8eebc2346a987b277dcc'), ('C. Weisbin et al. (2013)', '61d144bd68693abd0ceafd1275969e8596b8c34b'), ('Joost W. Aerts et al. (2020)', 'f1376f87575a361568df4b3d891b5e6388519cfd'), ('J. Stromberg et al. (2013)', '36d23ffa810754651adb7ef60b52c814ef45d993'), ('J. Stromberg et al. (2019)', '73e782e5532960f9c54cf2219ffdff0afbe5c308'), ('M. Floyd et al. (2018)', 'e9040c638702931f009145cf6126874672585e52'), ('R. Summons et al. (2011)', '958b808c4e32488bf9f484c7de0b928ddfe94cff')]


[{'data': {'id': '693dd2f835afd4eec2020f27f60e551c91ef4034',
   'label': 'K. Lalonde et al. (2012)',
   'classes': 'ref'}},
 {'data': {'id': '0e5ab9a0d7e84b33e627f655777abc5bd3eda272',
   'label': 'L. Hays et al. (2017)',
   'classes': 'res'}},
 {'data': {'id': '71bb4b88cef499cd62af8766123de489bf870417',
   'label': 'I. Reid et al. (2006)',
   'classes': 'ref'}},
 {'data': {'id': 'df23a6cb74e89fa0e979c4d1a7cc76a94577414a',
   'label': 'S. Shkolyar et al. (2018)',
   'classes': 'ref'}},
 {'data': {'id': '7f15ea35e9181b230fad207c44ee106bb84c982d',
   'label': 'R. B. Norman et al. (2016)',
   'classes': 'ref'}},
 {'data': {'id': '14ac33a1fb626040f3019947bfe95903ebe46d99',
   'label': 'B. Schubert et al. (2009)',
   'classes': 'ref'}},
 {'data': {'id': '039048dfde63b3504289cd4204cef34f83f2e675',
   'label': 'C. Cockell, A. Herrera (2008)',
   'classes': 'ref'}},
 {'data': {'id': '680b0bfaebae4fdb7daf4bba404229a4d9cdd75b',
   'label': 'A. Lambert et al. (1998)',
   'classes': 'ref'}},
 {'da

In [129]:
def get_paper_info(paper_id):
    url = f"https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=paperId,title,url,abstract,venue,year,referenceCount,citationCount,isOpenAccess,fieldsOfStudy"
    response = requests.get(url).json()
    return response