# Test `semantic_api()`

In [24]:
import pandas as pd
import numpy as np
from datetime import date
from collections import Counter

from startupjh import utils
from startupjh import plots
from startupjh.data_collection import consolidated_df
from startupjh.data_preprocessing import data_preprocess
from startupjh.data_preprocessing import data_cleaning, data_enrichment
from startupjh.data_collection import semantic_api

import plotly.graph_objs as go
import plotly.express as px

%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Data collection for Semantic Scholar

In [25]:
# Run the Semantic Scholar search. The call is interactive: it asks for the
# key words on stdin (see the "Enter key words:" prompt in this cell's output).
results_df, all_references_df, total_results, query = semantic_api.get_all_results_from_semantic_scholar()

Enter key words: mars biosignature


## Data visualization for Semantic Scholar

### Basic plots

In [64]:
# Pie chart from the search results; presumably the access-type split of the
# papers -- confirm against startupjh.plots.make_access_pie.
plots.make_access_pie(results_df, 'semantic_scholar')

In [65]:
# Presumably the number of publications per year for the search results --
# confirm against startupjh.plots.make_pub_per_year.
plots.make_pub_per_year(results_df, 'semantic_scholar')

In [66]:
# Presumably the citation counts per year for the search results --
# confirm against startupjh.plots.make_citations_per_year.
plots.make_citations_per_year(results_df, 'semantic_scholar')

In [67]:
# `query` is the fourth value returned by the data-collection call above;
# presumably it is used to relate the top key words to the search terms --
# confirm against startupjh.plots.make_top_key_words.
plots.make_top_key_words(results_df, query)

#### Fields of Study

In [16]:
# Pull the per-paper `fieldsOfStudy` entries, discard the falsy ones
# (e.g. None or an empty list), then hand the rest to utils.flatten_list
# (presumably yielding one flat list of field names -- confirm in utils).
test_list = results_df["fieldsOfStudy"].tolist()
res = list(filter(None, test_list))
flat_list_fields = utils.flatten_list(res)

In [18]:
# Tally every field and tabulate the (field, count) pairs, most frequent first.
most_common_fields = Counter(flat_list_fields).most_common()
most_common_fields_df = pd.DataFrame(
    data=most_common_fields,
    columns=["field", "occurence"],
)
most_common_fields_df

Unnamed: 0,field,occurence
0,Geology,19
1,Medicine,6
2,Environmental Science,5
3,Chemistry,3
4,Materials Science,1


In [21]:
def make_fields_pie(df):
    """Build a pie chart of how often each field of study occurs in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``fieldsOfStudy`` column. Every truthy entry of that column is
        passed to ``utils.flatten_list`` (presumably each entry is a list of
        field names for one paper -- confirm against the collector).

    Returns
    -------
    plotly Figure
        The figure produced by ``px.pie`` with one slice per distinct field,
        sized by its number of occurrences, styled with the dark theme.
    """
    # Falsy entries (None, empty list) carry no field information.
    fields_per_paper = [fields for fields in df.fieldsOfStudy.tolist() if fields]
    flat_list_fields = utils.flatten_list(fields_per_paper)

    most_common_fields_df = pd.DataFrame(
        Counter(flat_list_fields).most_common(),
        columns=["field", "occurence"],
    )

    fig = px.pie(most_common_fields_df, values='occurence', names='field')
    fig.update_layout(
        # The bold tag must be closed with </b>; a second <b> leaves the markup unbalanced.
        title="<span style='font-size: 22px;'><b>Fields of Study</b></span>",
        title_x=0.5,
        font=dict(
            family="Courier New, monospace",
            size=14,
            color="white",
        ),
        paper_bgcolor="#101126",
        plot_bgcolor="#101126",
    )
    return fig

In [23]:
# Fields-of-study pie for the references (all_references_df), not for the search results.
make_fields_pie(all_references_df)

#### Authors

In [27]:
# One entry per (paper, author) pair: an author who appears on several
# papers is listed several times, which is what the tally below relies on.
authors_list = [
    author['name']
    for paper_authors in results_df.authors
    for author in paper_authors
]
authors_list

['K. Campbell',
 'D. Guido',
 'J. Farmer',
 'M. V. Kranendonk',
 'S. Ruff',
 'F. Westall',
 'R. Summons',
 'J. Amend',
 'D. Bish',
 'R. Buick',
 'G. Cody',
 'D. D. Des Marais',
 'G. Dromart',
 'J. Eigenbrode',
 'A. Knoll',
 'D. Sumner',
 'C. Weisbin',
 'W. Lincoln',
 'D. Papanastassiou',
 'M. Coleman',
 'K. Campbell',
 'D. Guido',
 'J. Farmer',
 'M. V. Kranendonk',
 'S. Ruff',
 'F. Westall',
 'Joost W. Aerts',
 'A. Riedo',
 'Daniel J. Melton',
 'Simone Martini',
 'J. Flahaut',
 'U. Meierhenrich',
 'C. Meinert',
 'I. Myrgorodska',
 'R. Lindner',
 'P. Ehrenfreund',
 'A. Gangidine',
 'J. Havig',
 'A. Czaja',
 'F. Da Pieve',
 'G. Gronoff',
 'J. Guo',
 'C. Mertens',
 'L. Neary',
 'B. Gu',
 'N. Koval',
 'J. Kohanoff',
 'A. Vandaele',
 'F. Cleri',
 'M. Nachon',
 'R. Ewing',
 'M. Tice',
 'J. Stromberg',
 'A. Parkinson',
 'M. Morison',
 'E. Cloutis',
 'N. Casson',
 'D. Applin',
 'J. Poitras',
 'A. M. Martí',
 'C. Maggiori',
 'C. Cousins',
 'L. Whyte',
 'R. Kruzelecky',
 'D. Das',
 'R. Léveillé'

In [30]:
# Count how many times each author name occurs and tabulate the
# (author, count) pairs, most frequent first.
most_active_authors = Counter(authors_list).most_common()
most_active_authors_df = pd.DataFrame(
    data=most_active_authors,
    columns=["author", "occurence"],
)

In [31]:
# Display the author ranking (columns: author, occurence).
most_active_authors_df

Unnamed: 0,author,occurence
0,J. Farmer,6
1,A. Czaja,4
2,S. Shkolyar,4
3,A. Gangidine,3
4,K. Campbell,2
...,...,...
157,A. Santo,1
158,J. Ferdosi,1
159,M. Konstantindis,1
160,K. Cote,1


In [36]:
def make_active_authors(df):
    """Build a bar chart of the ten authors with the most papers in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``authors`` column whose entries are iterables of dicts
        with a ``'name'`` key. Rows whose entry is missing (None or NaN)
        are ignored.

    Returns
    -------
    plotly.graph_objs.Figure
        One bar per author (at most ten), bar height = number of papers the
        author name appears on.
    """
    authors_list = [
        author['name']
        for paper_authors in df.authors
        # A missing author list shows up as None or float NaN; iterating it would raise.
        if paper_authors is not None and not isinstance(paper_authors, float)
        for author in paper_authors
    ]
    most_active_authors_df = pd.DataFrame(
        Counter(authors_list).most_common(),
        columns=["author", "occurence"],
    )
    # most_common() is sorted by count, so the first ten rows are the top ten.
    top_authors = most_active_authors_df.head(10)

    fig = go.Figure(data=[go.Bar(x=top_authors.author,
                                 y=top_authors.occurence,
                                 texttemplate="%{y}",
                                 textposition="outside",
                                 textangle=0)])
    fig.update_layout(
        # The bold tag must be closed with </b>; a second <b> leaves the markup unbalanced.
        title="<span style='font-size: 22px;'><b>Most active authors</b></span>",
        title_x=0.5,
        font=dict(
            family="Courier New, monospace",
            size=12,
            color="white",
        ),
        paper_bgcolor="#101126",
        plot_bgcolor="#101126",
    )

    fig.update_traces(marker_color='#eda109')
    fig.update_xaxes(title="Authors")

    yaxis_options = {"title": "Number of Publications"}
    if not most_active_authors_df.empty:
        # 10% headroom so the "outside" bar labels are not clipped.
        # Skipped for an empty frame, where max() would be NaN.
        yaxis_options["range"] = [0, 1.1 * most_active_authors_df.occurence.max()]
    fig.update_yaxes(**yaxis_options)
    return fig

In [37]:
# Bar chart of the most frequent author names in the search results.
make_active_authors(results_df)

#### Line charts

In [42]:
# Papers per publication year, computed once and reused for the trace and the
# y-axis range. Note: this counts rows with a non-null citationCount per year.
pubs_per_year = results_df.groupby('year').count()['citationCount']

fig = px.line(results_df, x=pubs_per_year.index, y=pubs_per_year)
fig.update_layout(
    # The bold tag must be closed with </b>; a second <b> leaves the markup unbalanced.
    title="<span style='font-size: 22px;'><b>Publications per Year</b></span>",
    title_x=0.5,
    font=dict(
        family="Courier New, monospace",
        size=12,
        color="white",
    ),
    paper_bgcolor="#101126",
    plot_bgcolor="#101126",
)

fig.update_traces(marker_color='#eda109')
# Pad the x-axis by five years on each side of the observed span (up to today).
fig.update_xaxes(title="Year", range=[results_df.year.min() - 5, date.today().year + 5])
# 10% headroom above the busiest year.
fig.update_yaxes(title="Number of Publications", range=[0, 1.1 * pubs_per_year.max()])
fig.show()

### Network plots

In [7]:
def generate_ref_network_df(df1, df2):
    """Pair every reference with the paper that cites it.

    Parameters
    ----------
    df1 : pandas.DataFrame
        References table (``all_references_df`` in this notebook). Needs the
        columns ``reference`` and ``citedBy``; ``citedBy`` holds the
        ``paperId`` of the citing paper.
    df2 : pandas.DataFrame
        Results table (``results_df`` in this notebook). Needs the columns
        ``paperId`` and ``reference``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df1`` (in the same order, fresh 0..n-1 index)
        with columns ``ref1`` (the reference) and ``ref2`` (the ``reference``
        value of the citing paper). ``ref2`` is NaN when ``citedBy`` matches
        no ``paperId`` in ``df2``.
    """
    # Build a paperId -> reference lookup. A Series used with .map needs a
    # unique index, so the first occurrence wins if a paperId is duplicated.
    citing_reference_by_id = (
        df2.drop_duplicates(subset="paperId")
        .set_index("paperId")["reference"]
    )
    # Each lookup yields exactly one scalar per reference row, whether the
    # citing paper is found once, several times, or not at all.
    return pd.DataFrame({
        "ref1": df1["reference"].to_numpy(),
        "ref2": df1["citedBy"].map(citing_reference_by_id).to_numpy(),
    })

In [35]:
def generate_graph_elements_network(df1, df2):
    """Build the node and edge dictionaries of the citation network.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Passed unchanged to ``generate_ref_network_df``, which returns a frame
        with the columns ``ref1`` (reference) and ``ref2`` (citing paper).

    Returns
    -------
    list of dict
        Nodes first, then edges. Nodes look like
        ``{'data': {'id': ..., 'label': ...}, 'classes': 'ref' | 'res'}``;
        edges look like
        ``{'data': {'source': ref1, 'target': ref2}, 'classes': 'citation'}``.
        The list is empty when there are no reference/paper pairs.
    """
    # A pair with a missing end cannot become an edge, so drop it up front.
    ref_network_df = generate_ref_network_df(df1, df2).dropna(subset=['ref1', 'ref2'])

    # unique() keeps first-seen order, so the output is deterministic.
    result_ids = ref_network_df.ref2.unique().tolist()
    result_id_set = set(result_ids)
    # A title that is both cited and a search result gets a single node
    # (class 'res'): node ids must be unique in the element list
    # (assumes a Cytoscape-style consumer -- confirm with the caller).
    reference_ids = [
        ref_id
        for ref_id in ref_network_df.ref1.unique().tolist()
        if ref_id not in result_id_set
    ]

    nodes_list = [
        {'data': {'id': ref_id, 'label': ref_id}, 'classes': 'ref'}
        for ref_id in reference_ids
    ]
    nodes_list += [
        {'data': {'id': res_id, 'label': res_id}, 'classes': 'res'}
        for res_id in result_ids
    ]

    # Exactly one edge per (reference, citing paper) row.
    edges_list = [
        {'data': {'source': source, 'target': target}, 'classes': 'citation'}
        for source, target in zip(ref_network_df.ref1, ref_network_df.ref2)
    ]
    return nodes_list + edges_list