In [1]:
import os
from collections import Counter
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import polars as pl
import powerlaw
from scipy.stats import lognorm

# Root directory under which both the SciSciNet source tables (sciscinet/) and the
# derived per-field network files (kk929/network/) are addressed in this notebook.
DATA_PATH = '/data'

# From server to local

In [None]:
# SciSciNet: every source table is tab-separated and opened as a polars LazyFrame.
def _scan_tsv(path: str) -> pl.LazyFrame:
    """Lazily scan a tab-separated file at ``path``."""
    return pl.scan_csv(path, separator='\t')


ssn_auth_lazy = _scan_tsv(f"{DATA_PATH}/sciscinet/SciSciNet_Authors.tsv")
ssn_paper_lazy = _scan_tsv(f"{DATA_PATH}/sciscinet/SciSciNet_Papers.tsv")
ssn_paper_auth_affil_lazy = _scan_tsv(f"{DATA_PATH}/sciscinet/SciSciNet_PaperAuthorAffiliations.tsv")
ssn_paper_ref_lazy = _scan_tsv(f'{DATA_PATH}/sciscinet/SciSciNet_PaperReferences_Date.tsv')

In [None]:
# PaperID -> FieldID table, scanned lazily.
paper_fields_tsv = f"{DATA_PATH}/sciscinet/SciSciNet_PaperFields.tsv"
ssn_paper_field_lazy = pl.scan_csv(paper_fields_tsv, separator='\t')

# Preview: materialise only the first five rows.
ssn_paper_field_lazy.head(5).collect()

PaperID,FieldID,Hit_1pct,Hit_5pct,Hit_10pct,C_f
i64,i64,f64,f64,f64,f64
18936224,71924100,0.0,0.0,0.0,0.731404
94748468,144024400,0.0,0.0,0.0,0.0
113998959,71924100,0.0,0.0,0.0,0.270789
56740461,144133560,0.0,0.0,0.0,0.328162
23,71924100,0.0,0.0,0.0,0.169123


In [None]:
# field: lookup table with one row per FieldID (name and type)
fields_tsv = f"{DATA_PATH}/sciscinet/SciSciNet_Fields.tsv"
ssn_fields = pl.read_csv(fields_tsv, separator='\t')

# Show only the fields whose Field_Type is 'Top'.
is_top_field = pl.col('Field_Type') == 'Top'
ssn_fields.filter(is_top_field)

FieldID,Field_Name,Field_Type
i64,str,str
95457728,"""History""","""Top"""
127313418,"""Geology""","""Top"""
162324750,"""Economics""","""Top"""
205649164,"""Geography""","""Top"""
185592680,"""Chemistry""","""Top"""
138885662,"""Philosophy""","""Top"""
144024400,"""Sociology""","""Top"""
192562407,"""Materials scie…","""Top"""
33923547,"""Mathematics""","""Top"""
86803240,"""Biology""","""Top"""


In [None]:
# SciSciNet FieldIDs of the two fields extracted below.
# NOTE(review): the History ID matches the 'Top' field listing displayed above; Physics is
# not among the rows visible in that (truncated) output — confirm its ID against
# SciSciNet_Fields.tsv.
FIELD_ID_Physics = 121332964
FIELD_ID_History = 95457728

In [None]:
def extract_papers(pids: list, year_min: int, year_max: int) -> pl.DataFrame:
    """Collect paper-level metadata for selected papers within a year window.

    Parameters
    ----------
    pids : list
        PaperID values to keep.
    year_min, year_max : int
        Inclusive lower and upper bounds applied to the ``Year`` column.

    Returns
    -------
    pl.DataFrame
        Matching rows of the module-level ``ssn_paper_lazy`` table, restricted to
        the identifier, year, citation/reference counts and team/institution/news
        columns listed in ``kept_columns``.
    """
    kept_columns = [
        'PaperID', 'Year', 'Citation_Count', 'C10',
        'Reference_Count', 'C5', 'Team_Size',
        'Institution_Count',  'Newsfeed_Count'
    ]
    id_is_selected = pl.col("PaperID").is_in(pids)
    year_in_window = (pl.col("Year") >= year_min) & (pl.col("Year") <= year_max)

    return (
        ssn_paper_lazy
        .filter(id_is_selected)
        .filter(year_in_window)
        .select(kept_columns)
        .collect()
    )

def extract_citation_info(pids: list) -> pl.DataFrame:
    """Collect the reference rows whose citing AND cited paper are both in ``pids``.

    Reads from the module-level ``ssn_paper_ref_lazy`` table and returns the
    matching rows as an eager DataFrame.
    """
    citing_is_selected = pl.col('Citing_PaperID').is_in(pids)
    cited_is_selected = pl.col('Cited_PaperID').is_in(pids)

    internal_refs = ssn_paper_ref_lazy.filter(citing_is_selected & cited_is_selected)
    return internal_refs.collect()

def extract_collaboration_info(pids: list) -> pl.DataFrame:
    """Collect the paper-author-affiliation rows belonging to the papers in ``pids``.

    Reads from the module-level ``ssn_paper_auth_affil_lazy`` table and returns
    the matching rows as an eager DataFrame.
    """
    paper_is_selected = pl.col('PaperID').is_in(pids)
    return ssn_paper_auth_affil_lazy.filter(paper_is_selected).collect()

In [None]:
# Fields to extract; FIELDS and FIELD_NAMES are paired by position.
FIELDS = [FIELD_ID_Physics, FIELD_ID_History]
FIELD_NAMES = ['Physics', 'History']
# zip() would silently drop entries if the two lists ever got out of sync.
assert len(FIELDS) == len(FIELD_NAMES), "FIELDS and FIELD_NAMES must have the same length"

# Inclusive publication-year window.
YEAR_MIN = 2019
YEAR_MAX = 2020

for field_id, field_name in zip(FIELDS, FIELD_NAMES):
    # Make sure the per-field output folder exists *before* the expensive scans,
    # so a missing directory cannot make write_csv fail after all the work is done.
    out_dir = f"{DATA_PATH}/kk929/network/{field_name}"
    os.makedirs(out_dir, exist_ok=True)

    # extract paper ids assigned to this field
    ssn_paper_field_selected = ssn_paper_field_lazy.filter(
        pl.col('FieldID') == field_id
    ).collect()
    pid_selected = ssn_paper_field_selected['PaperID'].to_list()

    # get paper info, restricted to the year window
    # (the name says "phy" but it holds whichever field is current)
    ssn_paper_phy = extract_papers(pid_selected, YEAR_MIN, YEAR_MAX)
    print(ssn_paper_phy.shape)
    pid_selected_by_year = ssn_paper_phy['PaperID'].to_list()

    # get citation info: only references between two papers of the selection
    ssn_paper_ref_selected = extract_citation_info(pid_selected_by_year)
    print(ssn_paper_ref_selected.shape)

    # get collaboration info: paper-author-affiliation rows of the selection
    ssn_paper_auth_affil_selected = extract_collaboration_info(pid_selected_by_year)
    print(ssn_paper_auth_affil_selected.shape)

    # save to csv
    ssn_paper_phy.write_csv(f"{out_dir}/papers.csv")
    ssn_paper_ref_selected.write_csv(f"{out_dir}/ssn_paper_refs.csv")
    ssn_paper_auth_affil_selected.write_csv(f"{out_dir}/ssn_paper_authors.csv")
    

(388157, 9)
(549666, 3)
(3099280, 4)
(164703, 9)
(6088, 3)
(231146, 4)


# Create citation network

In [2]:
# Fields whose saved network files are loaded in the cells below.
FIELD_NAMES = ['Physics', 'History']
field_name = FIELD_NAMES[0]  # single field of interest: index 0 selects 'Physics'


In [3]:
# Largest weakly connected component of each field's citation network, keyed by field name.
gs = {}
# The loop variable is deliberately NOT called `field_name`: that global was set to
# Physics in the previous cell and must not be silently overwritten by this loop.
for field in FIELD_NAMES:
    # read the within-field reference table (adjust the path if the files live elsewhere)
    paper_ref = pl.read_csv(f'{DATA_PATH}/kk929/network/{field}/ssn_paper_refs.csv')

    # create directed graph with one edge per (citing -> cited) row
    g_citation = nx.DiGraph()
    g_citation.add_edges_from(paper_ref.select(['Citing_PaperID', 'Cited_PaperID']).to_numpy())

    # get largest connected component (edge direction ignored for connectivity)
    lcc_citation = max(nx.weakly_connected_components(g_citation), key=len)
    # subgraph() only returns a view onto g_citation; copy() materialises an
    # independent graph so the full graph is not kept alive through the dict.
    g_citation_lcc = g_citation.subgraph(lcc_citation).copy()

    # add to dict
    gs[field] = g_citation_lcc

In [4]:
# Report, per field, the node and edge counts of the stored graph.
for f in gs:
    g = gs[f]
    print(f"{f} {g.number_of_nodes()} {g.number_of_edges()}")

Physics 148769 529272
History 902 968


In [9]:
# Persist each field's graph as a plain edge list (data=False: no edge data written).
for f in gs:
    g = gs[f]
    edgelist_path = f'{DATA_PATH}/kk929/network/{f}/citation_lcc.edgelist'
    nx.write_edgelist(g, edgelist_path, data=False)