In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import polars as pl

# Base data directory: the input CSVs are scanned from here and the derived
# CSVs produced by this notebook are written back into it.
LOCAL_DATA_PATH = './data/History'

In [2]:
# Lazily scan the two input tables; no rows are read until a later
# .collect() / .sink_csv() call executes the query.
authors_csv = f'{LOCAL_DATA_PATH}/ssn_paper_authors.csv'
refs_csv = f'{LOCAL_DATA_PATH}/ssn_paper_refs.csv'

auth_net = pl.scan_csv(authors_csv, separator=',')  # paper -> author rows
cite_net = pl.scan_csv(refs_csv, separator=',')     # citing paper -> cited paper rows

In [3]:
# Preview the first ten rows of the paper/author table.
auth_net.head(10).collect()

PaperID,AuthorID,AffiliationID,AuthorSequenceNumber
i64,i64,f64,i64
587241,2263987875,,1
587241,3211722442,,2
1022736,2618912518,,1
1246380,2673776004,,1
2795301,3119558885,,1
5159116,2104723406,134820265.0,1
5504347,2506040316,,1
5924287,2987485932,,1
8765881,2283045556,,1
8765881,2680658348,,2


In [4]:
# Preview the first ten citation rows after ordering by CitationDate.
(
    cite_net
    .sort("CitationDate")
    .head(10)
    .collect()
)

Citing_PaperID,Cited_PaperID,CitationDate
i64,i64,str
2947083361,1977784628,"""2019-01-01"""
2991573790,2939648321,"""2019-01-01"""
2997498118,2940084900,"""2019-01-01"""
2997370306,2981802544,"""2019-01-01"""
2997370306,2946707887,"""2019-01-01"""
2997148192,2997092020,"""2019-01-01"""
2997148192,2943952542,"""2019-01-01"""
3111404574,2999915826,"""2019-01-01"""
3111067421,2911624371,"""2019-01-01"""
2997148192,376344434,"""2019-01-01"""


In [5]:
# Expand every citation (citing paper -> cited paper) to author level by
# joining the author table once per side of the citation. Both joins are
# inner joins, so a citation is kept only if both papers have author rows,
# and it yields one row per (citing author, cited author) combination.
res = (
    cite_net
    # Attach the authors of the citing paper ...
    .join(auth_net, left_on='Citing_PaperID', right_on='PaperID', how='inner')
    # ... and tag their columns so the second join cannot collide with them.
    .rename({
        "AuthorID": "Citing_AuthorID",
        "AffiliationID": "Citing_AffiliationID",
        "AuthorSequenceNumber": "Citing_AuthorSequenceNumber",
    })
    # Attach the authors of the cited paper and tag those columns as well.
    .join(auth_net, left_on='Cited_PaperID', right_on='PaperID', how='inner')
    .rename({
        "AuthorID": "Cited_AuthorID",
        "AffiliationID": "Cited_AffiliationID",
        "AuthorSequenceNumber": "Cited_AuthorSequenceNumber",
    })
    .sort("CitationDate")
)
# `res` stays lazy for the cells below; collect here only to display it.
res.collect()

Citing_PaperID,Cited_PaperID,CitationDate,Citing_AuthorID,Citing_AffiliationID,Citing_AuthorSequenceNumber,Cited_AuthorID,Cited_AffiliationID,Cited_AuthorSequenceNumber
i64,i64,str,i64,f64,i64,i64,f64,i64
2974912297,291320079,"""2019-01-01""",2995574826,,1,2119766304,,1
2967191196,376344434,"""2019-01-01""",2967420241,1.1701301e7,1,2118051611,,1
2997148192,376344434,"""2019-01-01""",2987284292,,1,2118051611,,1
2973048827,565054991,"""2019-01-01""",3173553483,883357.0,1,211881252,,1
2973048827,565054991,"""2019-01-01""",3173553483,883357.0,1,2946999897,,3
2973048827,565054991,"""2019-01-01""",3173553483,883357.0,1,3157821370,,2
2979444285,568305466,"""2019-01-01""",2287590710,,1,2032850205,,1
2969077039,569479482,"""2019-01-01""",2967383879,,4,2013158504,,2
2969077039,569479482,"""2019-01-01""",2968641274,,3,2013158504,,2
2969077039,569479482,"""2019-01-01""",2968675149,,1,2013158504,,2


In [8]:
# Save the full author-level citation table; sink_csv executes the lazy
# query and writes the result straight to the CSV file.
combined_csv = f'{LOCAL_DATA_PATH}/ssn_author_ref_combined.csv'
res.sink_csv(combined_csv)

In [9]:
# A simplified form with only the edge info (citing paper -> cited paper).
# NOTE(review): rows are not de-duplicated, so a paper pair repeats once per
# citing-author / cited-author combination -- confirm that is intended.
simplified_res = res.select('Citing_PaperID', 'Cited_PaperID')
simplified_csv = f'{LOCAL_DATA_PATH}/simplified_ssn_author_ref_combined.csv'
simplified_res.sink_csv(simplified_csv)

In [2]:
from read_networks import read_combined_graph_from_csv

# Build the graphs through the project helper, handing it an empty undirected
# graph to use. The third return value is not needed in this notebook.
# NOTE(review): presumably the helper reads the combined CSVs written above --
# confirm in read_networks.
G_simple, G_init, _ = read_combined_graph_from_csv(
    field='History',
    graph=nx.Graph(),
)
# One summary line per graph.
print(G_simple, G_init, sep='\n')

      Citing_PaperID  Cited_PaperID CitationDate  Citing_AuthorID  \
0         2974912297      291320079   2019-01-01       2995574826   
1         2967191196      376344434   2019-01-01       2967420241   
2         2997148192      376344434   2019-01-01       2987284292   
3         2973048827      565054991   2019-01-01       3173553483   
4         2973048827      565054991   2019-01-01       3173553483   
...              ...            ...          ...              ...   
5875      2998056687     3039815431   2019-12-31       2297037455   
5876      2998056687     3039815431   2019-12-31       2297037455   
5877      2998056687     3039815431   2019-12-31       2297037455   
5878      2998056687     3039815431   2019-12-31       2297037455   
5879      2998056687     3039815431   2019-12-31       2297037455   

      Citing_AffiliationID  Citing_AuthorSequenceNumber  Cited_AuthorID  \
0                      NaN                            1      2119766304   
1               11701

In [3]:
# Toy example: build a three-row edge table with two numeric attribute columns
# and convert it to a graph with nx.from_pandas_edgelist.
# (pandas / numpy come from the notebook's top import cell.)
pd.options.display.max_columns = 20

# Legacy RandomState is kept on purpose so the sampled integers stay the same.
rng = np.random.RandomState(seed=5)
ints = rng.randint(1, 11, size=(3, 2))  # 3 edges x 2 attributes, values in [1, 10]
a = ["A", "B", "C"]
b = ["D", "A", "E"]

df = pd.DataFrame(ints, columns=["weight", "cost"])
df[0] = a     # source-node column, stored under the integer label 0
df["b"] = b   # target-node column

# Source column is `0`, target column is "b"; "weight" and "cost" are passed
# as the edge attribute columns.
G = nx.from_pandas_edgelist(df, 0, "b", ["weight", "cost"])

In [5]:
# Read the combined author-level citation CSV back with pandas. The path is
# built from LOCAL_DATA_PATH so it cannot drift from the location the table
# was written to by sink_csv. (pandas comes from the top import cell.)
df = pd.read_csv(f'{LOCAL_DATA_PATH}/ssn_author_ref_combined.csv', sep=',')
print(df['Citing_PaperID'])

0        2974912297
1        2967191196
2        2997148192
3        2973048827
4        2973048827
            ...    
25144    3081119705
25145    3118959478
25146    3118959478
25147    3118959478
25148    3119629698
Name: Citing_PaperID, Length: 25149, dtype: int64
