In [None]:
import pandas as pd
import json as js
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import utils as u

# Apply seaborn's default theme to every figure drawn below.
sns.set()

# 1. Fields of Study

## 1.1. Preparing Fields

### 1.1.1. Extracting FieldsOfStudy

In [None]:
! tar -I pixz -xvf data/FieldsOfStudy.tar.xz --directory data/
#! tar -xvf data/FieldsOfStudy.tar.xz --directory data/
! rm data/FieldsOfStudy.tar.xz
raw_fields = pd.read_table("data/FieldsOfStudy.txt")[["FieldOfStudyId", "NormalizedName", "Level"]]
raw_fields['NormalizedName'].fillna("na", inplace = True)
fields = raw_fields[raw_fields["Level"] == 0][["FieldOfStudyId", "NormalizedName"]]

### 1.1.2. Extracting Subfield Data

In [None]:
#! tar -I pixz -xvf data/FieldOfStudyChildren.tar.xz --directory data/
! tar -xvf data/FieldOfStudyChildren.tar.xz --directory data/
! rm data/FieldOfStudyChildren.tar.xz
children = pd.read_table("data/FieldOfStudyChildren.txt")

subfields = []
for id in fields["FieldOfStudyId"]:
    subs = np.empty(0, dtype=np.int64)
    curr = np.array([id], dtype = np.int64)
    while curr.size != 0:
        tmp = children[children["FieldOfStudyId"].isin(curr)]["ChildFieldOfStudyId"]
        subs = np.append(subs, curr)
        curr = tmp
    subfields.append(subs)
fields["Subfields"] = np.array(subfields)
fields.columns = ['FieldId', 'Field', 'Subfields']

# The "computational" fields will be needed in the future
cs_subfields = fields[fields["Field"] == "computer science"]["Subfields"].values[0]
computational = raw_fields[raw_fields['NormalizedName'].str.contains("comp( |ut)")]
computational = computational[~computational['NormalizedName'].str.contains("tomograph")]  # CT scans happen
computational = [f for f in computational['FieldOfStudyId'] if f not in cs_subfields ]
fields = fields.append(
    {"FieldId":-1, "Field":"computational", "Subfields":np.array(computational)}, 
    ignore_index=True)

fields.reset_index(drop=True).to_feather("data/Fields")
! rm data/FieldsOfStudy.txt
! rm data/FieldOfStudyChildren.txt

## 1.2. Reducing PaperFieldsOfStudy Data

### 1.2.1. Extracting PaperFieldsOfStudy Data

In [None]:
# Unpack the PaperFieldsOfStudy archive into data/ (the pixz variant is kept
# below as a commented-out alternative), then delete the archive.
#! tar  -I pixz -xvf data/PaperFieldsOfStudy.tar.xz --directory data/
! tar -xvf data/PaperFieldsOfStudy.tar.xz --directory data/
! rm data/PaperFieldsOfStudy.tar.xz
# Split the extracted text file into parts under data/PaperFieldsOfStudy/;
# `split -C 2G` puts at most 2G of whole lines into each part.
! mkdir data/PaperFieldsOfStudy
! split -C 2G data/PaperFieldsOfStudy.txt data/PaperFieldsOfStudy/PaperFieldsOfStudy.

In [None]:
# Gauge the strength of the paper-field associations by plotting the
# distribution of their scores over all parts.
schema = u.load_schema("PaperFieldsOfStudy")

# Collect the float16 scores of each part, then join them once at the end.
score_chunks = [np.empty(0, dtype = np.float16)]
for part in os.listdir("data/PaperFieldsOfStudy"):
    part_frame = pd.read_table(f"data/PaperFieldsOfStudy/{part}", names = schema)
    score_chunks.append(part_frame["Score"].values.astype('float16'))
scores = np.concatenate(score_chunks)

fig, ax = plt.subplots()
ax.hist(scores, bins=40)
ax.set(xlabel="Field Score", ylabel="Number of Associations\n(100 millions)")
# NOTE(review): the other figures in this notebook are saved under images/ —
# confirm that plots/ is the intended directory here.
fig.savefig("plots/FieldScoresDistribution.png")

### 1.2.2. Reducing Fields to the Top Level

In [None]:
subf_papers = set([])
comp_papers = set([])
fields = pd.read_feather("data/Fields")
comp_fields = fields[fields["Field"] == "computational"]["Subfields"].values[0]
fields = fields[fields["Field"] != "computational"]
top_dict = {key:fields[fields["FieldId"] == key]["Subfields"].values[0] for key in fields["FieldId"].values}
cs_id = fields[fields["Field"] == "computer science"]["FieldId"].values[0]
schema = u.load_schema("PaperFieldsOfStudy")
schema[1] = "FieldId"


for part in os.listdir("data/PaperFieldsOfStudy"):
    tmp = pd.read_table("data/PaperFieldsOfStudy/"+part, names = schema)
    # Eliminate score outliers, i.e. associations with too low scores
    tmp = tmp[tmp["Score"] > .05][["FieldId", "PaperId"]]
    # Set aside these target papers
    comp_papers.update(tmp[tmp['FieldId'].isin(comp_fields)]['PaperId'].unique().tolist())
    # Convert all fields to the top field, taking care of intersections
    current = []
    for top in top_dict.keys():
        papers = tmp[tmp["FieldId"].isin(top_dict[top])]["PaperId"].unique()
        current.append(pd.DataFrame({"PaperId":papers,"FieldId":top}))
        # Set aside these target papers
        if top == cs_id:
            subf_papers.update(papers.tolist())
    pd.concat(current).reset_index(drop=True).to_feather("data/PaperFieldsOfStudy/"+part)

conditions = ["CS"]*len(subf_papers) + ["comp"]*len(comp_papers)
papers = list(subf_papers) + list(comp_papers)
pd.DataFrame({"PaperId":np.array(papers), "Condition":conditions}).to_feather("data/CitedPapers")

! rm data/PaperFieldsOfStudySchema.txt

## 1.3. Splitting by Field

In [None]:
fields = pd.read_feather("data/Fields")
parts = os.listdir("data/PaperFieldsOfStudy")
for field in fields["FieldId"].unique()[:-1]:
    fi_papers = []
    for part in parts:
        tmp = pd.read_feather("data/PaperFieldsOfStudy/"+part)
        fi_papers.append(tmp[tmp["FieldId"] == field][["PaperId"]])
    tmp = pd.concat(fi_papers).drop_duplicates()
    tmp.reset_index(drop = True).to_feather("data/PaperFieldsOfStudy/"+str(field))

! rm data/PaperFieldsOfStudy/Paper*
! mv data/PaperFieldsOfStudy/ data/PaperFields

# 2. Papers

## 2.1. Extracting Paper Data

In [None]:
# Unpack the Papers archive into data/ (the pixz variant is kept below as a
# commented-out alternative), then delete the archive.
! tar -xvf data/Papers.tar.xz --directory data/
#! tar -I pixz -xvf data/Papers.tar.xz --directory data/
! rm data/Papers.tar.xz
# Split the extracted text file into parts under data/Papers/;
# `split -C 3G` puts at most 3G of whole lines into each part.
! mkdir data/Papers
! split -C 3G data/Papers.txt data/Papers/Papers.

## 2.2. Pruning Columns

In [None]:
schema = u.load_schema("Papers")

for part in os.listdir("data/Papers"):
    tmp = pd.read_table("data/Papers/"+part, names = schema, 
        usecols = ["PaperId", "PaperTitle", "Date", "OnlineDate"])
    tmp.drop_duplicates(["PaperTitle", "Date"], inplace=True)
    tmp[["PaperId", "Date", "OnlineDate"]].reset_index(drop=True).to_feather("data/Papers/"+part)

# NOTE: duplicates appear
# NOTE: DocSubTypes is not in the dataset; it needs to be taken out from PapersSchema.txt
! rm data/PapersSchema.txt
! rm data/Papers.txt

## 2.3. Fixing Dates

In [None]:
# Parse the publication dates, keep only the papers that have one, and count
# how many papers were dropped for lacking a date.
no_date = 0
for part in os.listdir("data/Papers"):
    part_path = f"data/Papers/{part}"
    dated = pd.read_feather(part_path)
    dated["Date"] = pd.to_datetime(dated["Date"])
    has_date = dated["Date"].notna()
    dated.loc[has_date, ["PaperId", "Date"]].reset_index(drop=True).to_feather(part_path)
    no_date += int((~has_date).sum())

print(no_date) # 6679

# 3. PaperReferences

## 3.1. Extracting PaperReferences

In [None]:
# Unpack the PaperReferences archive into data/ (the pixz variant is kept below
# as a commented-out alternative), then delete the archive.
! tar -xvf data/PaperReferences.tar.xz --directory data/
#! tar -I pixz -xvf data/PaperReferences.tar.xz --directory data/
! rm data/PaperReferences.tar.xz
# Split the extracted text file into parts under data/PaperReferences/;
# `split -C 2G` puts at most 2G of whole lines into each part.
! mkdir data/PaperReferences
! split -C 2G data/PaperReferences.txt data/PaperReferences/PaperReferences.

In [None]:
# Serialize
schema = u.load_schema("PaperReferences")

for part in os.listdir("data/PaperReferences"):
    tmp = pd.read_table("data/PaperReferences/"+part, names = schema)
    tmp.to_feather("data/PaperReferences/"+part)

! rm data/PaperReferencesSchema.txt

## 3.2. Extracting References

In [None]:
! mkdir data/References

fields = pd.read_feather("data/Fields")["FieldId"].unique()[:-1]
parts = os.listdir("data/PaperReferences")
for field in fields:
    papers = pd.read_feather("data/PaperFields/"+str(field))["PaperId"].values
    refs = []
    for part in parts:
        tmp = pd.read_feather("data/PaperReferences/"+part)
        tmp = tmp[tmp["PaperId"].isin(papers)][["PaperId"]]
        tmp["References"] = 1
        refs.append(tmp.groupby("PaperId").sum().reset_index()) # Required for memory saving
    refs = pd.concat(refs)
    refs = refs.groupby("PaperId").sum().reset_index()
    refs.to_feather("data/References/"+str(field))

In [None]:
# Merge the per-field reference counts into one "overall" table stored as
# data/References/0. Identical (PaperId, References) rows coming from several
# fields are collapsed by drop_duplicates().
# With low processing power, the following can be performed "in log form" separately (i.e. recursively 2 by 2)
overall = [
    pd.read_feather(f"data/References/{part}")
    for part in os.listdir("data/References")
]
# drop=True discards the old row labels instead of writing them to the file as
# a spurious "index" column.
pd.concat(overall).drop_duplicates().reset_index(drop=True).to_feather("data/References/0")

## 3.3. Extracting Citations

### 3.3.1. Pruning Unneeded References

In [None]:
# Keep only the reference rows whose cited paper is listed in data/CitedPapers,
# and count how many rows are discarded on the way.
papers = pd.read_feather("data/CitedPapers")["PaperId"].unique()
no_paper_data = 0
for part in os.listdir("data/PaperReferences"):
    part_path = f"data/PaperReferences/{part}"
    links = pd.read_feather(part_path)
    kept = links[links["PaperReferenceId"].isin(papers)]
    no_paper_data += len(links) - len(kept)
    kept.reset_index(drop=True).to_feather(part_path)

# Number of reference rows removed by the filter above.
# NOTE(review): the original note attributed these rows to papers for which no
# data was available in the Papers file; the filter itself only tests
# membership in data/CitedPapers — confirm the intended meaning.
print(no_paper_data) # 1259719004

### 3.3.2. Splitting in Fields

In [None]:
! mkdir data/Citations

fields = pd.read_feather("data/Fields")["FieldId"].unique()[:-1]
tmp = pd.read_feather("data/CitedPapers")
cs = tmp[tmp["Condition"] == "CS"]["PaperId"].values
comp = tmp[tmp["Condition"] == "comp"]["PaperId"].values

parts = os.listdir("data/PaperReferences")
for field in fields:
    papers = pd.read_feather("data/PaperFields/"+str(field))["PaperId"].values
    cs_cits, comp_cits = [], []

    for part in parts:
        tmp = pd.read_feather("data/PaperReferences/"+part)
        tmp = tmp[tmp["PaperId"].isin(papers)]
        # CS condition
        csc = tmp[tmp["PaperReferenceId"].isin(cs)][["PaperId"]]
        csc["Citations"] = 1
        cs_cits.append(csc.groupby("PaperId").sum().reset_index()) # Required for memory saving
        # comp condition
        compc = tmp[tmp["PaperReferenceId"].isin(comp)][["PaperId"]]
        compc["Citations"] = 1
        comp_cits.append(compc.groupby("PaperId").sum().reset_index()) # Required for memory saving
        # all condition
        tmp = tmp[["PaperId"]]
        tmp["Citations"] = 1
    
    cs_cits = pd.concat(cs_cits).groupby("PaperId").sum().reset_index()
    cs_cits.to_feather("data/Citations/"+str(field)+"_CS")
    comp_cits = pd.concat(comp_cits).groupby("PaperId").sum().reset_index()
    comp_cits.to_feather("data/Citations/"+str(field)+"_Comp")

In [None]:
# Merge the per-field citation counts of each condition into one "overall"
# table stored as data/Citations/0_CS and data/Citations/0_Comp.
# With low processing power, the following can be performed "in log form" separately (i.e. recursively 2 by 2)
# The field ids are reloaded here so the cell does not depend on a variable
# left behind by an earlier cell.
fields = pd.read_feather("data/Fields")["FieldId"].unique()[:-1]
for condition in ("CS", "Comp"):
    overall = [
        pd.read_feather(f"data/Citations/{field}_{condition}")
        for field in fields
    ]
    # drop=True discards the old row labels instead of writing them to the file
    # as a spurious "index" column.
    pd.concat(overall).drop_duplicates().reset_index(drop=True).to_feather(
        f"data/Citations/0_{condition}")

### 3.3.3. Outliers and Cleaning

In [None]:
# Plot the distribution of reference counts up to 300 and report how many
# papers lie above that cap.
overall = pd.read_feather("data/References/0")
total_no = len(overall)
overall = overall.loc[overall["References"] < 301, "References"].values
out_no = total_no - len(overall)

fig, ax = plt.subplots()
ax.hist(overall, bins=100)
ax.set(xlabel="Number of References", ylabel="Number of Papers\n(10 millions)")
fig.savefig("images/ReferenceNumberDistribution300.png")

# There are some outliers that are
# recorded having more than 300 references:
print(out_no) # 106695
# Remaining:
print(len(overall)) # 82961477

In [None]:
# Plot the distribution of citation counts up to 1000 (both conditions pooled)
# and report how many records lie above that cap.
# pd.concat takes ONE sequence of frames; passing the frames as separate
# positional arguments raises a TypeError.
overall = pd.concat([
    pd.read_feather("data/Citations/0_CS")[["Citations"]],
    pd.read_feather("data/Citations/0_Comp")[["Citations"]]],
    ignore_index=True)
out_no = len(overall)
# Every record is kept (no .unique()): the histogram and the counts printed
# below describe papers, not distinct citation values.
overall = overall.loc[overall["Citations"] < 1001, "Citations"].values
out_no -= len(overall)

fig, ax = plt.subplots()
ax.hist(overall, bins=100)
ax.set_xlabel("Number of Citations")
ax.set_ylabel("Number of Papers\n(10 millions)")
fig.savefig("images/CitationsNumberDistribution1000.png")

# There are some outliers that are 
# recorded being cited more than 1000 times:
print(out_no) # 1805
# Remaining:
print(len(overall)) # 64130026

In [None]:
# Remove the outliers from every stored table: papers with more than 300
# references, and records with more than 1000 citations.
for folder, column, cap in (("References", "References", 301),
                            ("Citations", "Citations", 1001)):
    for part in os.listdir(f"data/{folder}"):
        path = f"data/{folder}/{part}"
        counts = pd.read_feather(path)
        counts[counts[column] < cap].reset_index(drop = True).to_feather(path)

# 4. Time Series

## 4.1. Preparing Dates

In [None]:
! mkdir data/ReferencesDates 

papers = pd.read_feather("data/References/0")["PaperId"].unique()

for part in os.listdir("data/Papers"):
    tmp = pd.read_feather("data/Papers/"+part)
    tmp = tmp[tmp["PaperId"].isin(papers)]
    tmp["Date"] = tmp["Date"].dt.to_period(freq = "M")
    tmp.reset_index(drop = True).to_feather("data/ReferencesDates/"+part)

In [None]:
! mkdir data/CitationsDates 

papers = pd.concat(
    pd.read_feather("data/Citations/0_CS")[["PaperId"]],
    pd.read_feather("data/Citations/0_Comp")[["PaperId"]])["PaperId"].unique()

for part in os.listdir("data/Papers"):
    tmp = pd.read_feather("data/Papers/"+part)
    tmp = tmp[tmp["PaperId"].isin(papers)]
    tmp["Date"] = tmp["Date"].dt.to_period(freq = "M")
    tmp.reset_index(drop = True).to_feather("data/CitationsDates/"+part)

## 4.2. Adding Time Component

In [None]:
dates = []
for part in os.listdir("data/ReferencesDates"):
    dates.append(pd.read_feather("data/ReferencesDates/"+part))
dates = pd.concat(dates)

for part in os.listdir("data/References"):
    tmp = pd.read_feather("data/References/"+part)
    tmp = tmp.merge(dates, on = "PaperId", how = "left")[["Date", "References"]]
    tmp = tmp.dropna()
    tmp = tmp.groupby("Date").sum().reset_index()
    tmp.to_feather("data/References/"+part)

! rm -rf data/ReferencesDates

In [None]:
dates = []
for part in os.listdir("data/CitationsDates"):
    dates.append(pd.read_feather("data/CitationsDates/"+part))
dates = pd.concat(dates)

for part in os.listdir("data/Citations"):
    tmp = pd.read_feather("data/Citations/"+part)
    tmp = tmp.merge(dates, on = "PaperId", how = "left")[["Date", "Citations"]]
    tmp = tmp.dropna()
    tmp = tmp.groupby("Date").sum().reset_index()
    tmp.to_feather("data/Citations/"+part)

! rm -rf data/CitationsDates

## 4.3. Preparing Time Series

### 4.3.1. Restricting Time

In [None]:
# Keep only the months from 1960 through 2020 in every time series.
for folder in ("Citations", "References"):
    for part in os.listdir(f"data/{folder}"):
        path = f"data/{folder}/{part}"
        series = pd.read_feather(path)
        in_window = series["Date"].dt.year.between(1960, 2020)
        series[in_window].reset_index(drop = True).to_feather(path)

### 4.3.2. Fill in Unavailable Time Steps

In [None]:
# Every citation series should have 732 rows (12 months x 61 years);
# print the names of the files that do not, then all the lengths.
lens = []
for part in os.listdir("data/Citations"):
    n_rows = len(pd.read_feather(f"data/Citations/{part}"))
    lens.append(n_rows)
    if n_rows != 732:
        print(part)
print(lens)

In [None]:
# Add a zero-citation row for every month that is missing from a citation
# series. The incomplete files are found by comparing each series with the set
# of months seen across all files, rather than by their position in
# os.listdir(), whose order is arbitrary.
citation_dir = "data/Citations"
series_by_name = {
    name: pd.read_feather(f"{citation_dir}/{name}")
    for name in os.listdir(citation_dir)
}
all_dates = pd.concat(
    [series["Date"] for series in series_by_name.values()]).drop_duplicates()

for name, series in series_by_name.items():
    missing = all_dates[~all_dates.isin(series["Date"])]
    if missing.empty:
        continue  # complete series: leave the file untouched
    filler = pd.DataFrame({"Date": missing}).assign(Citations=0)
    # The new rows are appended at the end; later steps order rows by Date
    pd.concat([series, filler]).reset_index(drop=True).to_feather(
        f"{citation_dir}/{name}")

In [None]:
# Print the number of rows of every reference series
lens = [
    len(pd.read_feather(f"data/References/{part}"))
    for part in os.listdir("data/References")
]
print(lens)

### 4.3.3. Adjust Citations by References

In [None]:
# Add an "AdjCitations" column to every citation series: the monthly number of
# citations divided by the number of references recorded for the same field in
# the same month.
for part in os.listdir("data/References"):
    refs = pd.read_feather(f"data/References/{part}")[["Date", "References"]]
    for condition in ("CS", "Comp"):
        path = f"data/Citations/{part}_{condition}"
        cits = pd.read_feather(path).sort_values(by="Date")
        # Pair the two series on Date. Dividing the columns directly would
        # align them on their row labels instead, which sorting does not
        # change, so months could be paired with the wrong reference totals.
        merged = cits.merge(refs, on="Date", how="left")
        merged["AdjCitations"] = merged["Citations"] / merged["References"]
        merged.drop(columns="References").reset_index(drop=True).to_feather(path)


### 4.3.4. Compress in Single File

In [None]:
# Gather all citation series into one CSV, tagging every row with the name of
# its field and with its condition.
fields = pd.read_feather("data/Fields")[["FieldId", "Field"]]
# Id 0 stands for the series pooled over all fields
fields = pd.concat([fields, pd.DataFrame({"FieldId":[0], "Field":["overall"]})])

citations = []
for part in os.listdir("data/Citations"):
    series = pd.read_feather(f"data/Citations/{part}")
    # File names have the form <FieldId>_<Condition>
    field, condition = part.split("_")
    field_name = fields[fields["FieldId"] == int(field)]["Field"].values[0]
    citations.append(series.assign(Field = field_name, Condition = condition))
pd.concat(citations).to_csv("data/CitationsTS", index=False)

In [None]:
# Gather all reference series into one CSV, tagging every row with the name of
# its field.
fields = pd.read_feather("data/Fields")[["FieldId", "Field"]]
# Id 0 stands for the series pooled over all fields
fields = pd.concat([fields, pd.DataFrame({"FieldId":[0], "Field":["overall"]})])

references = []
for part in os.listdir("data/References"):
    series = pd.read_feather(f"data/References/{part}")
    # File names are the FieldId itself
    field_name = fields[fields["FieldId"] == int(part)]["Field"].values[0]
    references.append(series.assign(Field = field_name))
pd.concat(references).to_csv("data/ReferencesTS", index = False)

# 5. Final Steps

## 5.1. Visual Confirmation

In [None]:
# One figure per condition, showing the adjusted citations of every field
ts = pd.read_csv("data/CitationsTS")

for cond in ts["Condition"].unique():
    fig, ax = plt.subplots(figsize = (21, 7))
    in_condition = ts["Condition"] == cond
    for field in ts["Field"].unique():
        curve = ts.loc[in_condition & (ts["Field"] == field), ["Date", "AdjCitations"]]
        monthly = pd.to_datetime(curve["Date"]).dt.to_period(freq="M")
        curve = curve.assign(Date = monthly).set_index("Date")
        ax.plot(curve.to_timestamp(freq = "M"), label = field)
    ax.legend()
    ax.set_title(cond)
    plt.show()

In [None]:
# One figure per condition, showing the raw citation counts of every field
ts = pd.read_csv("data/CitationsTS")

for cond in ts["Condition"].unique():
    fig, ax = plt.subplots(figsize = (21, 7))
    in_condition = ts["Condition"] == cond
    for field in ts["Field"].unique():
        curve = ts.loc[in_condition & (ts["Field"] == field), ["Date", "Citations"]]
        monthly = pd.to_datetime(curve["Date"]).dt.to_period(freq="M")
        curve = curve.assign(Date = monthly).set_index("Date")
        ax.plot(curve.to_timestamp(freq = "M"), label = field)
    ax.legend()
    ax.set_title(cond)
    plt.show()

In [None]:
# A single figure showing the monthly number of references of every field
ts = pd.read_csv("data/ReferencesTS")

fig, ax = plt.subplots(figsize = (21, 7))
for field in ts["Field"].unique():
    curve = ts.loc[ts["Field"] == field, ["Date", "References"]]
    monthly = pd.to_datetime(curve["Date"]).dt.to_period(freq="M")
    curve = curve.assign(Date = monthly).set_index("Date")
    ax.plot(curve.to_timestamp(freq = "M"), label = field)
ax.legend()
# References have no condition, so the title is fixed rather than taken from a
# `cond` variable left over from an earlier cell (undefined on a fresh kernel).
ax.set_title("References")
plt.show()

## 5.2. Metrics

In [None]:
# Summarize every (Field, Condition) series: mean adjusted citations and total
# citations, over the whole period and from the year 2000 onwards.
ts = pd.read_csv("data/CitationsTS")
ts["Date"] = pd.to_datetime(ts["Date"]).dt.to_period(freq="M")

keys = ["Field", "Condition"]
overall = ts.groupby(keys).agg(
    AdjCitations=("AdjCitations", "mean"),
    Citations=("Citations", "sum"))
recent = ts[ts["Date"].dt.year > 1999].groupby(keys).agg(
    AdjCitations_2000=("AdjCitations", "mean"),
    Citations_2000=("Citations", "sum"))

# Join on the (Field, Condition) keys rather than on row position, so a group
# with no data after 1999 gets missing values instead of shifting the rows of
# the other groups.
metrics = overall.join(recent).reset_index()
metrics.sort_values(by = "Condition").to_csv("data/Metrics", index = False)


## 5.3. Cleanup

In [None]:
# Delete the intermediate directories and files produced above. The outputs
# written directly under data/ (CitationsTS, ReferencesTS, Metrics) are not
# touched by these commands.
! rm -rf data/Citations
! rm -rf data/CitationsDates
! rm -rf data/PaperFields
! rm -rf data/PaperReferences
! rm -rf data/Papers
! rm -rf data/References
! rm data/CitedPapers
! rm data/Fields