In [None]:
# ***************************************************************
# Preprocess OpenCitations (COCI) Dataset
# ***************************************************************

# Jupyter Notebook for the preprocessing of the Opencitations (COCI) dump
#
# In particular, the following operations are going to be executed:
# - (sequential) opening of the dump's CSVs
# - Drop of the useless columns (only 'cited' is going to remain)
# - Gruop by on the 'cited' column and count
# - Union with the dataframe of the CSVs previously elaborated
#
# Lastly, the entire preprocessed dump is going to be saved on disk in CSV format

In [1]:
# Libraries Import
import pandas as pd
import glob

In [2]:
# ******************* PATHS ********************+

# Dumps Directory Path
#path_file_import = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/2019-10-21T22_41_20_1-63/'
path_file_import = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/COCI_test/'

# CSV Exports Directory Path
#path_file_export = r'./Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Repository Github/Repository Mio/terzulli_conftur/Test Pipeline Download e Preprocess/'
path_file_export = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/'

# Combine New Data with a Partial CSV
# This can be really useful in case of limited disk space, allowing us to partially process 
# the dump and free space on disk by deleting the CSVs that have been arleady processed
# Note: the delete operations need to be made manually
# Note: the partial CSV needs to be in the same format of the one generated with this script
combine_with_partial_csv = False
partial_csv_path = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/out_coci_partial.csv'

In [3]:
# Get All Files' Names
coci_all_csvs = glob.glob(path_file_import + "*.csv")

In [4]:
df_coci_processed = pd.DataFrame(columns=['article', 'citations_count'])

# Combine new data with a partial CSV
if combine_with_partial_csv:
    df_coci_processed = pd.read_csv(partial_csv_path, low_memory=False)

# Read, process and concat all CSVs
for current_csv_name in coci_all_csvs:

    # Open the current CSV
    print(f'Currently processing {current_csv_name}')
    df_coci_current_csv = pd.read_csv(current_csv_name, low_memory=False)

    # Drop of the useless columns: 'oci', 'citing', 'creation', 'timespan', 'journal_sc', 'author_sc'
    df_coci_current_csv.drop(columns=['oci', 'citing', 'creation', 'timespan', 'journal_sc', 'author_sc'])

    # Group by cited article and count
    sf_coci_current_grouped = df_coci_current_csv.groupby(['cited'])['cited'].count()

    # Since the returned object is a Pandas Series type, we need to convert it to a Pandas dataframe
    df_coci_current_csv = pd.DataFrame({'article':sf_coci_current_grouped.index, 'citations_count':sf_coci_current_grouped.values})

    ### Concat with the data previously elaborated
    df_coci_processed = pd.concat([df_coci_processed, df_coci_current_csv])

    # Now we need to do a new group by and sum the citations_count to reduce the data
    sf_coci_processed_grouped = df_coci_processed.groupby(['article']).sum()
    df_coci_processed = pd.DataFrame({'article':sf_coci_processed_grouped.index, 'citations_count':sf_coci_processed_grouped.values})

# Export of the final dataframe
df_coci_processed.to_csv(path_file_export + 'out_coci_citations_count.csv')
print(f'Successfully Exported the Preprocessed CSV to {path_file_export}out_coci_citations_count.csv')


Currently processing /Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/COCI_test/2019-10-21T22:41:20_1.csv
Successfully Exported the Preprocessed CSV to /Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/out_coci_citations_count.csv


In [5]:
### TEST
# Ordinamento decrescente per vedere quanto è il massimo numero di citazioni trovato
df_coci_current_csv = df_coci_current_csv.sort_values(by='citations_count', ascending=False)
df_coci_current_csv

Unnamed: 0,article,citations_count
4494252,10.2307/249008,1708
1946607,10.1017/cbo9780511815355,1185
4504345,10.2307/30036540,1128
1651683,10.1016/s0019-9958(65)90241-x,1092
923319,10.1016/0749-5978(91)90020-t,940
...,...,...
1985706,10.1021/ac2001725,1
1985707,10.1021/ac200190b,1
1985708,10.1021/ac200194d,1
1985709,10.1021/ac2002092,1


In [5]:
# Check of the Exported CSV
df_coci_exported_csv = pd.read_csv(path_file_export + 'out_coci_citations_count.csv', low_memory=False)
df_coci_exported_csv

Unnamed: 0.1,Unnamed: 0,article,citations_count
0,0,10.10.18045/zbefri.2015.2.207,1
1,1,10.1000/182,1
2,2,10.1001,2
3,3,10.1001/.391,1
4,4,10.1001/.399,1
...,...,...,...
43884589,43884589,10.9799/ksfan.2017.30.2.251,1
43884590,43884590,10.9799/ksfan.2017.30.2.282,1
43884591,43884591,10.9799/ksfan.2017.30.2.297,1
43884592,43884592,10.9799/ksfan.2017.30.2.305,1


Successfully Exported the Preprocessed CSV to /Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/out_coci_citations_count.csv


In [11]:
### TEST
# Ordinamento decrescente per vedere quanto è il massimo numero di citazioni trovato
df_coci_exported_csv = df_coci_exported_csv.sort_values(by='citations_count', ascending=False)
df_coci_exported_csv

Unnamed: 0.1,Unnamed: 0,article,citations_count
23350753,23350753,10.1080/13803395.2011.645017,2
10012597,10012597,10.1016/j.apsusc.2013.12.177,2
10012598,10012598,10.1016/j.apsusc.2013.12.178,2
19482204,19482204,10.1039/a908671d,2
4638666,4638666,10.1007/bf01564257,2
...,...,...,...
15318629,15318629,10.1016/s0092-1157(87)80003-3,1
15318630,15318630,10.1016/s0092-1157(87)80004-5,1
15318631,15318631,10.1016/s0092-1157(87)80006-9,1
15318632,15318632,10.1016/s0092-1157(87)80007-0,1


In [14]:
df_coci_exported_csv.loc[df_coci_exported_csv['article'] == '10.1038/227680a0']

Unnamed: 0.1,Unnamed: 0,article,citations_count
18918638,18918638,10.1038/227680a0,2
