In [None]:
# ***************************************************************
# Preprocess OpenCitations (COCI) Dataset
# ***************************************************************

# Jupyter Notebook for the preprocessing of the Opencitations (COCI) dump
#
# In particular, the followint operations are going to be executed:
# - (sequential) opening of the dump's CSVs
# - Drop of the useless columns (only 'cited' is going to remain)
# - Gruop by on the 'cited' columns and count
# - Unione with the dataframe of the CSVs previously elaborated
#
# Lastly, the entire preprocessed dump is going to be saved on disk in CSV format

In [1]:
# Libraries Import
import pandas as pd
import glob

In [2]:
# Dumps Directory Path
#path_file_import = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/'
path_file_import = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/2021-11-15T031921_0-4_1/'

# CSV Exports Directory Path
#path_file_export = r'./Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Repository Github/Repository Mio/terzulli_conftur/Test Pipeline Download e Preprocess/'
path_file_export = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/'

In [3]:
# Get All Files' Names
coci_all_csvs = glob.glob(path_file_import + "*.csv")

In [4]:
import time

In [5]:
df_coci_processed = pd.DataFrame(columns=['article', 'citations_count'])

# Read, process and concat all CSVs
for current_csv_name in coci_all_csvs:

    start_time = time.time()

    # Open the current CSV
    print(f'Currently processing {current_csv_name}')
    df_coci_current_csv = pd.read_csv(current_csv_name, low_memory=False)
    print("--- Import time %.2f seconds ---" % (time.time() - start_time))

    # Drop of the useless columns: 'oci', 'citing', 'creation', 'timespan', 'journal_sc', 'author_sc'
    df_coci_current_csv.drop(columns=['oci', 'citing', 'creation', 'timespan', 'journal_sc', 'author_sc'])
    print("--- Drop time %.2f seconds ---" % (time.time() - start_time))

    # Group by cited article and count
    sf_coci_current_grouped = df_coci_current_csv.groupby(['cited'])['cited'].count()
    print("--- Group and sort time %.2f seconds ---" % (time.time() - start_time))

    # Since the returned object is a Pandas Series type, we need to convert it to a Pandas dataframe
    df_coci_current_csv = pd.DataFrame({'article':sf_coci_current_grouped.index, 'citations_count':sf_coci_current_grouped.values})
    print("--- Df convert time %.2f seconds ---" % (time.time() - start_time))

    ### Concat with the data previously elaborated
    df_coci_processed = pd.concat([df_coci_processed, df_coci_current_csv])
    print("--- DF Concat time %.2f seconds ---" % (time.time() - start_time))

    # Now we need to do a new group by and count to reduce the data
    sf_coci_processed_grouped = df_coci_processed.groupby(['article'])['article'].count()
    print("--- Concat group time %.2f seconds ---" % (time.time() - start_time))
    df_coci_processed = pd.DataFrame({'article':sf_coci_processed_grouped.index, 'citations_count':sf_coci_processed_grouped.values})
    print("--- Concat convert time %.2f seconds ---" % (time.time() - start_time))

# Export of the final dataframe
df_coci_processed.to_csv(path_file_export + 'out_coci_citations_count.csv')


Currently processing /Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/2021-11-15T031921_0-4_1/2021-11-15T031921_3_1.csv
--- Import time 15.97 seconds ---
--- Drop time 16.04 seconds ---
--- Group and sort time 25.10 seconds ---
--- Df convert time 25.42 seconds ---
--- DF Concat time 25.49 seconds ---
--- Concat group time 31.64 seconds ---
--- Concat convert time 31.74 seconds ---
Currently processing /Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/2021-11-15T031921_0-4_1/2021-11-15T031921_1_1.csv
--- Import time 15.72 seconds ---
--- Drop time 15.80 seconds ---
--- Group and sort time 25.25 seconds ---
--- Df convert time 25.65 seconds ---
--- DF Concat time 25.79 seconds ---
--- Concat group time 41.46 seconds ---
--- Concat convert time 41.62 seconds ---
Currently processing /Users/marcoterzulli/File/Scuola Local/Magistrale/Mater

In [5]:
# Check of the Exported CSV
df_coci_exported_csv = pd.read_csv(path_file_export + 'out_coci_citations_count.csv', low_memory=False)
df_coci_exported_csv

Unnamed: 0.1,Unnamed: 0,article,citations_count
0,0,10.1001/.391,1
1,1,10.1001/.411,1
2,2,10.1001/.418,1
3,3,10.1001/.424,1
4,4,10.1001/.431,1
...,...,...,...
14539724,14539724,10.9799/ksfan.2016.29.5.643,1
14539725,14539725,10.9799/ksfan.2016.29.5.735,1
14539726,14539726,10.9799/ksfan.2016.29.6.911,2
14539727,14539727,10.9799/ksfan.2017.30.2.218,1


In [7]:
### TEST
# Ordinamento decrescente per vedere quanto è il massimo numero di citazioni trovato
df_coci_exported_csv = df_coci_exported_csv.sort_values(by='citations_count', ascending=False)
df_coci_exported_csv

Unnamed: 0.1,Unnamed: 0,article,citations_count
7269864,7269864,10.1038/35041515,2
6865347,6865347,10.1021/jp035643q,2
6865335,6865335,10.1021/jp035624g,2
6865358,6865358,10.1021/jp035657w,2
6865357,6865357,10.1021/jp0356564,2
...,...,...,...
13285982,13285982,10.21775/cimb.031.001,1
13285981,13285981,10.21775/cimb.030.089,1
13285980,13285980,10.21775/cimb.030.075,1
13286008,13286008,10.21775/cimb.042.191,1
