In [None]:
# ***************************************************************
# Preprocess OpenCitations (COCI) Dataset
# ***************************************************************

# Jupyter Notebook for the preprocessing of the Opencitations (COCI) dump
#
# In particular, the following operations are going to be executed:
# - (sequential) opening of the dump's CSVs
# - Drop of the useless columns (only 'cited' is going to remain)
# - Gruop by on the 'cited' column and count
# - Union with the dataframe of the CSVs previously elaborated
#
# Lastly, the entire preprocessed dump is going to be saved on disk in CSV format

In [1]:
# Libraries Import
import pandas as pd
import glob

In [2]:
# ******************* PATHS ********************+

# Dumps Directory Path
path_file_import = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/2019-10-21T22_41_20_1-63/'
#path_file_import = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/COCI_test/'

# CSV Exports Directory Path
#path_file_export = r'./Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Repository Github/Repository Mio/terzulli_conftur/Test Pipeline Download e Preprocess/'
path_file_export = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/'

# Combine New Data with a Partial CSV
# This can be really useful in case of limited disk space, allowing us to partially process 
# the dump and free space on disk by deleting the CSVs that have been arleady processed
# Note: the delete operations need to be made manually
# Note: the partial CSV needs to be in the same format of the one generated with this script
combine_with_partial_csv = False
partial_csv_path = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/out_coci_partial.csv'

In [8]:
# Get All Files' Names
coci_all_csvs = glob.glob(path_file_import + "*.csv")

In [9]:
df_coci_processed = pd.DataFrame(columns=['article', 'citations_count'])

# Combine new data with a partial CSV
if combine_with_partial_csv:
    df_coci_processed = pd.read_csv(partial_csv_path, low_memory=False)

# Read, process and concat all CSVs
for current_csv_name in coci_all_csvs:

    # Open the current CSV
    print(f'Currently processing {current_csv_name}')
    df_coci_current_csv = pd.read_csv(current_csv_name, low_memory=False)

    # Drop of the useless columns: 'oci', 'citing', 'creation', 'timespan', 'journal_sc', 'author_sc'
    df_coci_current_csv.drop(columns=['oci', 'citing', 'creation', 'timespan', 'journal_sc', 'author_sc'])

    # Group by cited article and count
    sf_coci_current_grouped = df_coci_current_csv.groupby(['cited'])['cited'].count()

    # Since the returned object is a Pandas Series type, we need to convert it to a Pandas dataframe
    df_coci_current_csv = pd.DataFrame({'article':sf_coci_current_grouped.index, 'citations_count':sf_coci_current_grouped.values})

    ### Concat with the data previously elaborated
    df_coci_processed = pd.concat([df_coci_processed, df_coci_current_csv])

    # Now we need to do a new group by and sum the citations_count to reduce the data
    sf_coci_processed_grouped = df_coci_processed.groupby(['article'])['citations_count'].sum()
    df_coci_processed = pd.DataFrame({'article':sf_coci_processed_grouped.index, 'citations_count':sf_coci_processed_grouped.values})

# Export of the final dataframe
df_coci_processed.to_csv(path_file_export + 'out_coci_citations_count.csv')
print(f'Successfully Exported the Preprocessed CSV to {path_file_export}out_coci_citations_count.csv')


Currently processing /Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/COCI_test/2019-10-21T22:41:20_2.csv
Currently processing /Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/COCI_test/2019-10-21T22:41:20_1.csv
Successfully Exported the Preprocessed CSV to /Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/out_coci_citations_count.csv


In [10]:
# Check of the Exported CSV
df_coci_exported_csv = pd.read_csv(path_file_export + 'out_coci_citations_count.csv', low_memory=False)
df_coci_exported_csv

Unnamed: 0.1,Unnamed: 0,article,citations_count
0,0,10.1001/.391,1
1,1,10.1001/.405,1
2,2,10.1001/.418,2
3,3,10.1001/.444,1
4,4,10.1001/.458,4
...,...,...,...
8831171,8831171,10.9799/ksfan.2017.30.1.074,1
8831172,8831172,10.9799/ksfan.2017.30.1.083,1
8831173,8831173,10.9799/ksfan.2017.30.1.129,2
8831174,8831174,10.9799/ksfan.2017.30.2.203,5


In [11]:
# Order by citations count descending to see the articles with the most citations
df_coci_exported_csv = df_coci_exported_csv.sort_values(by='citations_count', ascending=False)
df_coci_exported_csv

Unnamed: 0.1,Unnamed: 0,article,citations_count
6447698,6447698,10.1136/bmj.327.7414.557,3923
6444696,6444696,10.1136/bmj.315.7109.629,2946
152022,152022,10.1002/9780470712184,2578
50617,50617,10.1001/jama.1995.03520290060030,1900
7870081,7870081,10.2307/249008,1880
...,...,...,...
3624131,3624131,10.1021/ja3050713,1
3624130,3624130,10.1021/ja3050579,1
3624129,3624129,10.1021/ja305051u,1
3624128,3624128,10.1021/ja305046u,1


In [12]:
### TEST
# Ricerca di una specifica occorrenza
df_coci_exported_csv.loc[df_coci_exported_csv['article'] == '10.1038/227680a0']

Unnamed: 0.1,Unnamed: 0,article,citations_count
3912038,3912038,10.1038/227680a0,936
