In [None]:
# ***************************************************************
# Preprocess Microsoft Academic Graph (MAG) Dataset
# ***************************************************************

# Jupyter Notebook for the preprocessing of the Microsoft Academic Graph (MAG) dump
#
# TODO **********
# In particular, the following operations are going to be executed:
# - Opening of ConferenceInstances and ConferenceSeries CSVs
# - Drop of the useless columns 
# - Merge of the two CSVs on the ConferenceSeriesID column
# - Chunked Processing of the Papers CSV
# ---- Drop of the useless columns
# ---- Drop of papers from journals and books rows
# - Merge with the processed conferences data
#
# Lastly, the entire preprocessed dump is going to be saved on disk in CSV format

In [1]:
# Libraries Import
import glob

import pandas as pd

# Show every column when a dataframe is displayed (no column truncation)
pd.options.display.max_columns = None

In [2]:
# ******************* PATHS ********************

# Root of the working archives: every path below is built from this folder
_path_work_root = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/'

# Directory that contains the MAG dump files
path_file_import = _path_work_root + 'MAG/'

# Directory that receives the exported CSV files
path_file_export = _path_work_root + 'Export/MAG_Chunks/'

# Reuse of a previously preprocessed Papers CSV
#
# When the flag is True, the Papers data is loaded from the CSV below instead
# of being rebuilt from the raw dump, which saves the time of repeating the
# same operations.
#
# Note: the CSV must have the same format as the one generated by this notebook
read_preprocessed_papers = True
preprocessed_papers_csv_path = _path_work_root + 'Export/out_mag_papers.csv'


In [None]:
# ******************* CONFERENCE INSTANCES ********************

# Load the Conference Instances dump (tab-separated; column names are supplied explicitly)

# Column names, in file order, following the official MAG schema documentation
df_mag_conf_instances_col_names = [
    'ConferenceInstanceID', 'NormalizedName', 'DisplayName', 'ConferenceSeriesID',
    'Location', 'OfficialUrl', 'StartDate', 'EndDate',
    'AbstractRegistrationDate', 'SubmissionDeadlineDate', 'NotificationDueDate', 'FinalVersionDueDate',
    'PageCount', 'PaperFamilyCount', 'CitationCount',
    'Latitude', 'Longitude', 'CreatedDate',
]

df_mag_conf_instances = pd.read_csv(
    path_file_import + 'ConferenceInstances.txt',
    names=df_mag_conf_instances_col_names,
    sep='\t',
)
df_mag_conf_instances

In [None]:
# Remove the Conference Instances columns that are not needed downstream
df_mag_conf_instances = df_mag_conf_instances.drop(
    labels=[
        'OfficialUrl',
        'AbstractRegistrationDate', 'SubmissionDeadlineDate', 'NotificationDueDate', 'FinalVersionDueDate',
        'PageCount', 'PaperFamilyCount', 'CitationCount',
        'Latitude', 'Longitude',
        'CreatedDate', 'StartDate', 'EndDate',
    ],
    axis='columns',
)
df_mag_conf_instances

In [None]:
# Prefix the generic column names with "Conference" so that they stay
# unambiguous once this dataframe is joined with other ones
df_mag_conf_instances = df_mag_conf_instances.rename(
    columns={
        'NormalizedName': 'ConferenceNormalizedName',
        'DisplayName': 'ConferenceDisplayName',
        'Location': 'ConferenceLocation',
    }
)
df_mag_conf_instances

In [None]:
# ******************* CONFERENCE SERIES ********************

# Load the Conference Series dump (tab-separated; column names are supplied explicitly)

# Column names, in file order, following the official MAG schema documentation
df_mag_conf_series_col_names = [
    'ConferenceSeriesID',
    'Rank',
    'NormalizedName',
    'DisplayName',
    'PaperCount',
    'PaperFamilyCount',
    'CitationCount',
    'CreatedDate',
]

df_mag_conf_series = pd.read_csv(
    path_file_import + 'ConferenceSeries.txt',
    names=df_mag_conf_series_col_names,
    sep='\t',
)
df_mag_conf_series

In [None]:
# Remove the Conference Series columns that are not needed downstream
df_mag_conf_series = df_mag_conf_series.drop(
    labels=['Rank', 'PaperCount', 'PaperFamilyCount', 'CitationCount', 'CreatedDate'],
    axis='columns',
)
df_mag_conf_series

In [None]:
# Prefix the generic column names with "ConferenceSeries" so that they stay
# unambiguous once this dataframe is joined with other ones
df_mag_conf_series = df_mag_conf_series.rename(
    columns={
        'NormalizedName': 'ConferenceSeriesNormalizedName',
        'DisplayName': 'ConferenceSeriesDisplayName',
    }
)
df_mag_conf_series

In [None]:
# ******************* MERGE OF CONFERENCE DATA ********************

# Inner join of the conference instances with their conference series,
# matching the rows on the shared ConferenceSeriesID column
df_mag_conf_merged = df_mag_conf_instances.merge(
    right=df_mag_conf_series,
    how='inner',
    on='ConferenceSeriesID',
)
df_mag_conf_merged

In [5]:
# ******************* PAPERS ********************

# Build df_mag_papers in one of two ways, depending on read_preprocessed_papers:
# - True:  load a previously preprocessed CSV from preprocessed_papers_csv_path
# - False: read the raw Papers dump in chunks, keep only conference papers that
#          have a DOI, and write the result to disk as out_mag_papers.csv
df_mag_papers = None
if read_preprocessed_papers:
    # The first CSV column is used as the dataframe index
    df_mag_papers = pd.read_csv(preprocessed_papers_csv_path, low_memory=False, index_col=0)
else:
    # The Papers CSV is going to be processed in chunks, due to its size

    # The column names follow the MAG' scheme official documentation
    df_mag_papers_col_names = ['PaperID', 'Rank', 'Doi', 'DocType', 'PaperTitle', 'OriginalTitle', 'BookTitle', 'Year', 'Date', 'OnlineDate', 'Publisher', 'JournalID', 'ConferenceSeriesID', 'ConferenceInstanceID', 'Volume', 'Issue', 'FirstPage', 'LastPage', 'ReferenceCount', 'CitationCount', 'EstimatedCitation', 'OriginalVenue', 'FamilyID', 'FamilyRank', 'Retracion', 'CreatedDate']

    # List of processed chunks.
    df_mag_papers_list_of_chunks = list()

    # Number of rows read from the file at each iteration
    chunksize = 10 ** 7

    count = 1
    with pd.read_csv(path_file_import + 'Papers.txt', sep='\t', chunksize=chunksize, low_memory=False, on_bad_lines='skip', names=df_mag_papers_col_names) as reader:
        # Every chunk of the file is processed: the loop must not stop early,
        # otherwise only the first `chunksize` rows would end up in the export
        for chunk in reader:

            # Drop of the useless columns
            chunk = chunk.drop(columns=['Rank', 'OnlineDate', 'Publisher', 'Volume', 'Issue', 'FirstPage', 'LastPage', 'ReferenceCount', 'OriginalVenue', 'FamilyID', 'FamilyRank', 'Retracion', 'CreatedDate', 'JournalID', 'BookTitle', 'Date'])

            # Filtering of papers without DOI
            chunk = chunk.dropna(subset = ['Doi'])

            # Filtering papers that are not related to conferences
            chunk = chunk[chunk.DocType == 'Conference']

            # Drop of the doctype column (it is constant after the filter above)
            chunk = chunk.drop(columns=['DocType'])

            # Insert of the resulting chunk in the list
            df_mag_papers_list_of_chunks.append(chunk)

            print(f'Successfully processed chunk {count} out of around {260000000 / chunksize}')
            count += 1

    # Concatenation of the processed chunks
    df_mag_papers = pd.concat(df_mag_papers_list_of_chunks)

    # Empty the list to free some memory
    df_mag_papers_list_of_chunks = list()

    # Write of the resulting CSV on Disk
    # NOTE(review): the file is written under path_file_export, while the branch
    # above reads preprocessed_papers_csv_path; as configured in the PATHS cell
    # these point to different folders - confirm the file is meant to be moved
    df_mag_papers.to_csv(path_file_export + 'out_mag_papers.csv')

In [6]:
# Show the papers dataframe as the output of this cell
df_mag_papers

Unnamed: 0,PaperID,Doi,PaperTitle,OriginalTitle,Year,ConferenceSeriesID,ConferenceInstanceID,CitationCount,EstimatedCitation
37,14558443,10.1007/978-3-662-45174-8_28,the adaptive priority queue with elimination a...,The Adaptive Priority Queue with Elimination a...,2014.0,1.131603e+09,4038532.0,12.0,12
39,15354235,10.1007/978-3-662-44777-2_60,document retrieval on repetitive collections,Document Retrieval on Repetitive Collections,2014.0,1.154039e+09,157008481.0,10.0,10
68,24327294,10.1007/978-3-319-03973-2_13,socomo marketing for travel and tourism,SoCoMo Marketing for Travel and Tourism,2013.0,1.196984e+09,,20.0,20
197,60437532,10.1007/3-540-46146-9_77,similarity image retrieval system using hierar...,Similarity Image Retrieval System Using Hierar...,2002.0,1.192665e+09,,0.0,0
666,198056957,10.1007/11785231_94,leukemia prediction from gene expression data ...,Leukemia prediction from gene expression data—...,2006.0,1.176896e+09,,19.0,19
...,...,...,...,...,...,...,...,...,...
259718386,3102242761,10.1109/IECON43393.2020.9254316,loss reduction by synchronous rectification in...,Loss Reduction by Synchronous Rectification in...,2020.0,2.623572e+09,,0.0,0
259718500,3136855299,10.1109/BMSB49480.2020.9379806,data over cable services improving the bicm ca...,Data Over Cable Services – Improving the BICM ...,2020.0,2.623662e+09,,0.0,0
259718537,3145351916,10.1109/ACC.1988.4172843,model reference robust adaptive control withou...,Model Reference Robust Adaptive Control withou...,1988.0,2.238538e+09,,0.0,0
259718570,3151696876,10.1109/ICASSP.2002.1005676,missing data speech recognition in reverberant...,Missing data speech recognition in reverberant...,2002.0,1.121228e+09,,0.0,0


In [9]:

# Keep only the papers that are linked to a conference instance:
# rows whose ConferenceInstanceID is missing are discarded.
# The result is stored under a new name, df_mag_papers itself is left untouched
df_mag_papers_cp = df_mag_papers.dropna(subset=['ConferenceInstanceID'])
df_mag_papers_cp

Unnamed: 0,PaperID,Doi,PaperTitle,OriginalTitle,Year,ConferenceSeriesID,ConferenceInstanceID,CitationCount,EstimatedCitation
37,14558443,10.1007/978-3-662-45174-8_28,the adaptive priority queue with elimination a...,The Adaptive Priority Queue with Elimination a...,2014,1.131603e+09,4.038532e+06,12.0,12
39,15354235,10.1007/978-3-662-44777-2_60,document retrieval on repetitive collections,Document Retrieval on Repetitive Collections,2014,1.154039e+09,1.570085e+08,10.0,10
2181,1459710595,10.1007/978-3-319-08958-4_17,enhancing labeled data using unlabeled data fo...,Enhancing Labeled Data Using Unlabeled Data fo...,2011,2.760407e+09,2.019264e+08,0.0,0
2192,2161259116,10.1109/CVPR.2013.65,improved image set classification via joint sp...,Improved Image Set Classification via Joint Sp...,2013,1.158168e+09,1.383691e+08,75.0,95
2523,1488332647,10.1109/ISBMSB.2008.4536671,a novel channel estimation based on spread pil...,A novel channel estimation based on spread pil...,2008,2.623662e+09,2.626942e+09,4.0,4
...,...,...,...,...,...,...,...,...,...
259717113,2789624042,10.1109/ISSCC.2018.8310368,a 95 2 efficiency dual path dc dc step up conv...,A 95.2% efficiency dual-path DC-DC step-up con...,2018,1.183230e+09,2.788559e+09,8.0,8
259717694,2893506456,10.1007/978-3-030-01418-6_54,learning preferences for large scale multi lab...,Learning Preferences for Large Scale Multi-lab...,2018,1.158833e+09,2.892231e+09,1.0,1
259717766,2914533952,10.1145/3308558.3313726,city wide signal strength maps prediction with...,City-Wide Signal Strength Maps: Prediction wit...,2019,1.135342e+09,2.890478e+09,8.0,8
259717787,2921629601,10.1109/ICOSC.2019.8665672,constructing and maintaining corpus driven ann...,Constructing and Maintaining Corpus-Driven Ann...,2019,2.898614e+09,2.889800e+09,1.0,1


In [None]:
# ******************* PAPERS ********************

# The Papers CSV is going to be processed in chunks, due to its size.
# Note: this cell reads Papers.txt again and rebinds df_mag_papers, replacing
# whatever dataframe was previously stored under that name.

# The column names follow the MAG' scheme official documentation.
# The list must name every column of the file: when fewer names than columns
# are given, pandas uses the leading column(s) as index and every label ends
# up shifted. 'Retracion' is therefore listed here exactly as in the other
# Papers cell of this notebook.
df_mag_papers_col_names = ['PaperID', 'Rank', 'Doi', 'DocType', 'PaperTitle', 'OriginalTitle', 'BookTitle', 'Year', 'Date', 'OnlineDate', 'Publisher', 'JournalID', 'ConferenceSeriesID', 'ConferenceInstanceID', 'Volume', 'Issue', 'FirstPage', 'LastPage', 'ReferenceCount', 'CitationCount', 'EstimatedCitation', 'OriginalVenue', 'FamilyID', 'FamilyRank', 'Retracion', 'CreatedDate']

# List of processed chunks.
df_mag_papers_list_of_chunks = list()

# Number of rows read from the file at each iteration
chunksize = 10 ** 7

count = 1
with pd.read_csv(path_file_import + 'Papers.txt', sep='\t', names=df_mag_papers_col_names, chunksize=chunksize, low_memory=False, on_bad_lines='skip') as reader:
    for chunk in reader:
        # The expected total is derived from the chunk size, using the same
        # approximate row count (260000000) as the other Papers cell
        print(f'Currently processing chunk {count} out of around {260000000 // chunksize}')
        count += 1

        # Drop of the useless columns
        chunk = chunk.drop(columns=['Rank', 'OnlineDate', 'Publisher', 'Volume', 'Issue', 'FirstPage', 'LastPage', 'ReferenceCount', 'OriginalVenue', 'FamilyID', 'FamilyRank', 'Retracion', 'CreatedDate'])

        # Filtering of books and Journals papers
        # TODO

        # Insert of the resulting chunk in the list
        df_mag_papers_list_of_chunks.append(chunk)

# Concatenation of the processed chunks
df_mag_papers = pd.concat(df_mag_papers_list_of_chunks)

# Empty the list to free some memory
df_mag_papers_list_of_chunks = list()

In [None]:
# Show the papers dataframe as the output of this cell
df_mag_papers

In [None]:
# Save the partial papers dataframe in the export directory,
# then report the location of the written file
df_mag_papers.to_csv(path_or_buf=path_file_export + 'out_mag_citations_partial.csv')
print(f'Successfully Exported the Partial Preprocessed CSV to {path_file_export}out_mag_citations_partial.csv')


In [None]:
# Collect the paths of all the CSV files found in the import directory.
# glob returns the matches in arbitrary order, so they are sorted to make the
# processing order (and the progress log) identical from run to run.
# NOTE(review): path_file_import is the MAG dump directory in the PATHS cell -
# confirm it is the folder that actually holds these CSV files
coci_all_csvs = sorted(glob.glob(path_file_import + "*.csv"))

In [None]:
# Running result: one row per cited article with its total number of citations
df_coci_processed = pd.DataFrame(columns=['article', 'citations_count'])

# Combine new data with a partial CSV
# NOTE(review): combine_with_partial_csv and partial_csv_path are not assigned
# in any visible cell of this notebook - confirm they are defined before running
if combine_with_partial_csv:
    df_coci_processed = pd.read_csv(partial_csv_path, low_memory=False)
    print(f'Successfully Imported the Partial CSV')

# Read, process and concat all CSVs
count = 0
for current_csv_name in coci_all_csvs:

    # Open the current CSV
    print(f'Currently processing CSV {count}: {current_csv_name}')
    count += 1
    df_coci_current_csv = pd.read_csv(current_csv_name, low_memory=False)

    # Drop of the useless columns: 'oci', 'citing', 'creation', 'timespan', 'journal_sc', 'author_sc'
    # drop() returns a new dataframe, so the result has to be assigned back:
    # without the assignment the columns would silently stay in place
    df_coci_current_csv = df_coci_current_csv.drop(columns=['oci', 'citing', 'creation', 'timespan', 'journal_sc', 'author_sc'])

    # Group by cited article and count
    sf_coci_current_grouped = df_coci_current_csv.groupby(['cited'])['cited'].count()

    # Since the returned object is a Pandas Series type, we need to convert it to a Pandas dataframe
    df_coci_current_csv = pd.DataFrame({'article':sf_coci_current_grouped.index, 'citations_count':sf_coci_current_grouped.values})

    ### Concat with the data previously elaborated
    df_coci_processed = pd.concat([df_coci_processed, df_coci_current_csv])

    # Now we need to do a new group by and sum the citations_count to reduce the data:
    # an article cited in several files is collapsed into a single row
    sf_coci_processed_grouped = df_coci_processed.groupby(['article'])['citations_count'].sum()
    df_coci_processed = pd.DataFrame({'article':sf_coci_processed_grouped.index, 'citations_count':sf_coci_processed_grouped.values})

# Export of the final dataframe
df_coci_processed.to_csv(path_file_export + 'out_coci_citations_count.csv')
print(f'Successfully Exported the Preprocessed CSV to {path_file_export}out_coci_citations_count.csv')


In [None]:
# Reload the CSV that was just exported, to inspect what was written on disk
df_coci_exported_csv = pd.read_csv(
    filepath_or_buffer=path_file_export + 'out_coci_citations_count.csv',
    low_memory=False,
)
df_coci_exported_csv

In [None]:
# Most cited articles first: sort the rows by decreasing citations_count
df_coci_exported_csv = df_coci_exported_csv.sort_values('citations_count', ascending=False)
df_coci_exported_csv

In [None]:
# Total number of citations recorded in the exported CSV
# (sum of the citations_count column over all the articles)
df_coci_exported_csv.citations_count.sum()