In [1]:
# ***************************************************************
# Preprocess Microsoft Academic Graph (MAG) Dataset
# ***************************************************************

# Jupyter Notebook for the preprocessing of the Microsoft Academic Graph (MAG) dump
#
# TODO **********
# In particular, the following operations are going to be executed:
# - (sequential) opening of the dump's CSVs
# - Drop of the useless columns (only 'cited' is going to remain)
# - Group by the 'cited' column and count
# - Union with the dataframe of the CSVs previously elaborated
#
# Lastly, the entire preprocessed dump is going to be saved on disk in CSV format

In [2]:
# Libraries Import
import pandas as pd
import glob

In [3]:
# ******************* PATHS ********************

# Dumps Directory Path
# Input directory. File names are appended to it with `+`, so it must end
# with a path separator.
path_file_import = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/MAG/'

# CSV Exports Directory Path
# Output directory. Like the import path, it is concatenated with the output
# file name and therefore must end with a path separator.
path_file_export = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/'

# Combine New Data with a "Partial" CSV
#
# This is useful when disk space is limited: the dump can be processed in
# several passes (each one on a subset of the CSVs), and the CSVs that have
# already been processed can be deleted to free space between passes.
#
# Note: the processed CSVs must be deleted manually
# Note: the partial CSV must have the same format as the one generated by this script
#
# When True, the processing cell loads the file at `partial_csv_path` before
# reading the new CSVs; when False, `partial_csv_path` is not read.
combine_with_partial_csv = False
partial_csv_path = r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/out_coci_partial.csv'

In [17]:
# ******************* CONFERENCE INSTANCES ********************
# Load the Conference Instances file (tab-separated)

# Column names are supplied explicitly, so the first line of the file is read
# as data. The names follow the official MAG schema documentation.
df_mag_conf_instances_col_names = [
    'ConferenceInstanceID',
    'NormalizedName',
    'DisplayName',
    'ConferenceSeriesId',
    'Location',
    'OfficialUrl',
    'StartDate',
    'EndDate',
    'AbstractRegistrationDate',
    'SubmissionDeadlineDate',
    'NotificationDueDate',
    'FinalVersionDueDate',
    'PageCount',
    'PaperFamilyCount',
    'CitationCount',
    'Latitude',
    'Longitude',
    'CreatedDate',
]

df_mag_conf_instances = pd.read_csv(
    path_file_import + 'ConferenceInstances.txt',
    sep='\t',
    header=None,
    names=df_mag_conf_instances_col_names,
)
df_mag_conf_instances

Unnamed: 0,ConferenceInstanceID,NormalizedName,DisplayName,ConferenceSeriesId,Location,OfficialUrl,StartDate,EndDate,AbstractRegistrationDate,SubmissionDeadlineDate,NotificationDueDate,FinalVersionDueDate,PageCount,PaperFamilyCount,CitationCount,Latitude,Longitude,CreatedDate
0,7785157,time 2008,TIME 2008,2624631009,"Montreal, Canada",http://www.time2008.org/,2008-06-16,2008-06-18,,2008-01-11,,,23,23,319,45.512400,-73.554680,2016-06-24
1,15420687,ipmu 2008,IPMU 2008,1128239323,"Malaga, Spain",http://www.gimac.uma.es/ipmu08,2008-06-22,2008-06-27,,2007-12-07,,,5,5,45,36.718320,-4.420160,2016-06-24
2,16798864,wosn 2010,WOSN 2010,2756885533,"Boston, MA, USA",http://www.usenix.org/events/wosn10/cfp/,2010-06-22,2010-06-22,,2010-02-25,2010-04-30,2010-05-25,9,9,666,42.358660,-71.056740,2016-06-24
3,18230910,sasn 2009,SASN 2009,1128894334,Saint Petersburg (Russia),http://www.ieee-sasn.org/index.html,2009-10-12,2009-10-14,2009-06-19,2009-06-26,2009-07-31,2009-09-11,0,0,0,59.933180,30.306030,2016-06-24
4,31227610,eurocon 2011,EUROCON 2011,1190350587,"Lisbon, Portugal",http://www.eurocon2011.it.pt/,2011-04-27,2011-04-29,,2010-10-30,2011-01-30,2011-02-28,279,279,864,38.725700,-9.150250,2016-06-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16454,2890348533,icn 2019,ICN 2019,1147859159,"Valencia, Spain",http://iaria.org/conferences2019/ICN19.html,2019-03-24,2019-03-28,,2018-11-10,2019-01-10,2019-02-04,179,179,180,39.468990,-0.376860,2018-09-27
16455,2890856803,ismco'2019,ISMCO'2019,2898428697,"Incline Village,us",http://www.ismco.net,2019-04-29,2019-05-01,,2018-12-17,,,6,6,12,39.250090,-119.959267,2018-09-27
16456,2891744759,ro2018,RO2018,2898131225,"Amsterdam,nl",http://www.researchobject.org/ro2018/,2018-10-29,2018-10-29,2018-07-15,2018-07-15,,,1,1,0,52.353218,5.002769,2018-09-27
16457,2892113557,icccn 2019,ICCCN 2019,1137850760,"Valencia, Spain",http://icccn.org/icccn19,2019-07-29,2019-08-01,,2019-03-01,2019-04-26,2019-05-10,126,126,224,39.468990,-0.376860,2018-09-27


In [18]:
# Keep only the useful Conference Instances columns
#
# The columns to keep are selected explicitly (instead of dropping the others)
# so that this cell can be re-run safely: a second `drop` of the same columns
# would raise a KeyError because they are already gone.
# `.copy()` makes the result an independent dataframe.
df_mag_conf_instances = df_mag_conf_instances[[
    'ConferenceInstanceID',
    'NormalizedName',
    'DisplayName',
    'ConferenceSeriesId',
    'Location',
    'StartDate',
    'EndDate',
]].copy()
df_mag_conf_instances

Unnamed: 0,ConferenceInstanceID,NormalizedName,DisplayName,ConferenceSeriesId,Location,StartDate,EndDate
0,7785157,time 2008,TIME 2008,2624631009,"Montreal, Canada",2008-06-16,2008-06-18
1,15420687,ipmu 2008,IPMU 2008,1128239323,"Malaga, Spain",2008-06-22,2008-06-27
2,16798864,wosn 2010,WOSN 2010,2756885533,"Boston, MA, USA",2010-06-22,2010-06-22
3,18230910,sasn 2009,SASN 2009,1128894334,Saint Petersburg (Russia),2009-10-12,2009-10-14
4,31227610,eurocon 2011,EUROCON 2011,1190350587,"Lisbon, Portugal",2011-04-27,2011-04-29
...,...,...,...,...,...,...,...
16454,2890348533,icn 2019,ICN 2019,1147859159,"Valencia, Spain",2019-03-24,2019-03-28
16455,2890856803,ismco'2019,ISMCO'2019,2898428697,"Incline Village,us",2019-04-29,2019-05-01
16456,2891744759,ro2018,RO2018,2898131225,"Amsterdam,nl",2018-10-29,2018-10-29
16457,2892113557,icccn 2019,ICCCN 2019,1137850760,"Valencia, Spain",2019-07-29,2019-08-01


In [19]:
# ******************* CONFERENCE SERIES ********************
# Load the Conference Series file (tab-separated)

# Column names are supplied explicitly, so the first line of the file is read
# as data. The names follow the official MAG schema documentation.
df_mag_conf_series_col_names = [
    'ConferenceSeriesID',
    'Rank',
    'NormalizedName',
    'DisplayName',
    'PaperCount',
    'PaperFamilyCount',
    'CitationCount',
    'CreatedDate',
]

df_mag_conf_series = pd.read_csv(
    path_file_import + 'ConferenceSeries.txt',
    sep='\t',
    header=None,
    names=df_mag_conf_series_col_names,
)
df_mag_conf_series

Unnamed: 0,ConferenceSeriesID,Rank,NormalizedName,DisplayName,PaperCount,PaperFamilyCount,CitationCount,CreatedDate
0,1134804816,12817,ICIDS,International Conference on Interactive Digita...,611,610,2945,2016-06-24
1,1165160117,14777,SWAT4LS,Semantic Web Applications and Tools for Life S...,85,85,213,2016-06-24
2,1192093291,12271,TRIDENTCOM,Testbeds and Research Infrastructures for the ...,571,571,5174,2016-06-24
3,1199066382,10155,BIOINFORMATICS,International Conference on Bioinformatics,10692,10692,17021,2016-06-24
4,1201746639,15567,AIS,Autonomous and Intelligent Systems,165,165,1002,2016-06-24
...,...,...,...,...,...,...,...,...
4533,2754809603,14461,IPSS,IEEE International Power Sources Symposium,101,101,188,2017-09-25
4534,2756271167,13527,ECMS,European Conference on Modelling and Simulation,283,283,915,2017-09-25
4535,2756896743,17566,CAI,Conference on Algebraic Informatics,124,124,567,2017-10-06
4536,2757378734,15053,UPGRADE-CN,"Use of P2P, GRID and Agents for the Developmen...",40,40,314,2017-10-06


In [20]:
# Keep only the useful Conference Series columns
#
# The columns to keep are selected explicitly (instead of dropping the others)
# so that this cell can be re-run safely: a second `drop` of the same columns
# would raise a KeyError because they are already gone.
# `.copy()` makes the result an independent dataframe.
df_mag_conf_series = df_mag_conf_series[[
    'ConferenceSeriesID',
    'NormalizedName',
    'DisplayName',
]].copy()
df_mag_conf_series

Unnamed: 0,ConferenceSeriesID,NormalizedName,DisplayName
0,1134804816,ICIDS,International Conference on Interactive Digita...
1,1165160117,SWAT4LS,Semantic Web Applications and Tools for Life S...
2,1192093291,TRIDENTCOM,Testbeds and Research Infrastructures for the ...
3,1199066382,BIOINFORMATICS,International Conference on Bioinformatics
4,1201746639,AIS,Autonomous and Intelligent Systems
...,...,...,...
4533,2754809603,IPSS,IEEE International Power Sources Symposium
4534,2756271167,ECMS,European Conference on Modelling and Simulation
4535,2756896743,CAI,Conference on Algebraic Informatics
4536,2757378734,UPGRADE-CN,"Use of P2P, GRID and Agents for the Developmen..."


In [None]:
# Get All Files' Names
#
# glob returns the matches in an arbitrary, filesystem-dependent order.
# Sorting makes the processing order (and the "CSV <n>" numbers printed in
# the progress log) the same on every run.
coci_all_csvs = sorted(glob.glob(path_file_import + "*.csv"))

In [None]:
# Count the citations received by each article across all the CSVs listed in
# `coci_all_csvs`, then export the result as a single CSV.

# Running result: one row per cited article with its total citations count
df_coci_processed = pd.DataFrame(columns=['article', 'citations_count'])

# Combine new data with a partial CSV
if combine_with_partial_csv:
    # Only the two data columns are loaded, so a partial CSV that still
    # carries a leading index column is accepted as well.
    df_coci_processed = pd.read_csv(
        partial_csv_path,
        usecols=['article', 'citations_count'],
        low_memory=False,
    )
    print('Successfully Imported the Partial CSV')

# Read, process and merge all CSVs, one at a time
for count, current_csv_name in enumerate(coci_all_csvs):

    print(f'Currently processing CSV {count}: {current_csv_name}')

    # 'cited' is the only column needed for the count, so the other columns
    # are never loaded. Group by cited article and count the rows; the result
    # is sorted by article.
    df_coci_current_csv = (
        pd.read_csv(current_csv_name, usecols=['cited'], low_memory=False)
        .groupby('cited')
        .size()
        .rename_axis('article')
        .reset_index(name='citations_count')
    )

    # A CSV without rows adds nothing to the totals
    if df_coci_current_csv.empty:
        continue

    # First data seen: adopt it as-is. Concatenating it to the empty,
    # object-typed accumulator would turn 'citations_count' into an object column.
    if df_coci_processed.empty:
        df_coci_processed = df_coci_current_csv
        continue

    # Merge with the data previously elaborated and immediately reduce it again
    # (group by article, sum the counts) so the accumulator keeps one row per article
    df_coci_processed = (
        pd.concat([df_coci_processed, df_coci_current_csv], ignore_index=True)
        .groupby('article', as_index=False)['citations_count']
        .sum()
    )

# Export of the final dataframe
# index=False: the row index carries no information and would otherwise be
# written as an extra unnamed column.
export_csv_path = path_file_export + 'out_coci_citations_count.csv'
df_coci_processed.to_csv(export_csv_path, index=False)
print(f'Successfully Exported the Preprocessed CSV to {export_csv_path}')


In [None]:
# Re-read the exported CSV from disk as a sanity check
df_coci_exported_csv = pd.read_csv(
    f'{path_file_export}out_coci_citations_count.csv',
    low_memory=False,
)
df_coci_exported_csv

In [None]:
# Rank the articles from the most cited to the least cited
df_coci_exported_csv = df_coci_exported_csv.sort_values('citations_count', ascending=False)
df_coci_exported_csv

In [None]:
# Total number of citations over all the articles of the exported CSV
df_coci_exported_csv.loc[:, 'citations_count'].sum()