In [None]:
# ***************************************************************
# Preprocess Microsoft Academic Graph (MAG) Dataset
# ***************************************************************

# Jupyter Notebook for the preprocessing of the Microsoft Academic Graph (MAG) dump
#
# TODO **********
# In particular, the following operations are going to be executed:
# - Opening of ConferenceInstances and ConferenceSeries CSVs
# - Drop of the useless columns 
# - Merge of the two CSVs on the ConferenceSeriesID column
# - Chunked processing of the Papers CSV
# ---- Drop of the useless columns
# ---- Drop of the rows of papers from journals and books
# - Merge with the processed conferences data
#
# Lastly, the entire preprocessed dump is going to be saved on disk in CSV format

In [1]:
# Libraries Import
import glob

import pandas as pd

# Never truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

In [2]:
# ******************* PATHS ********************

# Input directory: the dump files are read from here.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_file_import = (
    r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/MAG/'
)

# Output directory: the preprocessed CSV files are written here.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_file_export = (
    r'/Users/marcoterzulli/File/Scuola Local/Magistrale/Materiale Corsi Attuali/Tirocinio/Cartella di Lavoro/Archivi Dump di Lavoro/Export/MAG_Chunks/'
)

In [59]:
# ******************* CONFERENCE INSTANCES ********************

# Load the Conference Instances dump (tab-separated text file).
# Because `names=` is passed without `header=`, pandas treats every line of the
# file as data and labels the columns with the list below.

# The column names follow the official MAG schema documentation
df_mag_conf_instances_col_names = [
    'ConferenceInstanceID',
    'NormalizedName',
    'DisplayName',
    'ConferenceSeriesID',
    'Location',
    'OfficialUrl',
    'StartDate',
    'EndDate',
    'AbstractRegistrationDate',
    'SubmissionDeadlineDate',
    'NotificationDueDate',
    'FinalVersionDueDate',
    'PageCount',
    'PaperFamilyCount',
    'CitationCount',
    'Latitude',
    'Longitude',
    'CreatedDate',
]

df_mag_conf_instances = pd.read_csv(
    path_file_import + 'ConferenceInstances.txt',
    sep='\t',
    names=df_mag_conf_instances_col_names,
)
df_mag_conf_instances

Unnamed: 0,ConferenceInstanceID,NormalizedName,DisplayName,ConferenceSeriesID,Location,OfficialUrl,StartDate,EndDate,AbstractRegistrationDate,SubmissionDeadlineDate,NotificationDueDate,FinalVersionDueDate,PageCount,PaperFamilyCount,CitationCount,Latitude,Longitude,CreatedDate
0,7785157,time 2008,TIME 2008,2624631009,"Montreal, Canada",http://www.time2008.org/,2008-06-16,2008-06-18,,2008-01-11,,,23,23,319,45.512400,-73.554680,2016-06-24
1,15420687,ipmu 2008,IPMU 2008,1128239323,"Malaga, Spain",http://www.gimac.uma.es/ipmu08,2008-06-22,2008-06-27,,2007-12-07,,,5,5,45,36.718320,-4.420160,2016-06-24
2,16798864,wosn 2010,WOSN 2010,2756885533,"Boston, MA, USA",http://www.usenix.org/events/wosn10/cfp/,2010-06-22,2010-06-22,,2010-02-25,2010-04-30,2010-05-25,9,9,666,42.358660,-71.056740,2016-06-24
3,18230910,sasn 2009,SASN 2009,1128894334,Saint Petersburg (Russia),http://www.ieee-sasn.org/index.html,2009-10-12,2009-10-14,2009-06-19,2009-06-26,2009-07-31,2009-09-11,0,0,0,59.933180,30.306030,2016-06-24
4,31227610,eurocon 2011,EUROCON 2011,1190350587,"Lisbon, Portugal",http://www.eurocon2011.it.pt/,2011-04-27,2011-04-29,,2010-10-30,2011-01-30,2011-02-28,279,279,864,38.725700,-9.150250,2016-06-24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16454,2890348533,icn 2019,ICN 2019,1147859159,"Valencia, Spain",http://iaria.org/conferences2019/ICN19.html,2019-03-24,2019-03-28,,2018-11-10,2019-01-10,2019-02-04,179,179,180,39.468990,-0.376860,2018-09-27
16455,2890856803,ismco'2019,ISMCO'2019,2898428697,"Incline Village,us",http://www.ismco.net,2019-04-29,2019-05-01,,2018-12-17,,,6,6,12,39.250090,-119.959267,2018-09-27
16456,2891744759,ro2018,RO2018,2898131225,"Amsterdam,nl",http://www.researchobject.org/ro2018/,2018-10-29,2018-10-29,2018-07-15,2018-07-15,,,1,1,0,52.353218,5.002769,2018-09-27
16457,2892113557,icccn 2019,ICCCN 2019,1137850760,"Valencia, Spain",http://icccn.org/icccn19,2019-07-29,2019-08-01,,2019-03-01,2019-04-26,2019-05-10,126,126,224,39.468990,-0.376860,2018-09-27


In [None]:
# Keep only the identifying columns of each conference instance:
# every column listed here is removed from the dataframe.
df_mag_conf_instances = df_mag_conf_instances.drop(
    [
        'OfficialUrl',
        'AbstractRegistrationDate',
        'SubmissionDeadlineDate',
        'NotificationDueDate',
        'FinalVersionDueDate',
        'PageCount',
        'PaperFamilyCount',
        'CitationCount',
        'Latitude',
        'Longitude',
        'CreatedDate',
        'StartDate',
        'EndDate',
    ],
    axis='columns',
)
df_mag_conf_instances

In [None]:
# Prefix the generic column names with "Conference" so that they stay
# unambiguous once this dataframe is joined with other tables.
# `rename` returns a new dataframe that is re-assigned (no in-place mutation).
df_mag_conf_instances = df_mag_conf_instances.rename(
    columns={
        'NormalizedName': 'ConferenceNormalizedName',
        'DisplayName': 'ConferenceDisplayName',
        'Location': 'ConferenceLocation',
    }
)
df_mag_conf_instances

In [60]:
# ******************* CONFERENCE SERIES ********************

# Load the Conference Series dump (tab-separated text file).
# Because `names=` is passed without `header=`, pandas treats every line of the
# file as data and labels the columns with the list below.

# The column names follow the official MAG schema documentation
df_mag_conf_series_col_names = [
    'ConferenceSeriesID',
    'Rank',
    'NormalizedName',
    'DisplayName',
    'PaperCount',
    'PaperFamilyCount',
    'CitationCount',
    'CreatedDate',
]

df_mag_conf_series = pd.read_csv(
    path_file_import + 'ConferenceSeries.txt',
    sep='\t',
    names=df_mag_conf_series_col_names,
)
df_mag_conf_series

Unnamed: 0,ConferenceSeriesID,Rank,NormalizedName,DisplayName,PaperCount,PaperFamilyCount,CitationCount,CreatedDate
0,1134804816,12817,ICIDS,International Conference on Interactive Digita...,611,610,2945,2016-06-24
1,1165160117,14777,SWAT4LS,Semantic Web Applications and Tools for Life S...,85,85,213,2016-06-24
2,1192093291,12271,TRIDENTCOM,Testbeds and Research Infrastructures for the ...,571,571,5174,2016-06-24
3,1199066382,10155,BIOINFORMATICS,International Conference on Bioinformatics,10692,10692,17021,2016-06-24
4,1201746639,15567,AIS,Autonomous and Intelligent Systems,165,165,1002,2016-06-24
...,...,...,...,...,...,...,...,...
4533,2754809603,14461,IPSS,IEEE International Power Sources Symposium,101,101,188,2017-09-25
4534,2756271167,13527,ECMS,European Conference on Modelling and Simulation,283,283,915,2017-09-25
4535,2756896743,17566,CAI,Conference on Algebraic Informatics,124,124,567,2017-10-06
4536,2757378734,15053,UPGRADE-CN,"Use of P2P, GRID and Agents for the Developmen...",40,40,314,2017-10-06


In [61]:
# Remove the ranking / counting / bookkeeping columns of the conference series;
# only the ID and the two name columns are left.
df_mag_conf_series = df_mag_conf_series.drop(
    [
        'Rank',
        'PaperCount',
        'PaperFamilyCount',
        'CitationCount',
        'CreatedDate',
    ],
    axis='columns',
)
df_mag_conf_series

Unnamed: 0,ConferenceSeriesID,NormalizedName,DisplayName
0,1134804816,ICIDS,International Conference on Interactive Digita...
1,1165160117,SWAT4LS,Semantic Web Applications and Tools for Life S...
2,1192093291,TRIDENTCOM,Testbeds and Research Infrastructures for the ...
3,1199066382,BIOINFORMATICS,International Conference on Bioinformatics
4,1201746639,AIS,Autonomous and Intelligent Systems
...,...,...,...
4533,2754809603,IPSS,IEEE International Power Sources Symposium
4534,2756271167,ECMS,European Conference on Modelling and Simulation
4535,2756896743,CAI,Conference on Algebraic Informatics
4536,2757378734,UPGRADE-CN,"Use of P2P, GRID and Agents for the Developmen..."


In [None]:
# Prefix the generic column names with "ConferenceSeries" so that they stay
# unambiguous once this dataframe is joined with other tables.
# `rename` returns a new dataframe that is re-assigned (no in-place mutation).
df_mag_conf_series = df_mag_conf_series.rename(
    columns={
        'NormalizedName': 'ConferenceSeriesNormalizedName',
        'DisplayName': 'ConferenceSeriesDisplayName',
    }
)
df_mag_conf_series

In [None]:
# ******************* MERGE OF CONFERENCE DATA ********************

# Join every conference instance with its conference series through the shared
# ConferenceSeriesID column. This is an inner join (the pandas default), so rows
# whose ConferenceSeriesID has no match on the other side are left out.
df_mag_conf_merged = pd.merge(
    df_mag_conf_instances,
    df_mag_conf_series,
    how='inner',
    on='ConferenceSeriesID',
)
df_mag_conf_merged

In [77]:
# ******************* PAPERS ********************

# The Papers dump is processed in chunks, due to its size: each chunk is
# filtered and immediately written to its own CSV file, so only one chunk is
# held in memory at a time.

# The column names follow the official MAG schema documentation
# NOTE(review): 'Retracion' is presumably the DocSubTypes column of the schema — TODO confirm
df_mag_papers_col_names = ['PaperID', 'Rank', 'Doi', 'DocType', 'PaperTitle', 'OriginalTitle', 'BookTitle', 'Year', 'Date', 'OnlineDate', 'Publisher', 'JournalID', 'ConferenceSeriesID', 'ConferenceInstanceID', 'Volume', 'Issue', 'FirstPage', 'LastPage', 'ReferenceCount', 'CitationCount', 'EstimatedCitation', 'OriginalVenue', 'FamilyID', 'FamilyRank', 'Retracion', 'CreatedDate']

# Number of rows read per chunk
chunksize = 10 ** 5

# Rough number of rows in Papers.txt; only used to estimate the total number of
# chunks shown in the progress message (integer division avoids printing "2000.0")
approx_total_chunks = 200_000_000 // chunksize

# on_bad_lines='skip' makes pandas silently skip malformed lines of the dump
with pd.read_csv(path_file_import + 'Papers.txt', sep='\t', chunksize=chunksize, low_memory=False, on_bad_lines='skip', names=df_mag_papers_col_names) as reader:
    # Chunks are numbered from 1, so that file N holds the N-th block of rows
    # (the counter previously started at 2, labelling the first block "chunk 2")
    for count, chunk in enumerate(reader, start=1):
        print(f'Currently processing chunk {count} out of around {approx_total_chunks}')

        # Drop of the useless columns
        chunk = chunk.drop(columns=['Rank', 'OnlineDate', 'Publisher', 'Volume', 'Issue', 'FirstPage', 'LastPage', 'ReferenceCount', 'OriginalVenue', 'FamilyID', 'FamilyRank', 'Retracion', 'CreatedDate', 'JournalID', 'BookTitle'])

        # Filtering of papers without DOI
        chunk = chunk.dropna(subset=['Doi'])

        # Filtering of books and Journals papers
        # TODO

        # Writing the partial CSV on Disk (the dataframe index is written as the first column)
        chunk.to_csv(path_file_export + 'out_mag_citations_chunk_' + str(count) + '.csv')

Currently processing chunk 2 out of around 2000.0
Currently processing chunk 3 out of around 2000.0
Currently processing chunk 4 out of around 2000.0


KeyboardInterrupt: 

In [78]:
# Display the chunk currently bound to `chunk` (the last one handled by the processing loop)
chunk

Unnamed: 0,PaperID,Doi,DocType,PaperTitle,OriginalTitle,Year,Date,ConferenceSeriesID,ConferenceInstanceID,CitationCount,EstimatedCitation
200001,2772839314,10.23870/21,,john peeler building democracy in latin americ...,John Peeler. Building Democracy in Latin Ameri...,2007.0,2007-01-01,,,0,0
200009,2774597174,10.4172/2155-9562-C1-056,,preventive effects of resveratrol against chem...,Preventive effects of Resveratrol against chem...,2017.0,2017-09-01,,,0,0
200015,2776076678,10.1107/S2053273317091239,Journal,dna structural alphabet opens ways to understa...,DNA structural alphabet opens ways to understa...,2017.0,2017-12-01,,,0,0
200016,2776430014,10.5696/2156-9614-7.16.2,Journal,heavy metal pollution near a tannery in ulaanb...,Heavy Metal Pollution Near a Tannery in Ulaanb...,2017.0,2017-12-19,,,4,4
200024,2779283836,10.1515/CEJPP-2016-0035,Journal,childcare policy in the czech republic and nor...,Childcare policy in the Czech Republic and Nor...,2017.0,2017-12-01,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...
299992,2916209657,10.30682/AA1801P,,trentino territorio paesaggio e architettura d...,"Trentino. Territorio, paesaggio e architettura...",2018.0,2018-11-15,,,0,0
299994,2916723923,10.1109/GLOCOM.2018.8648031,Conference,intelligent and energy efficient mobile smartp...,Intelligent and Energy Efficient Mobile Smartp...,2018.0,2018-12-01,1.131421e+09,2.788469e+09,11,11
299995,2916989951,10.1016/J.CARREV.2019.02.016,Journal,adverse events associated with the use of guid...,Adverse Events Associated with the Use of Guid...,2019.0,2019-05-01,,,13,13
299996,2917251621,10.1594/PANGAEA.776866,,gamma density porosity and susceptibility of s...,"Gamma density, porosity and susceptibility of ...",2009.0,2009-03-01,,,0,0


In [None]:
# ******************* PAPERS ********************

# The Papers dump is read in chunks, due to its size: the useless columns are
# dropped from every chunk and the reduced chunks are concatenated into a single
# in-memory dataframe.

# The column names follow the official MAG schema documentation.
# The list must have one name per column of the dump: if it is shorter, pandas
# uses the leading column(s) as an implicit index and every name is shifted.
# 'Retracion' is therefore listed here as well, as in the other Papers cell.
# NOTE(review): 'Retracion' is presumably the DocSubTypes column of the schema — TODO confirm
df_mag_papers_col_names = ['PaperID', 'Rank', 'Doi', 'DocType', 'PaperTitle', 'OriginalTitle', 'BookTitle', 'Year', 'Date', 'OnlineDate', 'Publisher', 'JournalID', 'ConferenceSeriesID', 'ConferenceInstanceID', 'Volume', 'Issue', 'FirstPage', 'LastPage', 'ReferenceCount', 'CitationCount', 'EstimatedCitation', 'OriginalVenue', 'FamilyID', 'FamilyRank', 'Retracion', 'CreatedDate']

# List of processed chunks.
df_mag_papers_list_of_chunks = list()

# Number of rows read per chunk
chunksize = 10 ** 7

# Rough number of rows in Papers.txt; only used to estimate the total number of
# chunks shown in the progress message
approx_total_chunks = 200_000_000 // chunksize

# on_bad_lines='skip' makes pandas silently skip malformed lines of the dump
with pd.read_csv(path_file_import + 'Papers.txt', sep='\t', names=df_mag_papers_col_names, chunksize=chunksize, low_memory=False, on_bad_lines='skip') as reader:
    for count, chunk in enumerate(reader, start=1):
        print(f'Currently processing chunk {count} out of around {approx_total_chunks}')

        # Drop of the useless columns (JournalID and BookTitle are kept in this cell)
        chunk = chunk.drop(columns=['Rank', 'OnlineDate', 'Publisher', 'Volume', 'Issue', 'FirstPage', 'LastPage', 'ReferenceCount', 'OriginalVenue', 'FamilyID', 'FamilyRank', 'Retracion', 'CreatedDate'])

        # Filtering of books and Journals papers
        # TODO

        # Insert of the resulting chunk in the list
        df_mag_papers_list_of_chunks.append(chunk)

# Concatenation of the processed chunks
df_mag_papers = pd.concat(df_mag_papers_list_of_chunks)

# Empty the list to free some memory
df_mag_papers_list_of_chunks = list()

In [None]:
# Display the concatenated papers dataframe
df_mag_papers

In [None]:
# Persist the concatenated papers dataframe as a single CSV file in the export
# directory (the dataframe index is written as the first column), then confirm.
df_mag_papers.to_csv(
    path_or_buf=path_file_export + 'out_mag_citations_partial.csv',
)
print(f'Successfully Exported the Partial Preprocessed CSV to {path_file_export}out_mag_citations_partial.csv')


In [None]:
# Collect the paths of all the CSV files found in the import directory.
# glob.glob returns the matches in arbitrary order, so the list is sorted to
# make the processing order (and the progress log) reproducible between runs.
coci_all_csvs = sorted(glob.glob(path_file_import + "*.csv"))

In [None]:
# Running result: one row per cited article with its total number of citations
df_coci_processed = pd.DataFrame(columns=['article', 'citations_count'])

# Combine new data with a partial CSV
# NOTE(review): `combine_with_partial_csv` and `partial_csv_path` are not defined
# in the visible part of this notebook — they must be set before running this cell.
if combine_with_partial_csv:
    df_coci_processed = pd.read_csv(partial_csv_path, low_memory=False)
    print(f'Successfully Imported the Partial CSV')

# Read, process and concat all CSVs
for count, current_csv_name in enumerate(coci_all_csvs):

    # Open the current CSV
    print(f'Currently processing CSV {count}: {current_csv_name}')
    df_coci_current_csv = pd.read_csv(current_csv_name, low_memory=False)

    # Drop of the useless columns: 'oci', 'citing', 'creation', 'timespan', 'journal_sc', 'author_sc'
    # DataFrame.drop returns a new dataframe, so the result has to be re-assigned
    # (the previous bare call discarded it and left the columns in place)
    df_coci_current_csv = df_coci_current_csv.drop(columns=['oci', 'citing', 'creation', 'timespan', 'journal_sc', 'author_sc'])

    # Group by cited article and count the citations in the current file
    sf_coci_current_grouped = df_coci_current_csv.groupby(['cited'])['cited'].count()

    # Since the returned object is a Pandas Series type, we need to convert it to a Pandas dataframe
    df_coci_current_csv = pd.DataFrame({'article': sf_coci_current_grouped.index, 'citations_count': sf_coci_current_grouped.values})

    # Concat with the data previously elaborated
    df_coci_processed = pd.concat([df_coci_processed, df_coci_current_csv])

    # Group again and sum the citations_count right away, so that the running
    # result keeps a single row per article instead of growing with every file
    sf_coci_processed_grouped = df_coci_processed.groupby(['article'])['citations_count'].sum()
    df_coci_processed = pd.DataFrame({'article': sf_coci_processed_grouped.index, 'citations_count': sf_coci_processed_grouped.values})

# Export of the final dataframe
df_coci_processed.to_csv(path_file_export + 'out_coci_citations_count.csv')
print(f'Successfully Exported the Preprocessed CSV to {path_file_export}out_coci_citations_count.csv')


In [None]:
# Re-load the exported CSV from disk to inspect what was actually written
df_coci_exported_csv = pd.read_csv(
    path_file_export + 'out_coci_citations_count.csv',
    low_memory=False,
)
df_coci_exported_csv

In [None]:
# Most cited articles first: sort the rows by citations_count, highest value on top
df_coci_exported_csv = df_coci_exported_csv.sort_values(
    'citations_count',
    ascending=False,
)
df_coci_exported_csv

In [None]:
# Sanity check: total number of citations across all the rows of the exported CSV
df_coci_exported_csv.loc[:, 'citations_count'].sum()