In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import random
# Seed both random number generators used further down (the stdlib `random`
# module and numpy's legacy global generator) so that the random draws made
# by the notebook are repeatable from one kernel restart to the next.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Widen pandas' display limits so wide/long frames are shown in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

We load the first 1000 rows of the large dblp_article.csv file and keep only the subset of attributes that are relevant to the problem at hand.

We drop the rows that have missing values in the attributes we rely on (articleID, authors and volume).

We remove duplicate articles.

In [2]:
import pandas as pd

# Names we give to the attributes we keep, and their positions in the
# header-less, ';'-separated DBLP export.
headers = ['articleID', 'authors', 'journal', 'number', 'pages', 'title', 'volume', 'year']
headerPos= [0, 1, 15, 22, 23, 29, 33, 34]

# Read only the first 1000 rows and only the columns we actually use.
# With header=None the resulting columns are labelled by their position.
df = pd.read_csv('../../../testData/dblp_article.csv', nrows=1000, delimiter=';',
                 header=None, usecols=headerPos)

# Select by position label so the column order matches `headers` regardless
# of the order in which the parser returns them, then rename.
df = df[headerPos]
df.columns = headers

# Missing values per attribute before and after dropping the rows that lack
# an article id, an author list or a volume.
print(df.isna().sum())
df = df.dropna(subset=['articleID', 'authors', 'volume'])
print(df.isna().sum())

# Keep a single row per article id.
df = df.drop_duplicates(['articleID'])

articleID      0
authors        8
journal        0
number       565
pages        562
title          0
volume         4
year           0
dtype: int64
articleID      0
authors        0
journal        0
number       553
pages        550
title          0
volume         0
year           0
dtype: int64


<h2>Random Period</h2>

Generate a random period lasting between 1 and 5 days within a given year. This will be used as the dates on which a conference edition is held.

In [4]:
def random_date(year, max_duration=5):
    """Return a random period of 1 to ``max_duration`` days inside ``year``.

    Parameters
    ----------
    year : int
        Calendar year in which the whole period must fall.
    max_duration : int, optional
        Largest possible distance, in days, between the start date and the
        end date. Defaults to 5.

    Returns
    -------
    tuple of str
        ``(start_date, end_date)`` formatted as ``'%d-%m-%Y'``.

    Raises
    ------
    ValueError
        If ``max_duration`` is smaller than 1 or does not fit in the year.
    """
    # First day of that year, i.e. 01/01/year
    first_day = dt.date(year, 1, 1)

    # Number of days in that year (365 or 366)
    days_in_year = (dt.date(year + 1, 1, 1) - first_day).days

    if not 1 <= max_duration < days_in_year:
        raise ValueError('max_duration must be between 1 and the number of days in the year minus 1')

    # Offsets are 0-based, so the last day of the year is offset
    # days_in_year - 1. random.randint includes both bounds, therefore the
    # latest admissible start is days_in_year - 1 - max_duration: even the
    # longest period then still ends on 31 December instead of spilling
    # over into the following year.
    start_offset = random.randint(0, days_in_year - 1 - max_duration)

    # Duration of the conference edition in days
    end_offset = random.randint(1, max_duration)

    start_date = first_day + dt.timedelta(days=start_offset)
    end_date = start_date + dt.timedelta(days=end_offset)

    # Start date and end date of the conference edition
    return (start_date.strftime('%d-%m-%Y'), end_date.strftime('%d-%m-%Y'))


<h2>Conference Data</h2>

We create a dictionary holding 5 conferences for each year, from the earliest publication year in the dataset up to 24 years after the current year.

Each year there is a new edition of each of the conferences. Each edition is held in a random city and period of the year.

In [5]:
# Years covered: from the earliest publication year in the dataset up to
# (but not including) the current year + 25.
years = np.arange(start=min(df.year), stop=int(dt.datetime.now().year)+25)

# List of conference names
conference_names = ['ICML', 'NeurIPS', 'AAAI', 'CVPR', 'ACL']

# Cities an edition can be held in (the same pool for every edition)
cities = ['San Francisco', 'New York', 'London', 'Paris', 'Tokyo']

# year -> conference name -> edition id -> edition attributes
year_data = {}

# Every conference gets exactly one edition per year. The edition number is
# the 1-based position of the year in `years`, so it is shared by all the
# conferences of that year.
for edition, year in enumerate(years, start=1):
    # Use plain Python ints as keys; they hash and compare equal to the
    # numpy integers they are built from.
    year = int(year)

    # Conferences held during this year
    conference_data = {}

    for name in conference_names:
        # Random city and random period of the year for this edition
        city = random.choice(cities)
        start_date, end_date = random_date(year)

        # A conference has a single edition in a given year, keyed by an
        # id of the form '<conference>_<edition number>'
        conference_data[name] = {
            name + '_' + str(edition): {'city': city, 'start_date': start_date,
                                        'end_date': end_date, 'number': edition}
        }

    year_data[year] = conference_data

<h2>Generate data</h2>

In [6]:
# Flip a fair coin per article: the ids that come up True are the papers
# treated as presented at a conference (roughly half of them).
presented_mask = np.random.choice(a=[False, True], size=len(df))
hasConf = df.loc[presented_mask, 'articleID']

# Topic -> keywords belonging to that topic.
keywords = {
    'database': [
        'data management', 'indexing', 'data modeling', 'big data',
        'data processing', 'data storage', 'data querying',
    ],
    'artificial intelligence': [
        'machine learning', 'neural networks', 'deep learning',
        'natural language processing', 'computer vision',
        'reinforcement learning', 'expert systems',
        'knowledge representation', 'genetic algorithms', 'bayesian networks',
    ],
    'cybersec': [
        'cybersecurity', 'network security', 'information security',
        'vulnerability', 'penetration testing', 'threat detection',
        'malware analysis', 'security policies', 'risk management', 'cybercrime',
    ],
}

# Pool of abstracts to draw from.
abstracts = ['Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla semper lacus sit amet nisl rhoncus tristique ac vel felis.']

# Flat list with every author name of every article. A name appears once
# per article it signs, so the list may contain repetitions.
authors = [name for author_field in df['authors'] for name in author_field.split('|')]

# The journal of the first article is singled out as the "special" journal.
# Further down, its papers receive database keywords and only draw their
# citations from papers of this same journal.
specialJournal = df['journal'].iloc[0]

# Give every article its own empty citation list up front.
df['citations'] = [list() for _ in df.index]

# For each article we generate the attributes the DBLP export lacks:
# - Citations
#   - At most 30 cited papers (for the special journal: at most one fewer
#     than the number of papers of that journal)
#   - A paper never cites itself
#   - Two papers never cite each other
#   - A paper never cites a paper published in a later year
# - Keywords
#   - Between 2 and 6 distinct keywords, all from one topic
#   - Papers of the special journal always get the 'database' topic
# - Corresponding author
#   - The first author listed in df['authors']
# - Reviewers
#   - Between 3 and 5 distinct people taken from the authors of the dataset
#   - An author of the paper never reviews that paper
# - Conference edition (only for the articles listed in hasConf)
#   - conference, edition_year, edition_id, edition_number, city,
#     start_date, end_date
# - Abstract

# Lookups built once instead of scanning the frame for every article.
all_article_ids = df['articleID'].tolist()
special_article_ids = df.loc[df['journal'] == specialJournal, 'articleID'].tolist()
year_by_id = dict(zip(df['articleID'], df['year']))
citations_by_id = dict(zip(df['articleID'], df['citations']))
conference_article_ids = set(hasConf.values)
unique_authors = list(dict.fromkeys(authors))
topics = list(keywords.keys())

generated_rows = []
for articleIndex, article in df.iterrows():
    article_id = article['articleID']
    article_year = article['year']
    article_authors = article['authors'].split('|')

    # Papers of the special journal are all tied to the database topic and
    # only cite papers of that same journal, so that the queries relying on
    # such a journal return at least one result.
    if article['journal'] == specialJournal:
        topic = 'database'
        candidate_ids = special_article_ids
        max_citations = len(special_article_ids) - 1
    else:
        topic = random.choice(topics)
        candidate_ids = all_article_ids
        max_citations = min(30, len(all_article_ids))

    # random.sample draws without replacement: no repeated keyword and no
    # repeated citation candidate.
    topic_keywords = keywords[topic]
    words = random.sample(topic_keywords, k=min(random.randint(2, 6), len(topic_keywords)))
    candidates = random.sample(candidate_ids, k=random.randint(0, max_citations))

    # Build a filtered list instead of removing from the list being
    # iterated, which would skip the element following every removal.
    citations = [
        cited_id for cited_id in candidates
        if cited_id != article_id                           # no self-citation
        and article_id not in citations_by_id[cited_id]     # no mutual citation
        and year_by_id[cited_id] <= article_year            # nothing from the future
    ]
    citations_by_id[article_id] = citations
    if citations:
        df.at[articleIndex, 'citations'] = citations

    # Reviewers are drawn among the people who did not write this paper.
    # Names are compared one by one (not as substrings of the '|'-joined
    # author field) and drawn without replacement.
    own_authors = set(article_authors)
    eligible_reviewers = [name for name in unique_authors if name not in own_authors]
    reviewers = random.sample(eligible_reviewers,
                              k=min(random.randint(3, 5), len(eligible_reviewers)))

    row = {
        'correspondingAuthor': article_authors[0],
        'keywords': '|'.join(words),
        'reviewers': '|'.join(reviewers),
        'abstract': random.choice(abstracts),
    }

    if article_id in conference_article_ids:
        # The paper is presented at the edition that a randomly chosen
        # conference holds in the paper's publication year.
        temp_conference = random.choice(conference_names)
        temp_year = int(article_year)
        temp_edition, edition = next(iter(year_data[temp_year][temp_conference].items()))
        row.update({
            'edition_year': temp_year,
            'conference': temp_conference,
            'edition_id': temp_edition,
            'edition_number': edition['number'],
            'city': edition['city'],
            'start_date': edition['start_date'],
            'end_date': edition['end_date'],
        })

    generated_rows.append(row)

# Attach the generated attributes in a fixed column order. Articles without
# a conference keep missing values in the edition columns.
generated_columns = ['correspondingAuthor', 'keywords', 'edition_year', 'conference',
                     'edition_id', 'edition_number', 'city', 'start_date', 'end_date',
                     'reviewers', 'abstract']
generated = pd.DataFrame(generated_rows, index=df.index, columns=generated_columns)
for column in generated_columns:
    df[column] = generated[column]

# Confirm that papers cannot cite each other. If Paper1 cites Paper2, Paper2 cannot cite Paper1.
# Each article gets a freshly filtered list: removing items from the list
# that is being iterated would skip the element after every removal.
citations_by_id = dict(zip(df['articleID'], df['citations']))
for articleIndex, article in df.iterrows():
    article_id = article['articleID']
    kept = [cited_id for cited_id in citations_by_id[article_id]
            if article_id not in citations_by_id[cited_id]]
    # Later articles must see the already cleaned list of this one, so that
    # only one direction of a mutual citation is dropped.
    citations_by_id[article_id] = kept
    df.at[articleIndex, 'citations'] = kept

# Format citations to follow the same structure as authors and reviewers
# Example: citation1|citation2|citation3
df['citations'] = df['citations'].apply(lambda x: '|'.join(str(i) for i in x))

# We generate a random, unique 8-digit id (ISSN) for each of the journals.
# random.sample draws without replacement, so the ids cannot collide and no
# retry loop is needed.
journalNames = np.unique(df['journal'].values)
journalISSNS = np.array(random.sample(range(10000000, 99999999), k=len(journalNames)))

# We assign an issn to each journal and a volume id ('<issn>-<volume>') to
# each of its volumes. The volume is cast to str so that the concatenation
# also works when pandas parsed the column as numbers.
issn_by_journal = {journal: str(issn) for journal, issn in zip(journalNames, journalISSNS)}
df['issn'] = df['journal'].map(issn_by_journal)
df['volume_id'] = df['issn'] + '-' + df['volume'].astype(str)
    


In [7]:
# Write the enriched articles to disk. The frame's index is written as the
# first (unnamed) column because to_csv is called with its defaults.
output_path = '../../../testData/sampleArticle.csv'
df.to_csv(output_path)