## Imports

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import random
import uuid
# Raise pandas' display limits (row count, column count, cell width and total
# line width) for the tables rendered in this notebook.
_display_settings = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.max_colwidth': None,
    'display.width': 1000,
}
for _option_name, _option_value in _display_settings.items():
    pd.set_option(_option_name, _option_value)

## People Dataset

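We read our reference dataset, a sample of articles whose corresponding authors and titles seed the synthetic datasets generated below.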

In [2]:
# Load the reference article sample. Later cells take people names from its
# 'correspondingAuthor' column and paper titles from its 'title' column.
df = pd.read_csv('../../lab1/data/sampleArticle.csv')

We randomly assign each person exactly one role: author, reviewer, chairperson, or editor. For now, our model does not allow a person to hold two roles at the same time.

In [3]:
# Role name -> probability of a person being assigned that role.
people_types = {
    'Author': 0.5,
    'Reviewer': 0.3,
    'Editor': 0.1,
    'Chairperson': 0.1,
}


def randomPersonType(_):
    """Draw one role name at random, weighted by `people_types`.

    The argument is ignored; it exists so the function can be handed to
    `DataFrame.apply`, which passes each row.
    """
    roles = list(people_types)
    weights = list(people_types.values())
    return np.random.choice(roles, p=weights)

In [4]:
# Build the people table: one row per distinct corresponding author in the
# reference sample, each identified by a freshly generated random hex UUID.
people = np.unique(df['correspondingAuthor'].values)
people_ids = [uuid.uuid4().hex for _ in range(len(people))]

people_df = pd.DataFrame({'Name': people, 'ID': people_ids})
# Each person gets one independently drawn role. This assignment creates the
# column directly, so no empty placeholder column is written beforehand.
people_df['Type'] = people_df.apply(randomPersonType, axis=1)

people_df

Unnamed: 0,Name,ID,Type
0,A. Battermann,84840ca3f95546298476ddf23fb840e7,Author
1,Aaron Pratt Shepherd,97ac227865a64c9baea6e5d7458bf343,Reviewer
2,Abdelouahed Hamdi,73a4935f93514f36a9c5a8e695e4a3fc,Reviewer
3,Aceil Al-Khatib,ad51da81a4d64a40a3f53b5d6640e5ec,Reviewer
4,Adam Briggle,6b48250a5f164bdc9ba00956bc3c49b4,Author
...,...,...,...
658,Yotam Lurie,5845c49a143c49fe9edb529479d09058,Author
659,Yuan-Hsuan Lee 0001,163b173a3bc7474e907a51bbf9a640be,Editor
660,Yue Liang Zheng,5f6edb4530a44a4890dfba06f000fb85,Reviewer
661,Zach Piso,367939b9397344f08a901295b10c14b2,Author


### Exporting People Dataset

In [5]:
# Persist the people table; later cells reload it from this path.
# No index=False is passed, so the row index is written to the file as well.
people_df.to_csv('../data/people.csv')

## Area Dataset

In [6]:
# Research area -> the keywords associated with it.
area_keywords = {
    'MachineLearning': [
        'supervised learning', 'unsupervised learning', 'deep learning', 'neural networks', 'data mining',
    ],
    'Databases': [
        'relational database', 'SQL', 'NoSQL', 'data modeling', 'database design',
    ],
    'NaturalLanguageProcessing': [
        'text processing', 'sentiment analysis', 'language modeling', 'information retrieval',
        'part-of-speech tagging',
    ],
    'ArtificialIntelligence': [
        'knowledge representation', 'expert systems', 'robotics', 'computer vision', 'cognitive computing',
    ],
}

# Parallel lists, in the same order as the mapping above.
areas = list(area_keywords)
keywords = list(area_keywords.values())

# One row per area; the 'Keywords' cell holds the whole keyword list.
area_df = pd.DataFrame({'ID': areas, 'Keywords': keywords})
area_df


Unnamed: 0,ID,Keywords
0,MachineLearning,"[supervised learning, unsupervised learning, deep learning, neural networks, data mining]"
1,Databases,"[relational database, SQL, NoSQL, data modeling, database design]"
2,NaturalLanguageProcessing,"[text processing, sentiment analysis, language modeling, information retrieval, part-of-speech tagging]"
3,ArtificialIntelligence,"[knowledge representation, expert systems, robotics, computer vision, cognitive computing]"


### Exporting Area Dataset

In [7]:
# Persist the area table; the Venue section reloads it from this path.
# No index=False is passed, so the row index is written to the file as well.
area_df.to_csv('../data/area.csv')

## Venue Dataset

In [8]:
# Conference type name -> probability of a conference being given that type.
conference_types = {
    'RegularConference': 0.5,
    'Workshop': 0.3,
    'Symposium': 0.1,
    'ExpertGroups': 0.1,
}


def randomConferenceType(_):
    """Draw one conference type at random, weighted by `conference_types`.

    The argument is ignored; it exists so the function can be handed to
    `DataFrame.apply`, which passes each row.
    """
    type_names = list(conference_types)
    weights = list(conference_types.values())
    return np.random.choice(type_names, p=weights)

In [9]:
# Reload the area table from the exported CSV; this rebinds `area_df`, which
# `randomArea` below reads at call time.
area_df = pd.read_csv('../data/area.csv')
def randomArea(_):
    """Pick one area ID uniformly at random from `area_df['ID']`.

    The argument is ignored; it exists so the function can be handed to
    `DataFrame.apply`, which passes each row.
    """
    area_ids = area_df['ID']
    return np.random.choice(area_ids)

In [10]:
# Reload the people table from the exported CSV; this rebinds `people_df`,
# which `randomResponsible` below reads at call time.
people_df = pd.read_csv('../data/people.csv')
def randomResponsible(row):
    """Pick a random person to be responsible for the venue in `row`.

    A venue whose 'Type' is 'Conference' is assigned a random Chairperson;
    any other venue is assigned a random Editor. Candidates come from
    `people_df`.

    Returns:
        A two-element array holding the chosen person's Name and ID.
    """
    wanted_role = 'Chairperson' if row['Type'] == 'Conference' else 'Editor'
    candidate_ids = people_df.loc[people_df['Type'] == wanted_role, 'ID'].values

    picked_id = np.random.choice(candidate_ids)

    # Look the picked ID up again to return the name together with the ID.
    picked_person = people_df.loc[people_df['ID'] == picked_id, ['Name', 'ID']]
    return picked_person.values[0]


In [11]:
# Real data for conferences
# Three parallel lists of 15 entries each: the venue type label, a short
# venue ID, and the full venue name.
conference_data = {'Type': ['Conference'] * 15,
                   'ID': ['CVPR', 'NeurIPS', 'ICML', 'ACL', 'EMNLP', 'AAAI', 'ICLR', 'COLT', 'SIGGRAPH', 'SIGCOMM', 'INFOCOM', 'MobiCom', 'UbiComp', 'CHI', 'UIST'],
                   'Name': ['IEEE/CVF Conference on Computer Vision and Pattern Recognition',
                            'Conference on Neural Information Processing Systems',
                            'International Conference on Machine Learning',
                            'Association for Computational Linguistics',
                            'Empirical Methods in Natural Language Processing',
                            'Association for the Advancement of Artificial Intelligence',
                            'International Conference on Learning Representations',
                            'Conference on Learning Theory',
                            'Special Interest Group on Computer Graphics and Interactive Techniques Conference',
                            'Special Interest Group on Data Communications Conference',
                            'IEEE International Conference on Computer Communications',
                            'ACM International Conference on Mobile Computing and Networking',
                            'International Joint Conference on Pervasive and Ubiquitous Computing',
                            'Conference on Human Factors in Computing Systems',
                            'ACM Symposium on User Interface Software and Technology']}

# Real data for journals
# Same layout as the conference data: 15 parallel entries per list.
# NOTE: the IDs 'IEEE-TPAMI' and 'TPAMI' carry the same Name, so 'Name' is not
# a unique key for venues; identify venues by 'ID'.
journal_data = {'Type': ['Journal'] * 15,
                'ID': ['JMLR', 'IEEE-TKDE', 'IEEE-TPAMI', 'ACM-TIST', 'JASIST', 'IEEE-DS', 'ACM-CSUR', 'TOIS', 'IJCV', 'JASA', 'TPAMI', 'MLJ', 'TNNLS', 'JML', 'AI'],
                'Name': ['Journal of Machine Learning Research',
                         'IEEE Transactions on Knowledge and Data Engineering',
                         'IEEE Transactions on Pattern Analysis and Machine Intelligence',
                         'ACM Transactions on Intelligent Systems and Technology',
                         'Journal of the Association for Information Science and Technology',
                         'IEEE Data Science and Engineering',
                         'ACM Computing Surveys',
                         'ACM Transactions on Information Systems',
                         'International Journal of Computer Vision',
                         'Journal of the American Statistical Association',
                         'IEEE Transactions on Pattern Analysis and Machine Intelligence',
                         'Machine Learning Journal',
                         'IEEE Transactions on Neural Networks and Learning Systems',
                         'Journal of Machine Learning',
                         'Artificial Intelligence Journal']}

# Stack conferences and journals into one table, then shuffle the row order.
venue_frames = [pd.DataFrame(conference_data), pd.DataFrame(journal_data)]
venue_df = pd.concat(venue_frames).sample(frac=1).reset_index(drop=True)

# A conference type is drawn for every row, but only the conference rows keep
# it; journal rows end up without a ConferenceType value.
drawn_conference_types = venue_df.apply(randomConferenceType, axis=1)
is_conference = venue_df['Type'] == 'Conference'
venue_df.loc[is_conference, 'ConferenceType'] = drawn_conference_types

# Every venue gets a random research area and a random responsible person.
venue_df['Area'] = venue_df.apply(randomArea, axis=1)

responsible_columns = ['ResponsibleName', 'ResponsibleID']
venue_df[responsible_columns] = venue_df.apply(randomResponsible, axis=1, result_type='expand')

venue_df


Unnamed: 0,Type,ID,Name,ConferenceType,Area,ResponsibleName,ResponsibleID
0,Conference,ICML,International Conference on Machine Learning,Workshop,Databases,William Bülow,d0242381fb9c472f8397a9914c26f80a
1,Conference,ICLR,International Conference on Learning Representations,Symposium,NaturalLanguageProcessing,David G. Larman,10b31ee210d64c40b17d48fafec4aa5c
2,Journal,JML,Journal of Machine Learning,,MachineLearning,Hugh P. Shanahan,c931ed8f1b8b4dedb3c82b7c9a19083b
3,Journal,TNNLS,IEEE Transactions on Neural Networks and Learning Systems,,Databases,Marty J. Wolf,024bb4fa0c2d4f28b15353bdf9f1d6fc
4,Conference,EMNLP,Empirical Methods in Natural Language Processing,RegularConference,Databases,David G. Larman,10b31ee210d64c40b17d48fafec4aa5c
5,Conference,SIGGRAPH,Special Interest Group on Computer Graphics and Interactive Techniques Conference,RegularConference,Databases,Frank Manola,ad8060f4684542f9bc6d0b52a2ce8229
6,Conference,UbiComp,International Joint Conference on Pervasive and Ubiquitous Computing,ExpertGroups,Databases,Ralph J. Greenspan,264ab3cc58eb4c2f94567c1e719181de
7,Journal,JASA,Journal of the American Statistical Association,,ArtificialIntelligence,Raheleh Heidari Feidt,a00da097d1604fc2ad1c497e2ac4ee5a
8,Journal,JASIST,Journal of the Association for Information Science and Technology,,MachineLearning,Tankred Rautert,e24250e44b664f7e99aa3fb10944ad6c
9,Journal,IEEE-TPAMI,IEEE Transactions on Pattern Analysis and Machine Intelligence,,NaturalLanguageProcessing,Helmut Seidl,ab7216a18d054bfdbf06472b472bef11


In [12]:
# Persist the venue table; the volume, proceedings, paper and review cells
# reload it from this path.
# No index=False is passed, so the row index is written to the file as well.
venue_df.to_csv('../data/venue.csv')

## Volumes Dataset

In [13]:
# Sorted, de-duplicated IDs of the venues whose type is 'Journal'.
venue_table = pd.read_csv('../data/venue.csv')
is_journal = venue_table['Type'] == 'Journal'
journals = np.unique(venue_table.loc[is_journal, 'ID'].values)

# Parallel columns of the volume table, filled one journal at a time.
journal_ids, volume_names, years = [], [], []
for journal_name in journals:
    # Each journal gets ten volumes numbered 1..10. Volume k is dated
    # start_year + k, where start_year is drawn once per journal from
    # 1995..2005 (inclusive).
    start_year = random.randint(1995, 2005)
    volume_numbers = range(1, 11)
    journal_ids.extend([journal_name] * len(volume_numbers))
    volume_names.extend(journal_name + '-Volume-' + str(number) for number in volume_numbers)
    years.extend(start_year + number for number in volume_numbers)

# Assemble the volume table from the three parallel columns.
volume_df = pd.DataFrame({'JournalID': journal_ids, 'VolumeID': volume_names, 'volumeYear': years})

volume_df

Unnamed: 0,JournalID,VolumeID,volumeYear
0,ACM-CSUR,ACM-CSUR-Volume-1,2004
1,ACM-CSUR,ACM-CSUR-Volume-2,2005
2,ACM-CSUR,ACM-CSUR-Volume-3,2006
3,ACM-CSUR,ACM-CSUR-Volume-4,2007
4,ACM-CSUR,ACM-CSUR-Volume-5,2008
5,ACM-CSUR,ACM-CSUR-Volume-6,2009
6,ACM-CSUR,ACM-CSUR-Volume-7,2010
7,ACM-CSUR,ACM-CSUR-Volume-8,2011
8,ACM-CSUR,ACM-CSUR-Volume-9,2012
9,ACM-CSUR,ACM-CSUR-Volume-10,2013


In [14]:
# Persist the volume table; the Paper section reloads it from this path.
# No index=False is passed, so the row index is written to the file as well.
volume_df.to_csv('../data/volume.csv')

## Proceedings Dataset

In [15]:
# Sorted, de-duplicated IDs of the venues whose type is 'Conference'.
venue_table = pd.read_csv('../data/venue.csv')
is_conference = venue_table['Type'] == 'Conference'
proceedings = np.unique(venue_table.loc[is_conference, 'ID'].values)

# Parallel columns of the proceedings table, filled one conference at a time.
conference_ids, proceeding_names, years = [], [], []
for proceeding_name in proceedings:
    # Each conference gets ten proceedings numbered 1..10. Proceeding k is
    # dated start_year + k, where start_year is drawn once per conference
    # from 1995..2005 (inclusive).
    start_year = random.randint(1995, 2005)
    edition_numbers = range(1, 11)
    conference_ids.extend([proceeding_name] * len(edition_numbers))
    proceeding_names.extend(proceeding_name + '-Proceeding-' + str(number) for number in edition_numbers)
    years.extend(start_year + number for number in edition_numbers)

# Assemble the proceedings table from the three parallel columns.
proceeding_df = pd.DataFrame({'ConferenceID': conference_ids, 'ProceedingID': proceeding_names, 'proceedingsYear': years})

proceeding_df

Unnamed: 0,ConferenceID,ProceedingID,proceedingsYear
0,AAAI,AAAI-Proceeding-1,1999
1,AAAI,AAAI-Proceeding-2,2000
2,AAAI,AAAI-Proceeding-3,2001
3,AAAI,AAAI-Proceeding-4,2002
4,AAAI,AAAI-Proceeding-5,2003
5,AAAI,AAAI-Proceeding-6,2004
6,AAAI,AAAI-Proceeding-7,2005
7,AAAI,AAAI-Proceeding-8,2006
8,AAAI,AAAI-Proceeding-9,2007
9,AAAI,AAAI-Proceeding-10,2008


In [16]:
# Persist the proceedings table; the Paper section reloads it from this path.
# No index=False is passed, so the row index is written to the file as well.
proceeding_df.to_csv('../data/proceedings.csv')

## Paper Dataset

This dataset contains the papers submitted to venues, including those that were eventually published. A published paper must have been accepted by at least two reviewers; we will make sure the publications conform to this constraint later.

In [17]:
# Reload the people table and keep only the Name/ID of people whose role is
# 'Author'; `randomAuthor` below samples from this table.
author_df = pd.read_csv('../data/people.csv')
author_df = author_df.loc[author_df['Type'] == 'Author', ['Name', 'ID']]

def randomAuthor(_):
    """Pick one author uniformly at random from `author_df`.

    The argument is ignored; it exists so the function can be handed to
    `DataFrame.apply`, which passes each row.

    Returns:
        A `(name, id)` tuple for the chosen author.
    """
    picked_id = np.random.choice(author_df['ID'].values)

    # Fetch the row(s) carrying the picked ID and report the first match.
    picked_author = author_df.loc[author_df['ID'] == picked_id]
    author_name = picked_author['Name'].values[0]
    author_id = picked_author['ID'].values[0]
    return author_name, author_id

In [18]:
# Reload the venue table from the exported CSV; this rebinds `venue_df`,
# which `randomVenue` below reads at call time.
venue_df = pd.read_csv('../data/venue.csv')

def randomVenue(_):
    """Pick one venue uniformly at random (by ID) from `venue_df`.

    The argument is ignored; it exists so the function can be handed to
    `DataFrame.apply`, which passes each row.

    Returns:
        A tuple `(Name, ID, Type, ConferenceType, Area)` read from the first
        row of `venue_df` that carries the picked ID.
    """
    picked_id = np.random.choice(venue_df['ID'].values)
    picked_venue = venue_df.loc[venue_df['ID'] == picked_id]

    reported_columns = ('Name', 'ID', 'Type', 'ConferenceType', 'Area')
    return tuple(picked_venue[column].values[0] for column in reported_columns)

In [19]:
def randomPaperType(type):
    """Draw a random paper type for a paper submitted to a venue of kind `type`.

    Conference papers may be FullPaper, ShortPaper, DemoPaper or Poster;
    papers for any other venue kind are never Poster.
    """
    if type == 'Conference':
        paper_kinds = ['FullPaper', 'ShortPaper', 'DemoPaper', 'Poster']
        weights = [0.4, 0.4, 0.15, 0.05]
    else:
        paper_kinds = ['FullPaper', 'ShortPaper', 'DemoPaper']
        weights = [0.4, 0.4, 0.2]
    return np.random.choice(paper_kinds, p=weights)

In [20]:
# Reload the exported proceedings and volume tables; `randomPublishedIn`
# below reads both at call time.
proceedings_df = pd.read_csv('../data/proceedings.csv')
volume_df = pd.read_csv('../data/volume.csv')

def randomPublishedIn(row):
    """Choose where a paper row was published.

    Returns an empty string when the row's 'Published' flag is falsy.
    Otherwise returns a random ProceedingID of the row's venue when its
    'VenueType' is 'Conference', or a random VolumeID of the row's venue
    for any other venue type.
    """
    if not row['Published']:
        return ''

    if row['VenueType'] == 'Conference':
        issue_table, venue_column, issue_column = proceedings_df, 'ConferenceID', 'ProceedingID'
    else:
        issue_table, venue_column, issue_column = volume_df, 'JournalID', 'VolumeID'

    # Only issues belonging to the paper's own venue are candidates.
    candidates = issue_table.loc[issue_table[venue_column] == row['VenueID'], issue_column]
    return np.random.choice(candidates.values)

In [21]:
# One paper per distinct title in the reference sample, each identified by a
# freshly generated random hex UUID.
paper_titles = np.unique(df['title'].values)
paper_ids = [uuid.uuid4().hex for _title in paper_titles]
paper_df = pd.DataFrame({'ID': paper_ids, 'Title': paper_titles})

# Random author, then random venue, for every paper. Each helper returns a
# tuple that `result_type='expand'` spreads over the listed columns.
author_columns = ['Author', 'AuthorID']
paper_df[author_columns] = paper_df.apply(randomAuthor, axis=1, result_type='expand')

venue_columns = ['Venue', 'VenueID', 'VenueType', 'ConferenceType', 'Area']
paper_df[venue_columns] = paper_df.apply(randomVenue, axis=1, result_type='expand')

# Even-odds draw per paper for whether it was published.
paper_df['Published'] = np.random.choice([True, False], size=len(paper_df))

# Needs 'Published', 'VenueType' and 'VenueID', so it runs after them.
paper_df['PublishedIn'] = paper_df.apply(randomPublishedIn, axis=1)

paper_df['PaperType'] = paper_df['VenueType'].apply(randomPaperType)

In [22]:
# Display the generated paper table.
paper_df

Unnamed: 0,ID,Title,Author,AuthorID,Venue,VenueID,VenueType,ConferenceType,Area,Published,PublishedIn,PaperType
0,4b817798ae93460b927ad91a2f268027,"""'Once more unto the Breach': Professional Responsibility and Computer Ethics"" - A Response to: ""A Critique of Positive Responsibility in Computing"".",Douglas Walton,2f8bc8d0da6543bebad91a01bf4b9603,ACM Symposium on User Interface Software and Technology,UIST,Conference,RegularConference,ArtificialIntelligence,False,,FullPaper
1,8e5572c3954b4af4a007efc6f1f49de6,"""An Ethics of Commitment for Engineers"".",Petra Steffens,ca68767de329494380b3e340f6d12060,Journal of the Association for Information Science and Technology,JASIST,Journal,,MachineLearning,False,,FullPaper
2,520adc97848d463d9a62e3887c9e36de,"""Broader Impacts"" or ""Responsible Research and Innovation""? A Comparison of Two Criteria for Funding Research in Science and Engineering.",Lothar Breuer,8d3758f8613c47b6911c755601c10ca0,Journal of the American Statistical Association,JASA,Journal,,ArtificialIntelligence,False,,DemoPaper
3,c32ef10d6ee240518de615e75c8f74a0,"""Der Film öded das Publikum"" - Zum Aufbau eines maschinenlesbaren Wörterbuchs für deutsche Verben",Knut Jørgen Vie,3a44c271c0a44bc2b2acfa8fd83df557,International Journal of Computer Vision,IJCV,Journal,,NaturalLanguageProcessing,True,IJCV-Volume-10,DemoPaper
4,38a7cfbf99c440b6869f063af9f01a54,"""Dictionary Dialog"" - Entwurf des Funktionsumfangs für eine Benutzerschnittstelle eines integrierten maschinellen/maschinenunterstützten Übersetzungssystems und prototypische Erstellung der Bildschirmfolge für die Funktion ""Semantische Relation""",Don Gotterbarn,7af510c5c64e4221ba3f69376d18ebee,Association for Computational Linguistics,ACL,Conference,Workshop,NaturalLanguageProcessing,True,ACL-Proceeding-1,FullPaper
...,...,...,...,...,...,...,...,...,...,...,...,...
981,5a063cc97dfd4c029cfd975bf4ec4b46,tele-TASK - Teleteaching Anywhere Solution Kit,Klaus Lux,1e9439ef6df9446c969098b83535ebe5,IEEE Transactions on Knowledge and Data Engineering,IEEE-TKDE,Journal,,NaturalLanguageProcessing,True,IEEE-TKDE-Volume-6,FullPaper
982,2efe0993224d4ec9a06de661c6c04036,tele-TASK - Teleteaching praxistauglich für den Universitätsalltag,Mehmet Aközer,eff3bfc5fb024deba7e130d740b87449,International Joint Conference on Pervasive and Ubiquitous Computing,UbiComp,Conference,ExpertGroups,Databases,True,UbiComp-Proceeding-5,ShortPaper
983,3c7ced5c3da0407fbcd670f0d5609570,Über Ansätze zur Darstellung von Konzepten und Prototypen,Burkhard Kehrbusch,428622834e69409da0d116a5dec4a71a,Conference on Human Factors in Computing Systems,CHI,Conference,RegularConference,Databases,True,CHI-Proceeding-5,FullPaper
984,c17c6986665f4d13be9957388de21bbd,Über Vergleichskonstruktionen,Dorthie Cross,89725ca6d2234ef2a5c28566a3c031dc,Conference on Neural Information Processing Systems,NeurIPS,Conference,ExpertGroups,NaturalLanguageProcessing,True,NeurIPS-Proceeding-8,ShortPaper


In [23]:
# Persist the paper table; the Reviews section reloads it from this path.
# No index=False is passed, so the row index is written to the file as well.
paper_df.to_csv('../data/paper.csv')

## Reviews Dataset

In [24]:
# Reload the exported venue and paper tables; `getResponsibleId` and
# `getDecision` below read them at call time.
venue_df = pd.read_csv('../data/venue.csv')
paper_df = pd.read_csv('../data/paper.csv')

def getResponsibleId(paper):
    """Return the ResponsibleID of the venue a paper was submitted to.

    The venue is resolved through the paper's 'VenueID' and the venue's 'ID',
    not through the display name: venue names are not unique (the journals
    'IEEE-TPAMI' and 'TPAMI' share one name), so a name-based lookup can
    return the responsible of a different venue.

    Args:
        paper: value of the paper's 'ID' column in `paper_df`.

    Returns:
        The 'ResponsibleID' of the matching row in `venue_df`.

    Raises:
        IndexError: if the paper ID is not in `paper_df` or its venue ID is
            not in `venue_df`.
    """
    venue_id = paper_df.loc[paper_df['ID'] == paper, 'VenueID'].values[0]
    responsible = venue_df.loc[venue_df['ID'] == venue_id, 'ResponsibleID']

    return responsible.values[0]

def getDecision(paper):
    """Return the 'Published' flag of the paper whose 'ID' equals `paper`.

    The flag is read from the first matching row of `paper_df`.
    """
    is_requested_paper = paper_df['ID'] == paper
    return paper_df.loc[is_requested_paper, 'Published'].values[0]


# Canned review sentences used for papers with a positive decision.
_positive_reviews = [
    "This paper is well-written and provides valuable insights.",
    "The experiments are well-designed and the results are statistically significant.",
    "The authors have addressed all of the reviewers' comments and improved the paper significantly.",
    "The paper presents a novel contribution to the field and is likely to have a high impact.",
    "The writing is clear and concise, making the paper easy to understand.",
]

# Canned review sentences used for papers with a negative decision.
_negative_reviews = [
    "The paper is poorly written and difficult to follow.",
    "The experiments are poorly designed and the results are not statistically significant.",
    "The authors have not adequately addressed the reviewers' comments, and the paper has not improved significantly.",
    "The paper does not make a significant contribution to the field and is not likely to have a high impact.",
    "The writing is unclear and difficult to understand, making the paper hard to follow.",
]

review_texts = {"Good": _positive_reviews, "Bad": _negative_reviews}


def getReviewText(decision):
    """Return a random canned review sentence matching `decision`.

    A truthy decision draws from the "Good" sentences, a falsy one from the
    "Bad" sentences.
    """
    sentence_pool = review_texts['Good'] if decision else review_texts['Bad']
    return np.random.choice(sentence_pool)


In [25]:
# Despite its name, `paperTitles` holds the sorted, distinct paper IDs.
papers_table = pd.read_csv('../data/paper.csv')
paperTitles = np.unique(papers_table['ID'].values)

# Sorted, distinct IDs of the people whose role is 'Reviewer'.
people_table = pd.read_csv('../data/people.csv')
reviewers = np.unique(people_table.loc[people_table['Type'] == 'Reviewer', 'ID'].values)

# Parallel columns of the review table: one entry per (paper, reviewer) pair.
data = {'ReviewID': [], 'PaperID': [], 'ReviewerID': []}

for paper_id in paperTitles:
    # Between 2 and 5 reviewers per paper (the upper bound of randint is
    # exclusive), sampled without replacement so none reviews a paper twice.
    panel_size = np.random.randint(2, 6)
    panel = np.random.choice(reviewers, size=panel_size, replace=False)

    data['ReviewID'].extend(uuid.uuid4().hex for _reviewer in panel)
    data['PaperID'].extend([paper_id] * len(panel))
    data['ReviewerID'].extend(panel)

review_df = pd.DataFrame(data)

# Responsible person of the venue each reviewed paper was submitted to.
review_df['ResponsibleID'] = review_df['PaperID'].apply(getResponsibleId)

# Every review of a paper carries that paper's 'Published' flag as its
# decision, and a review sentence matching that decision.
review_df['Decision'] = review_df['PaperID'].apply(getDecision)
review_df['ReviewText'] = review_df['Decision'].apply(getReviewText)

In [26]:
# Display the generated review table.
review_df

Unnamed: 0,ReviewID,PaperID,ReviewerID,ResponsibleID,Decision,ReviewText
0,21255096ed5f4411a0bad97bc7603b89,0022f58487cd4d048a60e3f0835c4b37,b3fa6cb31ee2433ba1ec61ae51a25ca4,9b49622bcb6a4419ae11297e92ac4034,True,The authors have addressed all of the reviewers' comments and improved the paper significantly.
1,b4bb533329484335bb05cfd01375000b,0022f58487cd4d048a60e3f0835c4b37,d605e52b1e9b4d64a4b89f928c86e35a,9b49622bcb6a4419ae11297e92ac4034,True,"The writing is clear and concise, making the paper easy to understand."
2,3025b88bfc0444c486a657d4e43fced9,0022f58487cd4d048a60e3f0835c4b37,7463d194788843ab8e3a506dccfa142f,9b49622bcb6a4419ae11297e92ac4034,True,The experiments are well-designed and the results are statistically significant.
3,938e13fb85c647529b7051e5acde0c94,0022f58487cd4d048a60e3f0835c4b37,3112b9f941af49f2accd12414e380bf9,9b49622bcb6a4419ae11297e92ac4034,True,The paper presents a novel contribution to the field and is likely to have a high impact.
4,4e07b0859b794ae89f8008abdaee0b58,0022f58487cd4d048a60e3f0835c4b37,2dfc5141adb6400192c2049b87311ca4,9b49622bcb6a4419ae11297e92ac4034,True,"The writing is clear and concise, making the paper easy to understand."
...,...,...,...,...,...,...
3467,fff519e50b98497d8e5116120b5c0e1a,fe878e9b728b4e79951a0ee72a59596a,0fb5aefda6884cefaed1364e81478c6f,f130f159e7f340d18e5f8f8d3f3fa47f,True,This paper is well-written and provides valuable insights.
3468,99bf75483a4d4224b8c11eb4db38ed85,fe878e9b728b4e79951a0ee72a59596a,5ebb05e58f204620a5c6d9969709d0b8,f130f159e7f340d18e5f8f8d3f3fa47f,True,"The writing is clear and concise, making the paper easy to understand."
3469,87f14fcd49214eafaaea674e77bd71f3,febc52e6ce44403eb453d938727bf289,f673c6fdf08e45f3b3862fc11c28bc57,f03d1c7f6c36471da0ac4055584b67fa,True,The paper presents a novel contribution to the field and is likely to have a high impact.
3470,195b50ac9d76410c9c1bedf1e4d96ebc,febc52e6ce44403eb453d938727bf289,32c7e38a3ae54d609e59d05036788258,f03d1c7f6c36471da0ac4055584b67fa,True,The authors have addressed all of the reviewers' comments and improved the paper significantly.


In [27]:
# Persist the review table.
# No index=False is passed, so the row index is written to the file as well.
review_df.to_csv('../data/review.csv')