In [79]:
import pandas as pd
import numpy as np
import random
from lorem_text import lorem

In [80]:
# Read the two semicolon-separated input tables in one step:
# journal articles first, then conference (proceedings) papers.
articles, proceeding = (
    pd.read_csv('article.csv', sep=';'),
    pd.read_csv('proceeding_papers.csv', sep=';'),
)

In [81]:
def generate_reviews(x):
    """Produce one placeholder review text per entry of ``x``.

    ``x`` is a '|'-separated string. For every entry a fresh
    ``lorem.words(10)`` text is generated, and the texts are returned
    joined by '|' in the same order.
    """
    return '|'.join(lorem.words(10) for _ in x.split('|'))

def generate_decisions(x):
    """Return the word 'Accepted' once per '|'-separated entry of ``x``.

    The individual decisions are joined by '|', so the result has exactly
    as many fields as ``x`` does.
    """
    entry_count = len(x.split('|'))
    return '|'.join(['Accepted'] * entry_count)

def remove_space(x):
    """Return ``x`` with every space character removed.

    A value whose type is exactly ``float`` is replaced by the placeholder
    'TheRiddler' (presumably this catches NaN from an empty cell; note that
    any other float is replaced as well).
    """
    if type(x) is not float:
        return x.replace(' ', '')
    return 'TheRiddler'

def generate_decisionIDs(art, rev):
    """Build one decision identifier per '|'-separated entry of ``rev``.

    Identifiers have the form ``review<art>_<position>`` with positions
    counted from 0, and are returned joined by '|'.
    """
    entry_count = len(rev.split('|'))
    return '|'.join(f"review{art}_{position}" for position in range(entry_count))

In [82]:
# Switch the raw column names to the names used in the rest of this cell.
proceeding = proceeding.rename(
    columns={
        'booktitle': 'conference',
        'inproctitle': 'articleTitle',
        'proctitle': 'conferenceTitle',
        'keywords': 'topic',
        'inprocee': 'doi',
        'proceedings': 'conferenceID',
        'inproceedings': 'articleID',
    }
)

# Overwrite conferenceID with a 1-based group number: rows that share the same
# (conference, procmdate, conferenceTitle) combination receive the same ID.
proceeding['conferenceID'] = (
    proceeding.groupby(['conference', 'procmdate', 'conferenceTitle']).ngroup() + 1
)

# Distinct values needed below to generate chairs and conference types.
confID = proceeding['conferenceID'].unique()
authors = proceeding['author'].unique()

# Collect every distinct author; the 'author' column stores '|'-separated names.
a = set()
for aut in authors:
    # A missing author cell shows up as a float (NaN) and has nothing to split.
    if isinstance(aut, float):
        continue
    a.update(aut.split('|'))

# Break the author names into their space-separated tokens. These tokens are
# the vocabulary from which synthetic chair names are assembled.
names = set()
for author in a:
    names.update(author.split(' '))

# random.sample() no longer accepts a set (deprecated in Python 3.9, TypeError
# from 3.11), so draw from a list instead. Sorting fixes the order, which makes
# the draws reproducible whenever the random module is seeded.
name_pool = sorted(names)

# For each conference ID pick a two-token chair name and a conference type.
confNamePair = {}
confTypePair = {}
confType = ['workshop', 'symposium', 'expert_group', 'regular']
for ID in confID:
    confNamePair[ID] = f'{random.choice(name_pool)} {random.choice(name_pool)}'
    confTypePair[ID] = random.choice(confType)
    
def get_author(x):
    """Return the chair name stored in ``confNamePair`` for conference ID ``x``.

    Raises ``KeyError`` when ``x`` has no entry in ``confNamePair``.

    NOTE: a later cell of this notebook defines ``get_author`` again (as a
    lookup in ``jourNamePair``); that definition replaces this one once run.
    """
    return confNamePair[x]

def get_type(x):
    """Return the conference type stored in ``confTypePair`` for conference ID ``x``.

    Raises ``KeyError`` when ``x`` has no entry in ``confTypePair``.
    """
    return confTypePair[x]

# Copy the per-conference chair and type onto every paper row.
for column, lookup in (('chair', get_author), ('conferenceType', get_type)):
    proceeding[column] = proceeding['conferenceID'].apply(lookup)

# Draw one paper type at random for each distinct article ID.
pro = set(proceeding['articleID'].unique())
confPaperTypes = ['poster', 'full_paper', 'demo_paper', 'short_paper']
confPaperTypePair = {
    artID: random.sample(confPaperTypes, k=1)[0]
    for artID in pro
}
    
def get_paper_type(x):
    """Return the paper type stored in ``confPaperTypePair`` for article ID ``x``.

    Raises ``KeyError`` when ``x`` has no entry in ``confPaperTypePair``.

    NOTE: a later cell of this notebook defines ``get_paper_type`` again (as a
    lookup in ``jourPaperTypePair``); that definition replaces this one once run.
    """
    return confPaperTypePair[x]

# Attach the randomly drawn paper type to each row.
proceeding['articleType'] = proceeding['articleID'].apply(get_paper_type)

# Synthesise one review text and one decision per entry of 'reviewed_by'.
for column, builder in (('reviews', generate_reviews), ('decisions', generate_decisions)):
    proceeding[column] = proceeding['reviewed_by'].apply(builder)

# Decision IDs combine the article ID with the position of each reviewer.
proceeding['decisionID'] = proceeding.apply(
    lambda row: generate_decisionIDs(row.articleID, row.reviewed_by),
    axis=1,
)

# Remove spaces from the name-like columns. This runs after the columns above
# were derived, because those are built from the unmodified 'reviewed_by'.
for column in ('author', 'conference', 'reviewed_by', 'chair'):
    proceeding[column] = proceeding[column].apply(remove_space)


# Remove the columns that are not wanted in the processed output.
# The axis is selected with the `columns=` keyword: passing it positionally
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and is rejected by pandas 2.0+.
# 'author-orcid' used to be listed twice; listing it once drops the same column.
proceeding = proceeding.drop(columns=[
    'volume',
    'url',
    'author-orcid',
    'ee-type',
    'pages',
    'inprocmdate',
    'abstract',
    'co_authors',
    'citations',
    'isbn',
    'series',
    'publisher',
    'doi',
    'crossref',
    'key',
    'procee',
    'corresponding',
    'procmdate',
    'year',
])

# Write the full table plus a five-row sample. Both files keep the DataFrame
# index as their first (unnamed) column, exactly as before.
proceeding.to_csv('proceeding_processed.csv')
proceeding[:5].to_csv('proceeding_slice.csv')

In [83]:
# Rename the 'keywords' column to 'topics'.
articles = articles.rename(columns={'keywords': 'topics'})

# Number each distinct (journal, volume) combination, starting at 1.
articles['journalID'] = articles.groupby(['journal', 'volume']).ngroup() + 1

# Distinct values needed below to generate editors.
jourID = articles['journalID'].unique()
authors = articles['author'].unique()

# Collect every distinct author; the 'author' column stores '|'-separated names.
a = set()
for aut in authors:
    # A missing author cell shows up as a float (NaN) and has nothing to split.
    if isinstance(aut, float):
        continue
    a.update(aut.split('|'))

# Break the author names into their space-separated tokens. These tokens are
# the vocabulary from which synthetic editor names are assembled.
names = set()
for author in a:
    names.update(author.split(' '))

# random.sample() no longer accepts a set (deprecated in Python 3.9, TypeError
# from 3.11), so draw from a list instead. Sorting fixes the order, which makes
# the draws reproducible whenever the random module is seeded.
name_pool = sorted(names)

# For each journal ID pick a two-token editor name.
jourNamePair = {}
for ID in jourID:
    jourNamePair[ID] = f'{random.choice(name_pool)} {random.choice(name_pool)}'

def get_author(x):
    """Return the editor name stored in ``jourNamePair`` for journal ID ``x``.

    Raises ``KeyError`` when ``x`` has no entry in ``jourNamePair``.

    NOTE: this rebinds ``get_author``, which an earlier cell defines as a
    lookup in ``confNamePair``; after this cell runs the earlier version is gone.
    """
    return jourNamePair[x]

# Copy the per-journal editor name onto every article row.
articles['editor'] = articles['journalID'].apply(get_author)

# Draw one paper type at random for each distinct article identifier.
art = set(articles['article'].unique())
jourPaperTypes = ['full_paper', 'demo_paper', 'short_paper']
jourPaperTypePair = {
    artID: random.sample(jourPaperTypes, k=1)[0]
    for artID in art
}

def get_paper_type(x):
    """Return the paper type stored in ``jourPaperTypePair`` for article ``x``.

    Raises ``KeyError`` when ``x`` has no entry in ``jourPaperTypePair``.

    NOTE: this rebinds ``get_paper_type``, which an earlier cell defines as a
    lookup in ``confPaperTypePair``; after this cell runs the earlier version is gone.
    """
    return jourPaperTypePair[x]    

# Attach the randomly drawn paper type to each row.
articles['paperType'] = articles['article'].apply(get_paper_type)

# Synthesise one review text and one decision per entry of 'reviewed_by'.
for column, builder in (('reviews', generate_reviews), ('decisions', generate_decisions)):
    articles[column] = articles['reviewed_by'].apply(builder)

# Decision IDs combine the article identifier with the position of each reviewer.
articles['decisionID'] = articles.apply(
    lambda row: generate_decisionIDs(row.article, row.reviewed_by),
    axis=1,
)

# Remove spaces from the name-like columns. This runs after the columns above
# were derived, because those are built from the unmodified 'reviewed_by'.
for column in ('author', 'journal', 'reviewed_by', 'editor'):
    articles[column] = articles[column].apply(remove_space)


# Remove the columns that are not wanted in the processed output.
# The axis is selected with the `columns=` keyword: passing it positionally
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and is rejected by pandas 2.0+.
# 'key' used to be listed twice; listing it once drops the same column.
articles = articles.drop(columns=[
    'key',
    'author-orcid',
    'ee',
    'ee-type',
    'pages',
    'publtype',
    'abstract',
    'citations',
    'url',
    'corresponding',
    'co_authors',
    'correspondingID',
    'mdate',
    'year',
])

# Write the full table plus a five-row sample. Both files keep the DataFrame
# index as their first (unnamed) column, exactly as before.
articles.to_csv('articles_processed.csv')
articles[:5].to_csv('article_slice.csv')

In [95]:
# NOTE(review): in the cells shown above, `pro` is bound to a set of article IDs,
# yet the output rendered below is a table of conference papers. Presumably `pro`
# was rebound in cells that are not part of this file (the execution counter
# jumps from 83 to 95) - confirm before relying on a fresh "Restart & Run All".
pro

Unnamed: 0,articleID,author,conference,articleTitle,year,reviewed_by,conferenceID,conferenceTitle,location,topic,chair,conferenceType,articleType,reviews,decisions,decisionID
0,7043780,JoxeGaintzarain|MarisaNavarro|MontserratHermo,MFCS,Goals in the Propositional Horn Language Are M...,2005,HarishChander|HuiZhang0001|MinLiu0008,1283,Mathematical Foundations of Computer Science 1...,Oosterbeek,pariatur|ea|data modeling|data querying,ClimerNinghui,workshop,demo_paper,cum omnis aliquid ad quibusdam dolorum assumen...,Accepted|Accepted|Accepted,review7043780_0|review7043780_1|review7043780_2
4,7042963,DaniëlPaulusma|GiacomoPaesani|KonradK.Dabrowsk...,MFCS,"On the Price of Independence for Vertex Cover,...",2018,KiaDashtipour|AlexanderV.Terekhov|N.Kelly-Boxall,1283,Mathematical Foundations of Computer Science 1...,Oosterbeek,praesentium|at|data management|data storage,ClimerNinghui,workshop,full_paper,nesciunt nam id repellendus ab provident maxim...,Accepted|Accepted|Accepted,review7042963_0|review7042963_1|review7042963_2
8,7044560,WitoldLipskiJr.,MFCS,Combinatorial Aspects of Information Storage a...,1974,MarkGrechanik|JohnR.Hayes|UlrichLang0002|ShuYang,1283,Mathematical Foundations of Computer Science 1...,Oosterbeek,exercitationem|fuga|data storage|data modeling,ClimerNinghui,workshop,poster,ab facilis porro quisquam beatae tempore provi...,Accepted|Accepted|Accepted|Accepted,review7044560_0|review7044560_1|review7044560_...
12,7045072,DanielSilvaGraça|EmmanuelHainry|OlivierBournez,MFCS,Robust Computations with Dynamical Systems.,2010,YanyuMu|LaurentSchmalen|HuangpingJin|Christian...,1283,Mathematical Foundations of Computer Science 1...,Oosterbeek,suscipit|perferendis|big data|data querying,ClimerNinghui,workshop,demo_paper,est voluptatum laudantium enim quibusdam earum...,Accepted|Accepted|Accepted|Accepted,review7045072_0|review7045072_1|review7045072_...
16,7043691,AndréGronemeier,MFCS,NOF-Multiparty Information Complexity Bounds f...,2006,XiaopingShi|RafaelDeLeon|JingDeng0001,1283,Mathematical Foundations of Computer Science 1...,Oosterbeek,tempore|doloremque|big data|data storage,ClimerNinghui,workshop,poster,nesciunt quo ad doloremque quidem praesentium ...,Accepted|Accepted|Accepted,review7043691_0|review7043691_1|review7043691_2
20,7045354,SlawomirLasota0001|WojciechRytter,MFCS,Faster Algorithm for Bisimulation Equivalence ...,2006,GeraldP.Duggan,1283,Mathematical Foundations of Computer Science 1...,Oosterbeek,at|qui|indexing|data storage,ClimerNinghui,workshop,poster,officiis quam corrupti repudiandae quod error ...,Accepted,review7045354_0
24,7045617,HubieChen|MartinPál,MFCS,"Optimization, Games, and Quantified Constraint...",2004,MartynaGatkowska|MiaoJiang0002,1283,Mathematical Foundations of Computer Science 1...,Oosterbeek,nulla|doloremque|data management|data processing,ClimerNinghui,workshop,full_paper,corporis quaerat quis dolores cum tempore dolo...,Accepted|Accepted,review7045617_0|review7045617_1
28,7044979,AmosFiat,MFCS,Some Recent Results on Data Mining and Search.,2001,Chung-ShiTseng|ZhaoZhang0001|AntonioTammaro,1283,Mathematical Foundations of Computer Science 1...,Oosterbeek,id|facilis|data processing|data querying,ClimerNinghui,workshop,poster,minus pariatur dolorum facilis nobis ad neque ...,Accepted|Accepted|Accepted,review7044979_0|review7044979_1|review7044979_2
32,7045768,DordeZikelic|GuyAvni|ThomasA.Henzinger,MFCS,Bidding Mechanisms in Graph Games.,2019,JuanJoséCabezas,1283,Mathematical Foundations of Computer Science 1...,Oosterbeek,praesentium|vero|indexing|data querying,ClimerNinghui,workshop,demo_paper,amet ab optio quod distinctio accusamus error ...,Accepted,review7045768_0
36,7043263,GerhardBarth,MFCS,Mastering Contextsensitivity in Programming La...,1978,GonzaloNavarro,1283,Mathematical Foundations of Computer Science 1...,Oosterbeek,quasi|similique|data storage|data management,ClimerNinghui,workshop,full_paper,fugit aspernatur pariatur fugiat eveniet venia...,Accepted,review7043263_0
