In [50]:
import pandas as pd
import numpy as np
import random
from lorem_text import lorem
import re

In [51]:
# Load the two raw inputs used by the rest of the notebook.
# Both files are read with ';' as the field separator.
articles = pd.read_csv('article.csv', sep=';')
proceeding = pd.read_csv('proceeding_papers.csv', sep=';')

In [58]:
def generate_reviews(x):
    """Build one placeholder review text per reviewer.

    Parameters
    ----------
    x : str
        Reviewer names separated by '|'.

    Returns
    -------
    str
        One ``lorem.words(10)`` text per '|'-separated entry of ``x``,
        joined with '|' in the same order.
    """
    reviewer_count = len(x.split('|'))
    # One independent lorem call per reviewer, so each review differs.
    return '|'.join(lorem.words(10) for _ in range(reviewer_count))

def generate_decisions(x):
    """Build one 'Accepted' decision per reviewer.

    Parameters
    ----------
    x : str
        Reviewer names separated by '|'.

    Returns
    -------
    str
        The literal 'Accepted' repeated once per '|'-separated entry of
        ``x``, joined with '|'.
    """
    reviewer_count = len(x.split('|'))
    return '|'.join(['Accepted'] * reviewer_count)

def generate_decisionIDs(art, rev):
    """Build one decision identifier per reviewer of an article.

    Parameters
    ----------
    art : object
        Article identifier; it is interpolated into each id as-is.
    rev : str
        Reviewer names separated by '|'.

    Returns
    -------
    str
        Identifiers of the form ``review<art>_<position>`` (position is the
        0-based index of the reviewer), joined with '|'.
    """
    positions = range(len(rev.split('|')))
    return '|'.join(f"review{art}_{position}" for position in positions)

def remove_non_alphabetic_characters(x):
    """Strip every character that is not an ASCII letter or '|'.

    The '|' separator is kept so multi-valued cells stay splittable.
    Non-ASCII letters (e.g. accented characters) are removed as well.
    """
    disallowed = r'[^a-zA-Z|]'
    cleaned = re.sub(disallowed, '', x)
    return cleaned

def replace_capitals(x):
    """Return ``x`` with all upper-case characters converted to lower case."""
    lowered = x.lower()
    return lowered

def remove_space(x):
    """Return ``x`` with every space character (' ') removed.

    Only the plain space is affected; tabs and newlines are left in place.
    """
    pieces = x.split(' ')
    return ''.join(pieces)

def replace_null_authors(x):
    """Substitute a placeholder author for a missing value.

    Parameters
    ----------
    x : object
        A cell of the author column.

    Returns
    -------
    object
        "TheRiddler" when ``x`` is ``None`` or any float (including float
        subclasses such as ``numpy.float64``); otherwise ``x`` unchanged.
    """
    # isinstance (rather than an exact type comparison) also catches float
    # subclasses such as numpy.float64, and None is treated as missing too,
    # so neither reaches the string-only cleaning helpers downstream.
    if x is None or isinstance(x, float):
        return "TheRiddler"
    return x
    
def replace_null_topics(x):
    """Substitute a placeholder topic for a missing value.

    Parameters
    ----------
    x : object
        A cell of the topic/keywords column.

    Returns
    -------
    object
        "Nothing" when ``x`` is ``None`` or any float (including float
        subclasses such as ``numpy.float64``); otherwise ``x`` unchanged.
    """
    # isinstance (rather than an exact type comparison) also catches float
    # subclasses such as numpy.float64, and None is treated as missing too,
    # so neither reaches the string-only cleaning helpers downstream.
    if x is None or isinstance(x, float):
        return "Nothing"
    return x

def clean_instances(x):
    """Normalise a cell value by chaining the two cleaning helpers.

    First ``remove_non_alphabetic_characters`` is applied, then
    ``remove_space`` is applied to its result.
    """
    letters_only = remove_non_alphabetic_characters(x)
    return remove_space(letters_only)

In [53]:
# Conference proceedings: rename the raw columns, synthesise the attributes
# the raw data lacks (chair, conference type, paper type, reviews), clean
# the text columns and export the result.
proceeding = proceeding.rename(columns = {'booktitle': 'conference',
                                          'inproctitle': 'articleTitle', 
                                          'proctitle': 'conferenceTitle',
                                          'keywords': 'topic',
                                          'inprocee': 'doi',
                                          'proceedings': 'conferenceID',
                                          'inproceedings': 'articleID'
                                         })

# Overwrite conferenceID with a 1-based group number: one id per distinct
# (conference, procmdate, conferenceTitle) combination.
proceeding['conferenceID'] = proceeding.groupby(by=['conference', 'procmdate', 'conferenceTitle']).ngroup().add(1)
confID = proceeding['conferenceID'].unique()
authors = proceeding['author'].unique()

# Distinct individual authors. Float entries are skipped (presumably NaN
# for rows without an author).
a = set([])
for aut in authors:
    if type(aut) == float:
        continue
    a.update(aut.split('|'))

# Distinct name tokens (first names, surnames, ...) across all authors.
names = set([])
for author in a:
    names.update(author.split(' '))

# random.sample() on a set is deprecated since Python 3.9 and raises
# TypeError from 3.11, so draw from a list instead. Sorting gives a stable
# order, which makes the draws repeatable if `random` is seeded beforehand.
name_pool = sorted(names)

# Assign every conference a random two-token chair name and a random type.
confNamePair = {}
confTypePair = {}
confType = ['workshop', 'symposium', 'expert_group', 'regular']
for ID in confID:
    confNamePair[ID] = f'{random.choice(name_pool)} {random.choice(name_pool)}'
    confTypePair[ID] = random.choice(confType)


def get_author(x):
    """Return the generated chair name for conference id ``x``."""
    return confNamePair[x]


def get_type(x):
    """Return the generated conference type for conference id ``x``."""
    return confTypePair[x]


proceeding['chair'] = proceeding['conferenceID'].apply(get_author)
proceeding['conferenceType'] = proceeding['conferenceID'].apply(get_type)

# Assign every distinct article a random paper type, so that repeated rows
# of the same article agree.
pro = set(proceeding['articleID'].unique())
confPaperTypePair = {}
confPaperTypes = ['poster', 'full_paper', 'demo_paper', 'short_paper']
for artID in pro:
    confPaperTypePair[artID] = random.choice(confPaperTypes)


def get_paper_type(x):
    """Return the generated paper type for article id ``x``."""
    return confPaperTypePair[x]


proceeding['articleType'] = proceeding['articleID'].apply(get_paper_type)

# Synthetic review data: one review text, decision and decision id per
# reviewer listed in reviewed_by.
proceeding['reviews'] = proceeding['reviewed_by'].apply(generate_reviews)
proceeding['decisions'] = proceeding['reviewed_by'].apply(generate_decisions)
proceeding['decisionID'] = proceeding.apply(lambda x: generate_decisionIDs(x.articleID, x.reviewed_by), axis=1)

# Replace missing values before cleaning: clean_instances only accepts
# strings. Topics get the same treatment as in the articles cell.
proceeding['author'] = proceeding['author'].apply(replace_null_authors)
proceeding['topic'] = proceeding['topic'].apply(replace_null_topics)

proceeding['author'] = proceeding['author'].apply(clean_instances)
proceeding['conference'] = proceeding['conference'].apply(clean_instances)
proceeding['reviewed_by'] = proceeding['reviewed_by'].apply(clean_instances)
proceeding['chair'] = proceeding['chair'].apply(clean_instances)
proceeding['topic'] = proceeding['topic'].apply(clean_instances)

proceeding['topic'] = proceeding['topic'].apply(replace_capitals)

# Drop the columns that are not exported. `columns=` replaces the positional
# axis argument, which pandas deprecated in 1.3 and rejects in 2.0.
proceeding = proceeding.drop(columns=['volume',
                                      'url',
                                      'author-orcid',
                                      'ee-type',
                                      'pages',
                                      'inprocmdate',
                                      'abstract',
                                      'co_authors',
                                      'citations',
                                      'isbn',
                                      'series',
                                      'publisher',
                                      'doi',
                                      'crossref',
                                      'key',
                                      'procee',
                                      'corresponding',
                                      'procmdate',
                                      'year'])

# Full export plus a five-row sample for quick inspection.
proceeding.to_csv('proceeding_processed.csv')
proceeding[:5].to_csv('proceeding_slice.csv')

In [59]:
# Journal articles: rename the raw columns, synthesise the attributes the
# raw data lacks (editor, paper type, reviews), clean the text columns and
# export the result.
articles = articles.rename(columns = {'keywords': 'topics'})

# 1-based group number: one id per distinct (journal, volume) combination.
articles['journalID'] = articles.groupby(by=['journal', 'volume']).ngroup().add(1)
jourID = articles['journalID'].unique()
authors = articles['author'].unique()

# Distinct individual authors. Float entries are skipped (presumably NaN
# for rows without an author).
a = set([])
for aut in authors:
    if type(aut) == float:
        continue
    a.update(aut.split('|'))

# Distinct name tokens (first names, surnames, ...) across all authors.
names = set([])
for author in a:
    names.update(author.split(' '))

# random.sample() on a set is deprecated since Python 3.9 and raises
# TypeError from 3.11, so draw from a list instead. Sorting gives a stable
# order, which makes the draws repeatable if `random` is seeded beforehand.
name_pool = sorted(names)

# Assign every journal volume a random two-token editor name.
jourNamePair = {}
for ID in jourID:
    jourNamePair[ID] = f'{random.choice(name_pool)} {random.choice(name_pool)}'


# NOTE(review): this rebinds the module-level name get_author, which the
# proceedings cell also defines; the name is kept for compatibility.
def get_author(x):
    """Return the generated editor name for journal id ``x``."""
    return jourNamePair[x]


articles['editor'] = articles['journalID'].apply(get_author)

# Assign every distinct article a random paper type, so that repeated rows
# of the same article agree.
art = set(articles['article'].unique())

jourPaperTypePair = {}

jourPaperTypes = ['full_paper', 'demo_paper', 'short_paper']
for artID in art:
    jourPaperTypePair[artID] = random.choice(jourPaperTypes)


# NOTE(review): this rebinds the module-level name get_paper_type, which the
# proceedings cell also defines; the name is kept for compatibility.
def get_paper_type(x):
    """Return the generated paper type for article id ``x``."""
    return jourPaperTypePair[x]


articles['paperType'] = articles['article'].apply(get_paper_type)

# Synthetic review data: one review text, decision and decision id per
# reviewer listed in reviewed_by.
articles['reviews'] = articles['reviewed_by'].apply(generate_reviews)
articles['decisions'] = articles['reviewed_by'].apply(generate_decisions)
articles['decisionID'] = articles.apply(lambda x: generate_decisionIDs(x.article, x.reviewed_by), axis=1)

# Replace missing values before cleaning: clean_instances only accepts
# strings.
articles['author'] = articles['author'].apply(replace_null_authors)
articles['topics'] = articles['topics'].apply(replace_null_topics)

articles['author'] = articles['author'].apply(clean_instances)
articles['journal'] = articles['journal'].apply(clean_instances)
articles['reviewed_by'] = articles['reviewed_by'].apply(clean_instances)
articles['editor'] = articles['editor'].apply(clean_instances)
articles['topics'] = articles['topics'].apply(clean_instances)

articles['topics'] = articles['topics'].apply(replace_capitals)

# Drop the columns that are not exported. `columns=` replaces the positional
# axis argument, which pandas deprecated in 1.3 and rejects in 2.0.
articles = articles.drop(columns=['key',
                                  'author-orcid',
                                  'ee',
                                  'ee-type',
                                  'pages',
                                  'publtype',
                                  'abstract',
                                  'citations',
                                  'url',
                                  'corresponding',
                                  'co_authors',
                                  'correspondingID',
                                  'mdate',
                                  'year'])

# Full export plus a five-row sample for quick inspection.
articles.to_csv('articles_processed.csv')
articles[:5].to_csv('article_slice.csv')

In [64]:
# Sanity check: print every topics entry that contains a line break.
for top in articles['topics']:
    if "\n" not in top:
        continue
    print(top)



