In [1]:
## Import Libraries
import numpy as np 
import pandas as pd
import random
import csv
import lorem
import gc
from sklearn.model_selection import train_test_split
from faker import Faker

In [2]:
## Read CSV
url = './datafinal/'

# Read all data from articles 
## Article headers
# The header file holds one ';'-separated line of column names.
# Strip the line terminator before splitting; otherwise the last column name
# keeps a trailing newline and no longer matches the key used when the
# columns are renamed later on.
with open(url+'output_article_header.csv') as f:
    article_header = f.readline().rstrip('\r\n').split(';')

## Articles in dataset
# NOTE(review): header=0 combined with names= makes pandas treat the first line
# of output_article.csv as a header row and discard it. If that file has no
# header row of its own, header=None is needed to keep the first record --
# confirm against the data file.
articles = pd.read_csv(url+'output_article.csv', header=0, names=article_header, sep=';')

In [3]:
# Selecting only 5000 articles
# Take an explicit copy: a plain slice stays tied to `articles`, so the column
# assignments made on art_sel afterwards raise SettingWithCopyWarning, and the
# slice can keep the full parent data alive even after `del articles`.
art_sel = articles[:5000].copy()

# Free Memory
del articles
gc.collect()

0

In [4]:
## Assign Journal and conferences
# Draw one venue code per article, 'J' or 'C', each with probability 0.5.
# The codes overwrite the existing 'sup:string[]' column.
venue_codes = np.random.choice(['J', 'C'], size=len(art_sel), p=[0.5, 0.5])
art_sel['sup:string[]'] = venue_codes

# Columns that must be populated for a row to be kept
categorical_vars = ['author:string[]', 'article:ID','author-orcid:string[]']

# Drop every row that has a missing value in any of those columns
art_sel = art_sel.dropna(subset=categorical_vars)

In [5]:
# Rename Columns
# Maps each raw header to the column name used in the rest of the notebook.
column_renames = {
    'article:ID': "DOI",
    'author:string[]': "AuthorName",
    'author-aux:string': "author-aux",
    'author-orcid:string[]': "Author-orcid",
    'booktitle:string': "booktitle",
    'cdate:date': "cdate",
    'cdrom:string': "cdrom",
    'cite:string[]': "cite",
    'cite-label:string[]': "cite-label",
    'crossref:string': "crossref",
    'editor:string[]': "editor",
    'editor-orcid:string[]': "editor-orcid",
    'ee:string[]': "ee",
    'ee-type:string[]': "ee-type",
    'i:string[]': "i",
    'journal:string': "journal",
    'key:string': "key",
    'mdate:date': "mdate",
    'month:string': "month",
    'note:string[]': "note",
    'note-label:string': "note-label",
    'note-type:string[]': "note-type",
    'number:string': "number",
    'pages:string': "pages",
    'publisher:string': "publisher",
    'publnr:string': "publnr",
    'publtype:string': "publtype",
    'sub:string[]': "sub",
    'sup:string[]': "VenueType",
    'title:string': "Title",
    'title-bibtex:string': "title-bibtex",
    'tt:string[]': "tt",
    'url:string[]': "url",
    'volume:string': "VolumeNr",
    'year:int': "YearPublished",
}
art_sel.rename(columns=column_renames, inplace=True)

In [6]:
# Assign PaperType
# Paper types to draw from: ref_C for conference rows, ref_J for all other rows.
ref_C = ["FullPaper", "ShortPaper", "DemoPaper", "Poster"]
ref_J = ["FullPaper", "ShortPaper", "DemoPaper"]

def assign_citation_type(x):
    """Return a random paper type for row ``x``.

    Rows whose 'VenueType' equals 'C' draw from ``ref_C``; every other row
    draws from ``ref_J``.
    """
    allowed_types = ref_C if x['VenueType'] == 'C' else ref_J
    return random.choice(allowed_types)

# Row-wise: draw a paper type for every article based on its venue type
paper_types = art_sel.apply(assign_citation_type, axis=1)
art_sel['PaperType'] = paper_types

In [7]:
## Generate abstracts using Lorem library
# One generated paragraph per article, in row order
abstracts = [lorem.paragraph() for _ in range(len(art_sel))]

art_sel['Abstract'] = abstracts

In [8]:
## Assign Editor and Chair
fake = Faker()

# Pool of 1500 generated person names
names = []
for _ in range(1500):
    names.append(fake.name())

## Editor
def assign_editor_name(x):
    """Return a random name from ``names`` unless the row is a conference.

    Rows whose 'VenueType' equals 'C' get None.
    """
    if x['VenueType'] == 'C':
        return None
    return random.choice(names)
art_sel['EditorName'] = art_sel.apply(assign_editor_name, axis=1)
                                      
## Chair
# NOTE(review): this all-None 'chair' column is not read anywhere in the visible
# notebook (the names go into 'ChairName' below) -- possibly a leftover.
art_sel['chair'] = None
def assign_chair_type(x):
    """Return a random name from ``names`` for conference rows, None otherwise."""
    return random.choice(names) if x['VenueType'] == 'C' else None
art_sel['ChairName'] = art_sel.apply(assign_chair_type, axis=1)

In [9]:
## Assign Affiliation
fake = Faker()

# Pool of 500 generated company names. It is kept under its own name on purpose:
# rebinding `names` here would replace the person-name pool that
# assign_editor_name / assign_chair_type look up at call time, so re-running
# those cells afterwards would hand out company names as editors and chairs.
affiliation_names = [fake.company() for _ in range(500)]

## Affiliation
def assign_affiliation_name(x):
    """Return one random affiliation per author of row ``x``.

    'AuthorName' is a '|'-separated list of authors; the result is a
    '|'-separated list of the same length, each entry drawn independently
    from ``affiliation_names``.
    """
    authors = x["AuthorName"].split("|")
    return '|'.join(random.choice(affiliation_names) for _ in authors)

art_sel['Affiliation'] = art_sel.apply(assign_affiliation_name, axis=1)


## Reviewer experience
# Years of reviewing experience to draw from: 1 through 9.
# Bound to a dedicated name (not a shared scratch variable) so the function
# keeps drawing from this pool no matter which cells run afterwards.
reviewer_experience_years = range(1, 10)

def assign_reviewerexp(x):
    """Return a random number of years from ``reviewer_experience_years``.

    The row ``x`` is not consulted.
    """
    return random.choice(reviewer_experience_years)
art_sel['YearsReviewerExperience1'] = art_sel.apply(assign_reviewerexp, axis=1)
art_sel['YearsReviewerExperience2'] = art_sel.apply(assign_reviewerexp, axis=1)


## Production process flag (journal rows only)
# Bound to a dedicated name (not a shared scratch variable) so the function
# keeps drawing from this pool no matter which cells run afterwards.
production_flags = [True, False]

def assign_production(x):
    """Return a random True/False for rows with 'VenueType' == 'J', else None."""
    if x['VenueType'] == 'J':
        return random.choice(production_flags)
    return None
art_sel['ManagesProductionProcess'] = art_sel.apply(assign_production, axis=1)


## Panels content flag (conference rows only)
# Bound to a dedicated name (not a shared scratch variable) so the function
# keeps drawing from this pool no matter which cells run afterwards.
panels_flags = [True, False]

def assign_panels(x):
    """Return a random True/False for rows with 'VenueType' == 'C', else None."""
    if x['VenueType'] == 'C':
        return random.choice(panels_flags)
    return None
art_sel['ManagesPanelsContent'] = art_sel.apply(assign_panels, axis=1)


## Conference frequency (conference rows only)
# Bound to a dedicated name (not a shared scratch variable) so the function
# keeps drawing from this pool no matter which cells run afterwards.
conference_frequencies = ['annual', 'biennial','triennial','quadrennial','irregular']

def assign_frequency(x):
    """Return a random frequency label for rows with 'VenueType' == 'C', else None."""
    if x['VenueType'] == 'C':
        return random.choice(conference_frequencies)
    return None
art_sel['Frequency'] = art_sel.apply(assign_frequency, axis=1)


## Journal impact factor
def assign_journalfactor(x):
    """Return ``random.random()*50`` for rows with 'VenueType' == 'J', else None."""
    if x['VenueType'] != 'J':
        return None
    return random.random()*50
art_sel['JournalImpactFactor'] = art_sel.apply(assign_journalfactor, axis=1)

In [10]:
## Assign Conference type and Name

# Initialize Faker
fake = Faker()

# 200 generated catch phrases, used as conference names
conference_names = []
for _ in range(200):
    conference_names.append(fake.catch_phrase())

# Conference type codes
ref_O = ["W", "S", "E", "R"]
def assign_conference_type(x):
    """Return a random code from ``ref_O`` for rows with 'VenueType' == 'C', else None."""
    return random.choice(ref_O) if x['VenueType'] == 'C' else None

# Define function to assign conference names
def assign_conference_name(x):
    """Return a random conference name from the slice reserved for the row's type.

    ``conference_names`` is read in four consecutive slices of 50 entries:
    'W' -> [0:50], 'S' -> [50:100], 'E' -> [100:150], 'R' -> [150:200].
    Any other 'ConfType' value yields None.
    """
    for code, start in (('W', 0), ('S', 50), ('E', 100), ('R', 150)):
        if x['ConfType'] == code:
            return random.choice(conference_names[start:start + 50])
    return None

    
# Apply function to dataframe
# 'ConfType' has to be filled first: assign_conference_name reads it from each row.
conference_types = art_sel.apply(assign_conference_type, axis=1)
art_sel['ConfType'] = conference_types
art_sel['ConfName'] = art_sel.apply(assign_conference_name, axis=1)

In [11]:
## Assign Journal name

# Define journal names
# Collect 200 distinct catch phrases that are not already used as conference names.
journal_names = []
while len(journal_names) < 200:
    phrase = fake.catch_phrase()
    already_used = phrase in conference_names or phrase in journal_names
    if not already_used:
        journal_names.append(phrase)


def assign_journal_name(x):
    """Return a random entry of ``journal_names`` for rows with 'VenueType' == 'J', else None."""
    if x['VenueType'] != 'J':
        return None
    return random.choice(journal_names)
art_sel['JournalName'] = art_sel.apply(assign_journal_name, axis=1)

In [12]:
### Reviews
## articles for reviews 
art_rev_selected = art_sel[['DOI', 'AuthorName', 'Title']]

# Creating list of reviewers
# The pool is built from the first-listed author of every article
# (column 0 of the '|'-split author names).
split_authors = art_rev_selected['AuthorName'].str.split("|", expand=True)
authors_list = split_authors[split_authors.columns[0]].values.tolist()

# Deduplicate and drop missing names explicitly. A set has no defined order, so
# slicing off "the first element" does not target NaN -- it discards an
# arbitrary (possibly valid) author. Sorting gives the seeded random draws
# that follow a stable pool order from run to run.
my_auth_list = sorted({name for name in authors_list if pd.notna(name)})

In [13]:
## Assigning authors list to reviews
# Seed Python's `random` module before the reviewer draws
random.seed(42)

# Candidate pool: a separate list object, so my_auth_list itself is left untouched
my_list = list(my_auth_list)

# define function to randomly assign value from list to new column, avoiding strings and ensuring ReviewerName2 is different from ReviewerName1
def assign_value(row, my_list, reviewer1_val, max_attempts=10000):
    """Draw a random entry of ``my_list`` that does not conflict with ``row``.

    A candidate is rejected when its string form occurs as a substring of the
    string form of any value in ``row`` (so an article's own authors are
    skipped), or when it equals ``reviewer1_val``.

    Parameters
    ----------
    row : pandas.Series
        The article row; only ``row.values`` is read.
    my_list : list
        Candidates to draw from.
    reviewer1_val : object
        Value the result must differ from (None when there is no such value).
    max_attempts : int, optional
        Upper bound on the number of draws, so a pool in which every candidate
        conflicts raises instead of looping forever.

    Returns
    -------
    The first drawn candidate that passes both checks.

    Raises
    ------
    ValueError
        If no acceptable candidate is drawn within ``max_attempts`` draws.
    """
    # The row does not change between retries: stringify its values once.
    row_texts = [str(value) for value in row.values]
    for _ in range(max_attempts):
        candidate = random.choice(my_list)
        if candidate == reviewer1_val:
            continue
        candidate_text = str(candidate)
        if any(candidate_text in text for text in row_texts):
            continue
        return candidate
    raise ValueError(
        "no acceptable candidate found in %d draws" % max_attempts
    )

# apply function to create new column with randomly assigned values
# Reviewer 1 is drawn with no value to differ from; reviewer 2 must differ from reviewer 1.
def _first_reviewer(row):
    return assign_value(row, my_list, None)

def _second_reviewer(row):
    return assign_value(row, my_list, row['ReviewerName1'])

art_sel['ReviewerName1'] = art_sel.apply(_first_reviewer, axis=1)
art_sel['ReviewerName2'] = art_sel.apply(_second_reviewer, axis=1)


# Create reviews using Lorem library
def assign_review(row):
    """Return one generated sentence; ``row`` is not consulted."""
    review_text = lorem.sentence()
    return review_text

art_sel['ReviewText1'] = art_sel.apply(assign_review, axis=1)
art_sel['ReviewText2'] = art_sel.apply(assign_review, axis=1)

# Each decision is True with probability 0.80 and False with probability 0.20
n_articles = len(art_sel)
art_sel['ReviewDecision1'] = np.random.choice([True, False], size=n_articles, p=[0.80, 0.20])
art_sel['ReviewDecision2'] = np.random.choice([True, False], size=n_articles, p=[0.80, 0.20])


In [14]:
## final decision
def assign_FinalDecision(x):
    """Return True only when both 'ReviewDecision1' and 'ReviewDecision2' equal True."""
    both_accept = x['ReviewDecision1'] == True and x['ReviewDecision2'] == True
    return True if both_accept else False

# Row-wise: combine the two review decisions into the final decision
final_decisions = art_sel.apply(assign_FinalDecision, axis=1)
art_sel['FinalDecision'] = final_decisions

In [15]:
# Year Published
# Cast the publication year column to int
year_as_int = art_sel['YearPublished'].astype(int)
art_sel['YearPublished'] = year_as_int

In [16]:
## Keywords

## Generate keywords from title
db_keywords = ['data management', 'indexing', 'data modeling', 'big data', 
               'data processing', 'database', 'data querying']

# Title-derived candidates: the first word of each title, kept only when its
# string form is exactly 15 characters long.
# NOTE(review): "first word only" and "== 15" look unintentionally narrow --
# confirm the intended rule.
# The pool is built once here instead of inside get_keywords: it does not depend
# on the row, and re-splitting every title for every row made this step quadratic.
first_title_words = art_sel["Title"].str.split(" ").str[0].tolist()
title_keywords = [word for word in first_title_words if len(str(word)) == 15]
keyword_pool = db_keywords + title_keywords

def get_keywords(x):
    """Return 5 distinct entries sampled from ``keyword_pool``, joined by '|'.

    The row ``x`` is not consulted; every row samples from the same pool.
    """
    return '|'.join(random.sample(keyword_pool, k=5))

art_sel['Keywords'] = art_sel.apply(get_keywords, axis=1)


In [17]:
# create new column with randomly assigned values area
def assign_area(x):
    """Return one random entry of ``db_keywords``; the row ``x`` is not consulted."""
    area = random.choice(db_keywords)
    return area

areas = art_sel.apply(assign_area, axis=1)
art_sel['Area'] = areas

In [18]:
## Assign Index Number and Volume Number

# Value pools: index numbers 999..9998, volume numbers 99..998
indexnrp = range(999, 9999)
volumenrp = range(99, 999)

indexnr = []
volumenr = []
for venue in art_sel['VenueType']:
    if venue == 'J':
        # 'J' rows get a volume number and no index number
        volumenr.append(str(int(random.choice(volumenrp))))
        indexnr.append(None)
    else:
        # every other row gets an index number and no volume number
        indexnr.append(str(int(random.choice(indexnrp))))
        volumenr.append(None)

art_sel['IndexNr'] = indexnr
art_sel['VolumeNr'] = volumenr


In [19]:
## Assign AuthorIDs

# Initialize Faker
fake = Faker()

# Maps each author name to the ID generated for it
author_ids = {}

# Walk every '|'-separated author list; an author gets an ID the first time it is seen
for author_field in art_sel['AuthorName']:
    for author in author_field.split("|"):
        if author in author_ids:
            continue
        author_ids[author] = fake.unique.sbn9()

def _join_author_ids(author_field):
    """Return the IDs of the authors in ``author_field``, '|'-joined in the same order."""
    known_ids = [str(author_ids[name]) for name in author_field.split("|") if name in author_ids]
    return '|'.join(known_ids)

# Create a new column with unique IDs of authors concatenated in the same format
art_sel['Author-orcid'] = art_sel['AuthorName'].apply(_join_author_ids)

In [20]:
## Citation

# Split for cite and citation
# split the dataframe into two parts, i.e. 33% of the rows go to cite
art_citation, art_cite = train_test_split(art_sel, test_size=0.33, random_state=42)
# Work on independent copies: the split results are selections of art_sel, and
# assigning a column to them directly raises SettingWithCopyWarning.
art_citation = art_citation.copy()
art_cite = art_cite.copy()

# Ensure articles in citation are written before cite articles
# NOTE(review): these years are written to the two split frames only; art_sel,
# which is the frame exported at the end, keeps its original YearPublished.
# Confirm whether the new years were meant to be merged back.
values = [2017, 2018, 2019]

## randomly assign the values as a new column in the DataFrame
art_citation['YearPublished'] = np.random.RandomState(42).choice(values, len(art_citation))

# create a list of values to assign randomly from 2020-2022
values = [2020, 2021, 2022]
art_cite['YearPublished'] = np.random.RandomState(42).choice(values, len(art_cite))

## Selecting citation
# NOTE(review): cite_cita is not read again in the visible notebook.
cite_cita = art_citation[['author-aux', 'DOI']]

## The frame is concatenated with a multiplier of 1, so the rows are not
## replicated; the only effect is that the index is reset.
cite_cita = pd.concat([cite_cita]*1, ignore_index=True)

## randomly assign cite articles to citation
# Distinct DOIs of the cite split
ref_in_art_cite = list(set(list(art_cite['DOI'])))

# Define function to randomly assign IDs
def assign_no():
    """Return a random DOI from ``ref_in_art_cite``."""
    return random.choice(ref_in_art_cite)

# Every article in art_sel gets one randomly drawn DOI from the cite split.
# NOTE(review): nothing prevents an article from drawing its own DOI.
art_sel['Citation_DOI'] = art_sel.apply(lambda x: assign_no(), axis=1)

In [21]:
# Display the labels of all columns currently present in art_sel
art_sel.columns

Index(['DOI', 'AuthorName', 'author-aux', 'Author-orcid', 'booktitle', 'cdate',
       'cdrom', 'cite', 'cite-label', 'crossref', 'editor', 'editor-orcid',
       'ee', 'ee-type', 'i', 'journal', 'key', 'mdate', 'month', 'note',
       'note-label', 'note-type', 'number', 'pages', 'publisher', 'publnr',
       'publtype', 'sub', 'VenueType', 'Title', 'title-bibtex', 'tt', 'url',
       'VolumeNr', 'YearPublished', 'PaperType', 'Abstract', 'EditorName',
       'chair', 'ChairName', 'Affiliation', 'YearsReviewerExperience1',
       'YearsReviewerExperience2', 'ManagesProductionProcess',
       'ManagesPanelsContent', 'Frequency', 'JournalImpactFactor', 'ConfType',
       'ConfName', 'JournalName', 'ReviewerName1', 'ReviewerName2',
       'ReviewText1', 'ReviewText2', 'ReviewDecision1', 'ReviewDecision2',
       'FinalDecision', 'Keywords', 'Area', 'IndexNr', 'Citation_DOI'],
      dtype='object')

In [22]:
# Columns kept in the exported dataset, in output order
finalcolumns = [
    'DOI', 'AuthorName', 'Author-orcid', 'VenueType', 'Title',
    'YearPublished', 'PaperType', 'Abstract', 'EditorName',
    'ChairName', 'ConfName', 'ConfType', 'JournalName',
    'ReviewerName1', 'ReviewerName2', 'ReviewText1', 'ReviewText2',
    'ReviewDecision1', 'ReviewDecision2', 'FinalDecision', 'Keywords',
    'VolumeNr', 'IndexNr', 'Citation_DOI', 'Area', 'Affiliation',
    'YearsReviewerExperience1', 'YearsReviewerExperience2',
    'ManagesProductionProcess', 'ManagesPanelsContent', 'Frequency',
    'JournalImpactFactor',
]


art_sel_final = art_sel[finalcolumns]

# Save to CSV
## Articles instances
# Written with a header row and without the index column
art_sel_final.to_csv('./datafinal/instance_data.csv', header=True, index=False)

In [23]:
# Show the first 5 exported rows, transposed so each column name becomes a row
art_sel_final.head(5).T

Unnamed: 0,5,8,9,12,17
DOI,4041687,4042615,4042664,4044384,6154008
AuthorName,Hongjian Fan|Kotagiri Ramamohanarao,Morgan Ericsson,David L. Martin 0001|Deborah L. McGuinness|Dre...,Andrzej Cichocki|Dimitrios Georgakopoulos|Dona...,Emilio Coppa
Author-orcid,06-008489-6|88840-619-3,311-62634-X,582-54773-6|617-21442-0|313-87606-0|7650-5125-...,18-364501-9|7757-7478-5|456-87713-7|06-882933-1,358-73401-2
VenueType,C,J,J,J,J
Title,Patterns Based Classifiers.,The Effects of XML Compression on SOAP Perform...,Bringing Semantics to Web Services with OWL-S.,Event-driven Video Awareness Providing Physica...,An interactive visualization framework for per...
YearPublished,2007,2007,2007,2007,2015
PaperType,ShortPaper,FullPaper,FullPaper,ShortPaper,FullPaper
Abstract,Aliquam non quaerat dolor magnam eius sed eius...,Labore adipisci voluptatem quisquam ut modi si...,Porro voluptatem amet ipsum eius consectetur i...,Dolore etincidunt ut consectetur. Sit eius eiu...,Amet eius numquam amet numquam voluptatem non....
EditorName,,Jessica Collins,Carol Martinez,Shannon Taylor,Judy Smith
ChairName,Kristi Larson,,,,


In [24]:
# Display the dtype of every exported column
art_sel_final.dtypes

DOI                           int64
AuthorName                   object
Author-orcid                 object
VenueType                    object
Title                        object
YearPublished                 int32
PaperType                    object
Abstract                     object
EditorName                   object
ChairName                    object
ConfName                     object
ConfType                     object
JournalName                  object
ReviewerName1                object
ReviewerName2                object
ReviewText1                  object
ReviewText2                  object
ReviewDecision1                bool
ReviewDecision2                bool
FinalDecision                  bool
Keywords                     object
VolumeNr                     object
IndexNr                      object
Citation_DOI                  int64
Area                         object
Affiliation                  object
YearsReviewerExperience1      int64
YearsReviewerExperience2    