In [1]:
import pandas as pd
import random


# --- Configuration: everything a reader might want to tune -------------------

# Seed for Python's global `random` generator, so that "Restart & Run All"
# regenerates exactly the same synthetic data set. Change it to draw another one.
SEED = 42
random.seed(SEED)

JOURNALS = ['UPC journal', 'IEEE Journal']
YEARS = ['2015', '2016', '2017', '2018','2019','2020','2021']
CONFERENCES = ['UDL conference', 'MAXsat conference']


NAME_AFFILIATIONS = ['UDL', 'UPC', 'Intel', 'Microsoft']
# Kind of organisation behind every affiliation name.
AFFILIATIONS = {'UDL': 'University', 'UPC': 'University', 'Intel':'Company', 'Microsoft':'Company'}
JOURNAL_EDITOR = 4      # number of journal editors to pick
CONFERENCE_EDITOR = 4   # number of conference editors to pick
OPINIONS = ['Nice paper, but not too much interesting.', 'Really good.', 'Too much boring.', 'Nice to know.']
PAPERS = 200            # total number of papers; also the suffix of the output files
NUM_AUTHORS = 10        # bound on the author rows that journal papers draw from
NUM_JOURNALS = PAPERS // 3                    # the first third of the papers go to journals
NUM_CONFERENCES = abs(PAPERS-NUM_JOURNALS)    # the remaining papers go to conferences

KEYWORDS = ['CNN', 'NN', 'DeepLearning', 'pandas', 'python', 'neo4j', 'tensor flow', 'android', 'qbits']
CITIES = ['Barcelona', 'Lleida', 'Balaguer', 'London', 'Madrid']

# Source rows with 'author' ('|'-separated names) and 'title' columns.
# Keep at least 200 rows (the editor cell reads rows from index 50 upwards) and
# never fewer than PAPERS rows, because every paper takes its title from its own row.
author_papers = pd.read_csv("author_papers.csv")[:max(200, PAPERS)]

# Generate editors

    - We will use different authors as editors and as authors of scientific papers.

In [2]:
# Editors are the first listed author of consecutive rows of author_papers.
# The rows start at index 50, above the low-index rows that paper authors are drawn from.
_first_editor_row = 50
journal_editors = [
    author_papers.iloc[i]['author'].split('|')[0]
    for i in range(_first_editor_row, _first_editor_row + JOURNAL_EDITOR)
]
# Conference editors take the CONFERENCE_EDITOR rows right after the journal editors.
# The end of the range is derived from both counts, so it stays correct when
# JOURNAL_EDITOR and CONFERENCE_EDITOR differ.
conference_editors = [
    author_papers.iloc[i]['author'].split('|')[0]
    for i in range(_first_editor_row + JOURNAL_EDITOR,
                   _first_editor_row + JOURNAL_EDITOR + CONFERENCE_EDITOR)
]


# Generate journals and conferences 

    - For each publication we will have: 
        - Author : Author of the paper. Only 6 authors available.
        - Journal: Journal name. Defined at the top of the file
        - Year: One of the years defined previously
        - Editor: One of the editors generated in the previous block of code.
        - City: One of the cities defined previously
        - Edition: For each year and conference/journal we will have one edition, labelled "edition YEAR NAME"

In [3]:
# Column-oriented table of journal publications: one row per (paper, author).
journals = {'Author':[], 'Journal':[], 'Year':[], 'Paper':[], 'Editor':[], 'City':[], 'Edition':[]}
journal_papers = []
def gen_journal(journal_papers):
    """Generate NUM_JOURNALS journal publications.

    Paper ``i`` takes its title from row ``i`` of ``author_papers``; its author
    list comes from a randomly chosen row among the first NUM_AUTHORS rows.
    Journal, year, city and editor are drawn at random. One row per author is
    appended to the module-level ``journals`` table.

    Parameters
    ----------
    journal_papers : list
        Extended in place with the title of every generated paper.
    """
    for i in range(NUM_JOURNALS):
        # randrange excludes its upper bound, so exactly NUM_AUTHORS rows are
        # eligible (randint(0, NUM_AUTHORS) would allow NUM_AUTHORS + 1).
        authors = author_papers.iloc[random.randrange(NUM_AUTHORS)]['author'].split('|')
        journal = random.choice(JOURNALS)
        paper = author_papers.iloc[i]['title']
        journal_papers.append(paper)
        year = random.choice(YEARS)
        city = random.choice(CITIES)
        editor = random.choice(journal_editors)
        edition = 'edition %s %s' % (year, journal)
        # Every co-author gets a row carrying the same publication data.
        for author in authors:
            journals['Author'].append(author)
            journals['Journal'].append(journal)
            journals['Year'].append(year)
            journals['Paper'].append(paper)
            journals['Editor'].append(editor)
            journals['City'].append(city)
            journals['Edition'].append(edition)
        

In [4]:
# Column-oriented table of conference publications: one row per (paper, author).
conferences = {'Author':[], 'Conference':[], 'Year':[], 'Paper':[], 'Editor':[], 'City':[], 'Edition':[]}
conference_papers = []
def gen_conference(conference_papers):
    """Generate the conference publications (papers NUM_JOURNALS .. PAPERS-1).

    Paper ``i`` takes its title from row ``i`` of ``author_papers``; its author
    list comes from a randomly chosen row among the first 6 rows. Conference,
    year, city and editor are drawn at random. One row per author is appended
    to the module-level ``conferences`` table.

    Parameters
    ----------
    conference_papers : list
        Extended in place with the title of every generated paper.
    """
    for i in range(NUM_JOURNALS, PAPERS):
        # Conference authors are drawn from rows 0..5 only.
        authors = author_papers.iloc[random.randint(0, 5)]['author'].split('|')
        # Pick from CONFERENCES by its own length (the index used to be bounded
        # by len(JOURNALS), which only worked while both lists had equal size).
        conference = random.choice(CONFERENCES)
        paper = author_papers.iloc[i]['title']
        conference_papers.append(paper)
        year = random.choice(YEARS)
        city = random.choice(CITIES)
        editor = random.choice(conference_editors)
        edition = 'edition %s %s' % (year, conference)
        # Every co-author gets a row carrying the same publication data.
        for author in authors:
            conferences['Author'].append(author)
            conferences['Conference'].append(conference)
            conferences['Year'].append(year)
            conferences['Paper'].append(paper)
            conferences['Editor'].append(editor)
            conferences['City'].append(city)
            conferences['Edition'].append(edition)




In [5]:
# Fill the module-level `conferences` and `journals` tables and the two title lists.
# The generators append to those containers, so running this cell again adds
# another batch of rows instead of replacing the existing ones.
gen_conference(conference_papers)
gen_journal(journal_papers)

# Generate keywords 

In [6]:
# Titles of every generated paper: conference papers first, then journal papers.
all_papers = conference_papers + journal_papers
# Column-oriented table with one row per (paper, keyword).
paper_keywords = {'Paper':[], 'Keyword':[]}
def gen_keywords():
    """Assign between 1 and 3 distinct random keywords to every paper.

    Reads the module-level ``all_papers`` and ``KEYWORDS`` and appends one row
    per (paper, keyword) to the module-level ``paper_keywords`` table.
    """
    for paper in all_papers:
        # random.sample draws distinct keywords without reordering KEYWORDS
        # itself, so the configuration constant keeps its original order.
        for word in random.sample(KEYWORDS, random.randint(1, 3)):
            paper_keywords['Paper'].append(paper)
            paper_keywords['Keyword'].append(word)

In [7]:
# Fill `paper_keywords` with 1 to 3 keywords for every title in `all_papers`.
gen_keywords()

# Generate references

    - Each candidate reference is assigned to a paper at random, with a probability of 10%, and only if it was not published in a later year than the citing paper.

In [31]:
# (Paper, Year) rows of every publication, conference rows first and journal
# rows after them; the original row indices of both tables are kept.
_year_columns = ['Paper', 'Year']
papers_years = pd.concat(
    [pd.DataFrame(publications)[_year_columns] for publications in (conferences, journals)]
)

In [35]:
# Column-oriented table with one row per (citing paper, cited paper).
references = {'paper':[], 'reference':[]}
def gen_references():
    """Randomly generate citations between the papers in ``all_papers``.

    Every ordered pair of distinct papers becomes a citation with probability
    10%, provided the citing paper's year is not earlier than the cited
    paper's year. Years are read from the module-level ``papers_years`` table
    (the first row found for a title wins). Rows are appended to the
    module-level ``references`` table.

    Raises
    ------
    KeyError
        If a title in ``all_papers`` has no row in ``papers_years``.
    """
    # Resolve every publication year once instead of filtering the DataFrame
    # twice for every pair of papers.
    year_of = {}
    for title, year in zip(papers_years['Paper'], papers_years['Year']):
        year_of.setdefault(title, int(year))

    for paper in all_papers:
        year_paper = year_of[paper]
        # All papers are candidates, including the first one of the list.
        for reference in all_papers:
            if reference == paper:
                continue  # a paper does not cite itself
            if random.random() < 0.10 and year_paper >= year_of[reference]:
                references['paper'].append(paper)
                references['reference'].append(reference)

In [36]:
# Fill the module-level `references` table; rows are appended, so running this
# cell again adds further citations on top of the existing ones.
gen_references()

In [37]:
# Inspect the generated citations.
# NOTE(review): this displays the whole dict and floods the cell output;
# consider showing only a sample, e.g. pd.DataFrame(references).head().
references

{'paper': ['Multi-Granularity Locking for Nested Transactions: A Proof Using a Possibilities Mapping.',
  'Multi-Granularity Locking for Nested Transactions: A Proof Using a Possibilities Mapping.',
  'Multi-Granularity Locking for Nested Transactions: A Proof Using a Possibilities Mapping.',
  'Multi-Granularity Locking for Nested Transactions: A Proof Using a Possibilities Mapping.',
  'Multi-Granularity Locking for Nested Transactions: A Proof Using a Possibilities Mapping.',
  'Multi-Granularity Locking for Nested Transactions: A Proof Using a Possibilities Mapping.',
  'Multi-Granularity Locking for Nested Transactions: A Proof Using a Possibilities Mapping.',
  'Multi-Granularity Locking for Nested Transactions: A Proof Using a Possibilities Mapping.',
  'Multi-Granularity Locking for Nested Transactions: A Proof Using a Possibilities Mapping.',
  'Bounded Quantification and Relations Recognizable by Finite Automata.',
  'Bounded Quantification and Relations Recognizable by Finit

# Generate reviews

    - Reviewers are chosen at random among the authors of the published papers. Papers can be published in conferences and journals.
    - Each paper gets a random number of reviewers.
    - An organisation is chosen at random for each reviewing author.

In [38]:
# Names that reviewers are drawn from: every entry of the Author columns,
# journal rows first and conference rows after them (names can repeat).
all_authors = journals['Author'] + conferences['Author']
# Column-oriented table with one row per (paper, reviewer).
reviews = {'Author':[], 'Editor':[], 'Paper':[], 'Affiliation':[], 'CompanyUniversity':[], 'Decision':[], 'Opinion':[]}
def gen_reviews(published_papers):
    """Generate random reviews for every paper in ``published_papers``.

    Each distinct paper is reviewed once, by 2 to 4 different reviewers drawn
    from the module-level ``all_authors`` (fewer when not enough candidates
    exist). Authors of the paper itself are never picked as its reviewers.
    Affiliation, decision and opinion are drawn at random for every review.
    Rows are appended to the module-level ``reviews`` table.

    Parameters
    ----------
    published_papers : mapping of column name -> list
        Needs the parallel 'Paper' and 'Editor' columns. The table holds one
        row per (paper, author); an 'Author' column, when present, is used to
        keep authors from reviewing their own paper.
    """
    papers = published_papers['Paper']
    editors = published_papers['Editor']
    if 'Author' in published_papers:
        authors = published_papers['Author']
    else:
        authors = [None] * len(papers)

    # Collapse the per-author rows into one entry per paper, so that a paper
    # with several authors does not receive several independent review sets.
    editor_of = {}
    authors_of = {}
    for paper, editor, author in zip(papers, editors, authors):
        editor_of.setdefault(paper, editor)
        authors_of.setdefault(paper, set()).add(author)

    # Sorted, so that the draw is reproducible for a given random seed
    # (iteration order of a set of strings changes between interpreter runs).
    reviewer_pool = sorted(set(all_authors))

    for paper, editor in editor_of.items():
        candidates = [name for name in reviewer_pool if name not in authors_of[paper]]
        # random.sample returns distinct reviewers and cannot loop forever
        # when the pool is tiny.
        num_reviewers = min(random.randint(2, 4), len(candidates))
        for reviewer in random.sample(candidates, num_reviewers):
            affiliation = random.choice(NAME_AFFILIATIONS)
            reviews['Paper'].append(paper)
            reviews['Editor'].append(editor)
            reviews['Author'].append(reviewer)
            reviews['Affiliation'].append(affiliation)
            reviews['CompanyUniversity'].append(AFFILIATIONS[affiliation])
            reviews['Decision'].append(random.choice(['Accepted', 'Denied']))
            reviews['Opinion'].append(random.choice(OPINIONS))
    

In [39]:
# Generate reviews for the journal publications first, then for the conference
# publications; both calls append to the module-level `reviews` table.
gen_reviews(journals)
gen_reviews(conferences)

# Generating files

    - File names follow the pattern "kind of file"_"value of PAPERS".csv; for example, if the file contains journal information and PAPERS is equal to 50, it will be named journals_50.csv

In [40]:
# Write every generated table to its own CSV file in the working directory.
# The file name carries the value of PAPERS; tables are written in this order.
_csv_outputs = (
    (journals, "journals_%d.csv"),
    (conferences, "conferences_%d.csv"),
    (paper_keywords, "keywords_%d.csv"),
    (reviews, "reviews_%d.csv"),
    (references, "references_%d.csv"),
)
for _table, _name_pattern in _csv_outputs:
    pd.DataFrame(_table).to_csv(_name_pattern % PAPERS)