In [32]:
import pandas as pd
import seaborn as sb
import numpy as np
import math as m
import os
import random as rn
from matplotlib import pyplot as plt
import scipy as sp
from scipy.stats import truncnorm as tn

# Quantities in a year of activity
papers_number, readers_number, authors_number = 150000, 2000, 50

# One integer identifier (0..count-1) per paper, reader and author
papers, readers, authors = (
    np.arange(count)
    for count in (papers_number, readers_number, authors_number)
)

# Seed folder path: every generated CSV lives under the dataset folder

dataset_name = "seed_1"
dataset_folder_path = f"../data/{dataset_name}/"
info_file_path = f"{dataset_folder_path}info.csv"
ratings_file_path = f"{dataset_folder_path}ratings.csv"
authors_file_path = f"{dataset_folder_path}authors.csv"

# Create the output folder up front; an already existing folder is not an error
os.makedirs(dataset_folder_path, exist_ok=True)

# Echo the resolved configuration so the run is self-describing
for label, value in (
    ("DATASET NAME: ", dataset_name),
    ("DATASET FOLDER PATH: ", dataset_folder_path),
    ("INFO FILE PATH: ", info_file_path),
    ("RATINGS FILE PATH: ", ratings_file_path),
    ("AUTHORS FILE PATH: ", authors_file_path),
):
    print(label, value)


DATASET NAME:  seed_1
DATASET FOLDER PATH:  ../data/seed_1/
INFO FILE PATH:  ../data/seed_1/info.csv
RATINGS FILE PATH:  ../data/seed_1/ratings.csv
AUTHORS FILE PATH:  ../data/seed_1/authors.csv


In [None]:

# Ratings file generation
#
# Readers are split into equally sized groups; every reader of a group rates
# the same number of distinct, randomly chosen papers. Each paper has a single
# score drawn once, so all the ratings of a paper carry that same value.
# The result is written to `ratings_file_path` with the columns
# Timestamp, Reader, Paper, Score, Author (Author is left empty here).

# N sets of readers, each one has X% of the total

readers_percent = 20
reader_sets_number = m.floor(100 / readers_percent)
readers_amount = round((readers_number*readers_percent)/100)

# Readers of set 0 rate 1 paper every two weeks
# Readers of set 1 rate 1 paper every week
# Readers of set 2 rate 2 papers every week
# Readers of set 3 rate 1 paper every day
# Readers of set 4 rate 3 papers every day
# NOTE(review): 3 papers a day over 365 days is 1095, not 1098 - confirm the
# last value before relying on it.

paper_frequencies = [26, 52, 104, 365, 1098]

# Fail early on configurations the loops below cannot honour.
if reader_sets_number > len(paper_frequencies):
    raise ValueError(
        f"{reader_sets_number} reader sets requested but only "
        f"{len(paper_frequencies)} paper frequencies are defined"
    )
if reader_sets_number * readers_amount > readers_number:
    raise ValueError(
        f"{reader_sets_number} sets of {readers_amount} readers need more than "
        f"the {readers_number} available readers"
    )

# One score per paper, drawn from a normal distribution with a random mean in
# [0, 1] and a random standard deviation in [0, 0.05], truncated to [0, 1].
# truncnorm expects its bounds in standard-deviation units relative to loc, so
# they are rescaled here; passing 0 and 1 directly would truncate to
# [loc, loc + scale] and let scores exceed 1.
paper_parameters = np.array(
    [(rn.uniform(0, 1), rn.uniform(0, 0.05)) for _ in range(papers_number)]
).reshape(-1, 2)
paper_means = paper_parameters[:, 0]
# Floor the spread so the rescaling below never divides by zero.
paper_spreads = np.maximum(paper_parameters[:, 1], 1e-6)
lower_bounds = (0 - paper_means) / paper_spreads
upper_bounds = (1 - paper_means) / paper_spreads
paper_distributions = tn.rvs(
    lower_bounds, upper_bounds, loc=paper_means, scale=paper_spreads, size=papers_number
)
paper_scores = np.round(paper_distributions, 2)

# Only the frequencies that are actually paired with a reader set count.
ratings_number = sum(paper_frequencies[:reader_sets_number]) * readers_amount

print("---------- READERS SETS GENERATION STARTED ----------")

# Shuffling once and slicing yields disjoint random sets without sampling from
# a set (which random.sample rejects on recent Python versions).
shuffled_readers = readers.tolist()
rn.shuffle(shuffled_readers)

readers_sets = []
for x in range(0, reader_sets_number):
    current_readers_set = shuffled_readers[x * readers_amount:(x + 1) * readers_amount]
    readers_sets.append(current_readers_set)
    print(f"SET {x}: ", current_readers_set)

print("---------- READERS SETS GENERATION COMPLETED ----------")

print("---------- RANDOM RATINGS GENERATION STARTED ----------")

tuples = []
generated_ratings = 0
# Integer step instead of a float modulo: report roughly every 10% of the work.
progress_step = max(1, ratings_number // 10)
for paper_per_reader, current_readers in zip(paper_frequencies, readers_sets):
    for reader in current_readers:
        # Sampling from a range picks distinct paper ids without materialising
        # the whole population for every reader.
        for paper in rn.sample(range(papers_number), paper_per_reader):
            if generated_ratings % progress_step == 0:
                percentage = 100 * generated_ratings // ratings_number
                print(f"{generated_ratings}/{ratings_number} ({percentage}/100%)")
            tuples.append(
                {
                    "Reader": reader,
                    "Paper": paper,
                    "Score": float(paper_scores[paper]),
                }
            )
            generated_ratings += 1
print(f"{ratings_number}/{ratings_number} (100/100%)")

print("---------- RANDOM RATINGS GENERATION ENDED ----------")

print("---------- RATINGS APPENDING STARTED ----------")
# Build the frame in one shot: growing it row by row is quadratic and
# DataFrame.append no longer exists in pandas 2.x.
ratings_dataframe = pd.DataFrame(tuples, columns=["Reader", "Paper", "Score"])
# The timestamp is simply the generation order of the rating.
ratings_dataframe.insert(0, "Timestamp", np.arange(len(ratings_dataframe)))
# Kept empty so the CSV retains its five-column layout.
ratings_dataframe["Author"] = np.nan
print(f"{len(tuples)}/{len(tuples)} (100/100%)")
print("---------- RATINGS APPENDING ENDED ----------")

ratings_dataframe = ratings_dataframe.astype({"Timestamp": int, "Reader": int, "Paper": int})
ratings_dataframe.to_csv(ratings_file_path, index=False)


In [None]:

# Authors file generation
#
# Writes one row per author to `authors_file_path`: the author id and the ids
# of the papers attributed to that author, joined with ";".

print("---------- RANDOM AUTHORS GENERATION STARTED ----------")
print("---------- RATINGS APPENDING STARTED ----------")
author_rows = []
for author in authors:
    # Number of draws for this author, between 1 and papers_number - 1.
    author_papers_number = rn.randint(1, (papers_number-1))
    # NOTE(review): papers are drawn with replacement and then de-duplicated,
    # so an author can end up with fewer papers than were drawn - confirm
    # this is intended.
    papers_written = np.unique(np.random.choice(papers, author_papers_number))
    author_rows.append(
        {
            "Author": int(author),
            # Always serialise as a ";"-separated string, including the
            # single-paper case, so the column has one consistent format.
            "Papers": ";".join(map(str, papers_written.tolist())),
        }
    )
# Build the frame once from the collected rows; DataFrame.append no longer
# exists in pandas 2.x.
authors_dataframe = pd.DataFrame(author_rows, columns=["Author", "Papers"])
print("---------- RANDOM AUTHORS GENERATION ENDED ----------")
print("---------- RATINGS APPENDING ENDED ----------")
authors_dataframe.to_csv(authors_file_path, index=False)


In [None]:

# Info file generation
#
# Single-row summary of the dataset: its name and how many papers, readers,
# ratings and authors it contains. Written to `info_file_path`.

# Built directly from one record; DataFrame.append no longer exists in pandas 2.x.
info_dataframe = pd.DataFrame(
    [
        {
            "Dataset": dataset_name.capitalize(),
            "Paper": papers_number,
            "Reader": readers_number,
            "Rating": ratings_number,
            "Author": authors_number,
        }
    ],
    columns=["Dataset", "Paper", "Reader", "Rating", "Author"],
)
info_dataframe.to_csv(info_file_path, index=False)


In [None]:

# Summary
#
# Prints the info frame in full and the first ten rows of the ratings and
# authors frames, each preceded by its label.

for heading, frame in (
    ("RANDOM INFO:     ", info_dataframe),
    ("RANDOM RATINGS:  ", ratings_dataframe.head(10)),
    ("RANDOM AUTHORS:  ", authors_dataframe.head(10)),
):
    print(heading, frame)
