In [1]:

import pandas as pd
import numpy as np
import math as m
import os
import csv
import random as rn
from random import randint
from scipy.stats import beta

# Quantities to seed
#
# "Seed" here means populating a synthetic dataset: these are the sizes of
# the three entity populations that the following cells generate.

papers_number = 10000
readers_number = 2500
authors_number = 25

# Entity identifiers are simply 0..N-1.
papers = np.arange(papers_number)
readers = np.arange(readers_number)
authors = np.arange(authors_number)

# Random number generator seed
#
# Leave as None to get a different dataset on every run (the historical
# behaviour). Set it to an integer to make the whole notebook reproducible:
# the `random` module drives the sampling of papers/readers, while NumPy's
# global state drives `np.random.choice` and the scipy `rvs` draws.

random_seed = None
if random_seed is not None:
    rn.seed(random_seed)
    np.random.seed(random_seed)

# Seed folder path
#
# All generated files live under ../data/<dataset_name>/ relative to the
# notebook's working directory.

dataset_name = "seed_2/p_2_beta"
dataset_folder_path = f"../data/{dataset_name}/"
info_file_path = f"{dataset_folder_path}info.csv"
ratings_file_path = f"{dataset_folder_path}ratings.csv"
authors_file_path = f"{dataset_folder_path}authors.csv"

# Create the output folder up front so the later cells can open files in it.
os.makedirs(dataset_folder_path, exist_ok=True)

print("DATASET NAME: ", dataset_name)
print("DATASET FOLDER PATH: ", dataset_folder_path)
print("INFO FILE PATH: ", info_file_path)
print("RATINGS FILE PATH: ", ratings_file_path)
print("AUTHORS FILE PATH: ", authors_file_path)

# Papers distribution generation with beta distribution
#
# Every paper is assigned a Beta(a, b) distribution from which its ratings are
# later drawn. Papers are split at random into five groups, one per family of
# (a, b) shapes; the parameters are drawn once per group, so all papers of a
# group share the same distribution.
#
# Result: `paper_distributions[paper]` is the frozen scipy distribution of
# that paper.

print("---------- PAPER DISTRIBUTIONS GENERATION STARTED ----------")

# Each entry is (number of papers, (a, b)).

# CASE 1: a == b == 1, 5% of papers
beta_distributions_frequencies = [(m.floor((5*papers_number)/100), (1, 1))]
# CASE 2: a == b > 1, 30% of papers
a = randint(2, 10)
b = a
beta_distributions_frequencies.append((m.floor((30*papers_number)/100), (a, b)))
# CASE 3: 0 < (a ^ b) < 1, 20% of papers
a = rn.uniform(0.001, 1)
b = rn.uniform(0.001, 1)
beta_distributions_frequencies.append((m.floor((20*papers_number)/100), (a, b)))
# CASE 4: (a V b) == 1, (a > b V b > a), 30% of papers
a = 1
# Start at 2: drawing 1 here would give a == b == 1, i.e. CASE 1 again.
b = randint(2, 10)
# Coin flip deciding which of the two parameters is the larger one.
if rn.randint(0,1) > 0.5:
    a, b = b, a
beta_distributions_frequencies.append((m.floor((30*papers_number)/100), (a, b)))
# CASE 5: (a ^ b) > 1, (a > b V b > a), 15% of papers
a = randint(2, 10)
b = randint(2 + a, 10 + a)
if rn.randint(0,1) > 0.5:
    a, b = b, a
beta_distributions_frequencies.append((m.floor((15*papers_number)/100), (a, b)))

# floor() may leave a few papers out when papers_number is not a multiple of
# 20; give them to the last group so that no paper is left without a
# distribution (a None entry would crash the ratings generation).
unassigned_papers = papers_number - sum(amount for (amount, _) in beta_distributions_frequencies)
if unassigned_papers > 0:
    (last_amount, last_parameters) = beta_distributions_frequencies[-1]
    beta_distributions_frequencies[-1] = (last_amount + unassigned_papers, last_parameters)

papers_set = set(papers)
paper_distributions = [None] * papers_number

generated_papers_distributions = 0
for (papers_amount, (a, b)) in beta_distributions_frequencies:
    # One frozen distribution per group: it only depends on (a, b).
    group_distribution = beta(a=a, b=b)
    # random.sample() needs a sequence: sampling straight from a set raises
    # TypeError since Python 3.11.
    current_paper_set = rn.sample(sorted(papers_set), papers_amount)
    for paper in current_paper_set:
        percentage = 100*generated_papers_distributions/papers_number
        if percentage % 10 == 0:
            print(f"{int(generated_papers_distributions)}/{papers_number} ({int(percentage)}/100%)")
        paper_distributions[paper] = group_distribution
        generated_papers_distributions = generated_papers_distributions + 1
    # Papers already assigned must not be drawn again by the next groups.
    papers_set.difference_update(current_paper_set)
print(f"{papers_number}/{papers_number} (100/100%)")

print("---------- PAPER DISTRIBUTIONS GENERATION COMPLETED ----------")


DATASET NAME:  seed_2/p_2_beta
DATASET FOLDER PATH:  ../data/seed_2/p_2_beta/
INFO FILE PATH:  ../data/seed_2/p_2_beta/info.csv
RATINGS FILE PATH:  ../data/seed_2/p_2_beta/ratings.csv
AUTHORS FILE PATH:  ../data/seed_2/p_2_beta/authors.csv
---------- PAPER DISTRIBUTIONS GENERATION STARTED ----------
0/10000 (0/100%)
1000/10000 (10/100%)
2000/10000 (20/100%)
3000/10000 (30/100%)
4000/10000 (40/100%)
5000/10000 (50/100%)
6000/10000 (60/100%)
7000/10000 (70/100%)
8000/10000 (80/100%)
9000/10000 (90/100%)
10000/10000 (100/100%)
---------- PAPER DISTRIBUTIONS GENERATION COMPLETED ----------


In [2]:

# Ratings file generation

# N sets of readers, each one has X% of the total
#
# Readers are partitioned at random into `reader_sets_number` disjoint groups
# of `readers_amount` readers each; group i rates `paper_frequencies[i]`
# papers per reader.

readers_percent = 20
reader_sets_number = m.floor(100 / readers_percent)
readers_amount = m.floor((readers_number*readers_percent)/100)

readers_set = set(readers)
readers_sets = []

# Readers rate papers with a certain frequence

paper_frequencies = [2, 4, 8, 30, 90]

# Every group needs its own frequency; fail here with a clear message rather
# than with an IndexError in the middle of the ratings generation.
if len(paper_frequencies) < reader_sets_number:
    raise ValueError(
        f"paper_frequencies has {len(paper_frequencies)} entries "
        f"but {reader_sets_number} reader sets are generated"
    )

print("---------- READERS SETS GENERATION STARTED ----------")

# Number of ratings planned for the groups (gap filling excluded).
ratings_number = sum(paper_frequencies[:reader_sets_number]) * readers_amount
for x in range(0, reader_sets_number):
    # random.sample() needs a sequence: sampling straight from a set raises
    # TypeError since Python 3.11.
    current_readers_set = rn.sample(sorted(readers_set), readers_amount)
    readers_sets.append(current_readers_set)
    # Readers already placed in a group must not be drawn again.
    readers_set.difference_update(current_readers_set)
    print(f"SET {x}: ", current_readers_set)

print("---------- READERS SETS GENERATION COMPLETED ----------")

# Writes ratings.csv: one row per rating, with a running counter used as the
# timestamp. Each reader of group i rates paper_frequencies[i] distinct
# random papers; the score is drawn from the paper's beta distribution and
# rounded to two decimals. Papers that nobody rated get three extra ratings
# from random readers ("filling gaps").

print("---------- RATINGS GENERATION STARTED ----------")

# random.sample() needs a sequence (sampling from a set raises TypeError since
# Python 3.11); building the lists once also avoids re-creating a set of all
# the papers for every single reader.
paper_pool = papers.tolist()
reader_pool = readers.tolist()

# Ratings expected from the reader groups, used only for progress reporting.
planned_ratings = sum(
    paper_frequencies[set_index] * len(current_readers)
    for set_index, current_readers in enumerate(readers_sets)
)

generated_ratings = 0
rated_papers = []
rated_readers = []
with open(ratings_file_path, mode='w', newline='') as ratings_file:
    ratings_writer = csv.writer(ratings_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    ratings_writer.writerow(['Timestamp', 'Reader', 'Paper', 'Score'])
    for set_index, current_readers in enumerate(readers_sets):
        paper_per_reader = paper_frequencies[set_index]
        for reader in current_readers:
            for paper in rn.sample(paper_pool, paper_per_reader):
                percentage = 100*generated_ratings/planned_ratings
                if percentage % 10 == 0:
                    print(f"{int(generated_ratings)}/{planned_ratings} ({int(percentage)}/100%)")
                score = round(paper_distributions[paper].rvs(1)[0], 2)
                ratings_writer.writerow([generated_ratings, reader, paper, score])
                generated_ratings += 1
                rated_readers.append(reader)
                rated_papers.append(paper)

    # Filling gaps
    unrated_papers = set(paper_pool) - set(rated_papers)
    # Sorted so that the rows are written in a stable order.
    for paper in sorted(unrated_papers):
        for reader in rn.sample(reader_pool, 3):
            score = round(paper_distributions[paper].rvs(1)[0], 2)
            ratings_writer.writerow([generated_ratings, reader, paper, score])
            generated_ratings += 1

    print(f"{generated_ratings}/{generated_ratings} (100/100%)")

# The info file must report the rows actually written, gap filling included.
ratings_number = generated_ratings

print("---------- RATINGS GENERATION ENDED ----------")


---------- READERS SETS GENERATION STARTED ----------
SET 0:  [1836, 1927, 1203, 1964, 930, 1148, 951, 671, 1137, 2480, 12, 1878, 1329, 993, 657, 1846, 1571, 2287, 999, 1941, 1825, 526, 1806, 289, 2150, 62, 2225, 2002, 72, 803, 783, 210, 2361, 1404, 1860, 1525, 2272, 8, 1268, 2253, 1617, 1723, 1645, 821, 1264, 2062, 355, 607, 2459, 127, 967, 1342, 2252, 502, 1559, 157, 1649, 1021, 864, 990, 1066, 2351, 363, 1141, 1273, 1984, 1925, 227, 2340, 479, 1241, 1522, 2204, 1506, 41, 89, 2046, 257, 477, 1037, 2008, 467, 1440, 742, 638, 997, 1996, 2055, 1054, 564, 1636, 887, 1998, 326, 388, 1539, 995, 2366, 1101, 641, 1230, 2496, 193, 1008, 1900, 2156, 1371, 2194, 1108, 490, 2405, 2170, 1816, 1176, 13, 2072, 279, 933, 2065, 2364, 604, 34, 2455, 1155, 423, 178, 2105, 1768, 1383, 1402, 2197, 2177, 1479, 2157, 1586, 1595, 2210, 1181, 393, 1775, 622, 1726, 1236, 484, 2107, 1651, 685, 261, 1527, 2299, 493, 600, 1892, 968, 1109, 33, 412, 782, 438, 1481, 1406, 1537, 939, 832, 1746, 1593, 2293, 2486, 152

In [3]:

# Authors file generation
#
# Writes authors.csv with one row per author: the author id and the ids of
# the papers they wrote, joined with ";".

print("---------- AUTHORS GENERATION STARTED ----------")

with open(authors_file_path, mode='w', newline='') as authors_file:
    authors_writer = csv.writer(authors_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    authors_writer.writerow(["Author", "Paper"])
    for index, author in enumerate(authors):
        percentage = 100*index/authors_number
        if percentage % 10 == 0:
            print(f"{int(index)}/{authors_number} ({int(percentage)}/100%)")
        # An author draws between 1 and papers_number - 1 papers. The draw is
        # made with replacement, so duplicates are collapsed and the author
        # may end up with fewer distinct papers than drawn.
        author_papers_number = rn.randint(1, (papers_number-1))
        papers_written = set(np.random.choice(papers, author_papers_number).tolist())
        # Always join, even for a single paper: writing the set itself would
        # put its Python repr (e.g. "{42}") in the file.
        authors_writer.writerow([author, ";".join(map(str, papers_written))])
    print(f"{authors_number}/{authors_number} (100/100%)")

print("---------- AUTHORS GENERATION ENDED ----------")


---------- AUTHORS GENERATION STARTED ----------
0/25 (0/100%)
5/25 (20/100%)
10/25 (40/100%)
15/25 (60/100%)
20/25 (80/100%)
25/25 (100/100%)
---------- AUTHORS GENERATION ENDED ----------


In [4]:

# Info file generation
#
# Writes info.csv: a single row summarising the dataset (its name and the
# number of papers, readers, ratings and authors).

print("---------- INFO GENERATION STARTED ----------")

# Built in one go: DataFrame.append() was removed in pandas 2.0.
info_dataframe = pd.DataFrame(
    [
        {
            "Dataset": dataset_name,
            "Paper": papers_number,
            "Reader": readers_number,
            "Rating": ratings_number,
            "Author": authors_number,
        }
    ],
    columns=["Dataset", "Paper", "Reader", "Rating", "Author"],
)
info_dataframe.to_csv(info_file_path, index=False)

print("---------- INFO GENERATION ENDED ----------")

---------- INFO GENERATION STARTED ----------
---------- INFO GENERATION ENDED ----------
