In [5]:
import pandas as pd
# import seaborn as sb
import numpy as np
import math as m
import os
import collections
import csv
import random as rn
# from pprint import pprint
# from matplotlib import pyplot as plt
# import scipy as sp
# from scipy.stats import truncnorm as tn
from scipy.stats import beta as beta

# Parameter setting

dataset_name = "seed_2_with_shuffle"
papers_number = 300
readers_number = 1000
authors_number = 40
months_number = 1
# Number of papers rated by each reader of a reader set, scaled by the months covered
paper_frequencies = [monthly * months_number for monthly in (2, 6, 8, 14, 20)]
shuffling = True
shuffle_number = 100

assert papers_number > sum(paper_frequencies) and papers_number % 10 == 0, \
    "ERROR: papers_number must be greater than (equal to) {} and it must be a multiple of 10.".format(sum(paper_frequencies))

# Seed folder path: every output file lives under the dataset folder

dataset_folder_path = "../data/{}/".format(dataset_name)
dataset_shuffle_folder_path = "../data/{}/shuffle/".format(dataset_name)
info_file_path = "{}info.csv".format(dataset_folder_path)
ratings_file_path = "{}ratings.csv".format(dataset_folder_path)
authors_file_path = "{}authors.csv".format(dataset_folder_path)
stats_file_path = "{}stats.csv".format(dataset_folder_path)

os.makedirs(dataset_folder_path, exist_ok=True)

# Setting up arrays: identifiers are the integers 0..N-1

papers, readers, authors = (
    np.arange(count) for count in (papers_number, readers_number, authors_number)
)

# Echo the configuration so that the run is self-describing
for label, value in (
    ("DATASET NAME: ", dataset_name),
    ("DATASET FOLDER PATH: ", dataset_folder_path),
    ("INFO FILE PATH: ", info_file_path),
    ("RATINGS FILE PATH: ", ratings_file_path),
    ("AUTHORS FILE PATH: ", authors_file_path),
):
    print(label, value)

DATASET NAME:  seed_2_with_shuffle
DATASET FOLDER PATH:  ../data/seed_2_with_shuffle/
INFO FILE PATH:  ../data/seed_2_with_shuffle/info.csv
RATINGS FILE PATH:  ../data/seed_2_with_shuffle/ratings.csv
AUTHORS FILE PATH:  ../data/seed_2_with_shuffle/authors.csv


In [6]:
# Papers distribution generation with beta distribution
#
# The papers are split into five groups; every paper of a group draws its
# ratings from the same beta(a, b) distribution. The group sizes are
# percentages of papers_number and must add up to papers_number.

print("---------- PAPER DISTRIBUTIONS GENERATION STARTED ----------")


def papers_share(percent):
    """Return the (rounded) number of papers making up `percent`% of papers_number."""
    return int(round(percent * papers_number / 100))


# Each entry is (number of papers, (a, b)).
# CASE 1: a == b == 1, 5% of papers
beta_distributions_frequencies = [(papers_share(5), (1, 1))]
# CASE 2: a == b > 1, 30% of papers
a = rn.randint(2, 10)
b = a
beta_distributions_frequencies.append((papers_share(30), (a, b)))
# CASE 3: 0 < a < 1 and 0 < b < 1, 20% of papers
a = rn.uniform(0.001, 1)
b = rn.uniform(0.001, 1)
beta_distributions_frequencies.append((papers_share(20), (a, b)))
# CASE 4: exactly one of a, b equals 1 and the other one is > 1, 30% of papers.
# The lower bound is 2 so that this case can never degenerate into CASE 1.
a = 1
b = rn.randint(2, 10)
if rn.randint(0, 1) == 1:
    a, b = b, a
beta_distributions_frequencies.append((papers_share(30), (a, b)))
# CASE 5: a > 1 and b > 1 with a != b, 15% of papers
a = rn.randint(2, 10)
b = rn.randint(2 + a, 10 + a)
if rn.randint(0, 1) == 1:
    a, b = b, a
beta_distributions_frequencies.append((papers_share(15), (a, b)))

# Fail early if rounding left some paper without a distribution (or asked for
# more papers than exist) instead of failing later with a less obvious error.
assigned_papers = sum(papers_amount for papers_amount, _ in beta_distributions_frequencies)
assert assigned_papers == papers_number, \
    "ERROR: the distribution groups cover {} papers instead of {}.".format(assigned_papers, papers_number)

papers_set = set(papers)
paper_distributions = [None] * papers_number

generated_papers_distributions = 0
for (papers_amount, (a, b)) in beta_distributions_frequencies:
    # random.sample() rejects sets on Python 3.11+, so sample from a sorted list
    current_paper_set = rn.sample(sorted(papers_set), papers_amount)
    # One frozen distribution is shared by every paper of the group
    group_distribution = beta(a=a, b=b)
    for paper in current_paper_set:
        percentage = 100*generated_papers_distributions/papers_number
        if percentage % 10 == 0:
            print("{}/{} ({}/100%)".format(int(generated_papers_distributions), papers_number, int(percentage)))
        paper_distributions[paper] = group_distribution
        generated_papers_distributions = generated_papers_distributions + 1
    # Papers already assigned must not be drawn again by the next groups
    papers_set.difference_update(current_paper_set)
print("{}/{} (100/100%)".format(papers_number, papers_number))

print("---------- PAPER DISTRIBUTIONS GENERATION COMPLETED ----------")

---------- PAPER DISTRIBUTIONS GENERATION STARTED ----------
0/300 (0/100%)
30/300 (10/100%)
60/300 (20/100%)
90/300 (30/100%)
120/300 (40/100%)
150/300 (50/100%)
180/300 (60/100%)
210/300 (70/100%)
240/300 (80/100%)
270/300 (90/100%)
300/300 (100/100%)
---------- PAPER DISTRIBUTIONS GENERATION COMPLETED ----------


In [7]:

# Ratings file generation
#
# The readers are split into N disjoint sets, each one holding X% of the total.
# Every reader of set i rates paper_frequencies[i] distinct papers; papers left
# without any rating afterwards receive a few extra ratings ("filling gaps").
# The resulting file is finally shuffled and its timestamps renumbered.

readers_percent = 20
reader_sets_number = m.floor(100 / readers_percent)
readers_amount = m.floor((readers_number*readers_percent)/100)
# Number of readers asked to rate each paper that nobody rated in the main loop
gap_filling_readers_number = 5

# Every reader set needs its own rating frequency
assert reader_sets_number <= len(paper_frequencies), \
    "ERROR: {} reader sets need at least {} paper frequencies.".format(reader_sets_number, reader_sets_number)


def draw_rating(paper):
    """Draw one score for `paper` from its beta distribution, rounded to 2 decimals.

    A score that rounds to 0 is replaced by 0.01, so a written score is never 0.
    """
    generated_rating = round(paper_distributions[paper].rvs(1)[0], 2)
    if generated_rating == 0:
        generated_rating = 0.01
    return generated_rating


readers_sets = []

# Readers rate papers with a certain frequency

print("---------- READERS SETS GENERATION STARTED ----------")

# Expected number of ratings of the main loop (gap filling is added afterwards)
ratings_number = sum(paper_frequencies[:reader_sets_number]) * readers_amount

# Draw from a local pool so that the module-level `readers` array is never
# consumed and the cell can be re-run safely.
available_readers = np.arange(readers_number)
for x in range(0, reader_sets_number):
    current_readers_set = np.random.choice(available_readers, readers_amount, False)
    available_readers = np.setdiff1d(available_readers, current_readers_set)
    readers_sets.append(current_readers_set)
    print("SET {}: {}".format(x, current_readers_set))

print("---------- READERS SETS GENERATION COMPLETED ----------")

print("---------- RATINGS GENERATION STARTED ----------")

generated_ratings = 0
rated_papers = set()
with open(ratings_file_path, mode='w', newline='') as ratings_file:
    ratings_writer = csv.writer(ratings_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    ratings_writer.writerow(['Timestamp', 'Reader', 'Paper', 'Score'])
    for frequency, readers_set in zip(paper_frequencies, readers_sets):
        for reader in readers_set:
            # Each reader rates `frequency` distinct papers
            sample = np.random.choice(papers, frequency, False)
            for paper in sample:
                percentage = 100*generated_ratings/ratings_number
                if percentage % 10 == 0:
                    print("{}/{} ({}/100%)".format(int(generated_ratings), ratings_number, int(percentage)))
                ratings_writer.writerow([
                    generated_ratings,
                    reader,
                    paper,
                    draw_rating(paper)
                ])
                rated_papers.add(paper)
                generated_ratings += 1

    # Filling gaps: every paper without ratings is rated by a few random readers.
    # The row is written for EVERY drawn rating, not only for those rounded to 0.
    readers = np.arange(readers_number)
    unrated_papers = set(papers) - rated_papers
    for paper in sorted(unrated_papers):
        for reader in np.random.choice(readers, gap_filling_readers_number, False):
            ratings_writer.writerow([
                generated_ratings,
                reader,
                paper,
                draw_rating(paper)
            ])
            generated_ratings += 1

    # From now on ratings_number is the real number of ratings written,
    # gap-filling ones included.
    ratings_number = generated_ratings
    print("{}/{} (100/100%)".format(ratings_number, ratings_number))

# Shuffle the ratings and renumber the timestamps according to the new order
paper_ratings = pd.read_csv(ratings_file_path).sample(frac=1).reset_index(drop=True)
paper_ratings["Timestamp"] = range(len(paper_ratings))

paper_ratings.to_csv(ratings_file_path, index=False, header=True, sep=",")

print("---------- RATINGS GENERATION ENDED ----------")

---------- READERS SETS GENERATION STARTED ----------
SET 0:  [892  74  84 241 290 626 460 220 708 792 740 121 660 766 655 856 425  47
 148 434 794 379  19 661 774 429 586 804 545 224 922 953 401 252 837 877
 131 711 569 789 383 704 964 666 211 183 347 195 391 863 717 587 467 171
 366 606 207 523 338 267 244 307 791 190 279 556 280 358 437 504 503 737
 143   1  44 748 594 653 146 955 814  13 435 227 806 870 881 388 152 628
 588 937 400 880 355  77 533 701 337 859 788  35 613  82 908  89 729 759
 396 499 393 125 332  70 202 944 776 574 658 700 810 621 894   7 489 679
 597 443 179 407 539 743 181 676 193 477 728 659 973 682 415 684 487 105
 758 984 878 298   3 869 155 617 514 304 114 761 512 885 214 454 905 541
  76  33 818 584 526 602  11 150 909 913 898 480 475 128 929 932 398 703
 118 548 192 820 581 449 858 596 873 603 117 172 582 318 715  72  21 744
 228 671]
SET 1:  [511 673 904 819 857 614 967 349 828 800 809 971  24 387 540 993 409 456
 558 853 323 524 713 978 109 706 769 886  91

In [8]:
# Ratings shuffling: writes shuffle_number reshuffled copies of the ratings,
# each one with timestamps renumbered 0..N-1 in its own row order.

print("---------- RATINGS SHUFFLING STARTED ----------")

if shuffling:
    os.makedirs(dataset_shuffle_folder_path, exist_ok=True)
    for s in range(shuffle_number):
        # Report progress at every 10% step, whatever shuffle_number is
        percentage = 100 * s / shuffle_number
        if percentage % 10 == 0:
            print("{}/{} ({}/100%)".format(s, shuffle_number, int(percentage)))
        current_shuffle_file_path = os.path.join(dataset_shuffle_folder_path, "shuffle_{}.csv".format(s))
        # Renumber the timestamps with a single column assignment; doing it row
        # by row through iterrows() is orders of magnitude slower.
        shuffled_papers_ratings = (
            paper_ratings
            .sample(frac=1)
            .reset_index(drop=True)
            .assign(Timestamp=range(len(paper_ratings)))
        )
        shuffled_papers_ratings.to_csv(current_shuffle_file_path, index=False, header=True, sep=",")
    print("{}/{} (100/100%)".format(shuffle_number, shuffle_number))

print("---------- RATINGS SHUFFLING COMPLETED ----------")

---------- RATINGS SHUFFLING STARTED ----------
0/100 (0/100%)
10/100 (10/100%)
20/100 (20/100%)
30/100 (30/100%)


KeyboardInterrupt: 

In [None]:
# Authors file generation
#
# Writes one row per author: the author id and the ';'-separated ids of the
# papers attributed to that author.

print("---------- AUTHORS GENERATION STARTED ----------")

with open(authors_file_path, mode='w', newline='') as authors_file:
    authors_writer = csv.writer(authors_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    authors_writer.writerow(["Author", "Paper"])
    for position, author in enumerate(authors):
        progress = 100 * position / authors_number
        if progress % 10 == 0:
            print("{}/{} ({}/100%)".format(int(position), authors_number, int(progress)))
        # Between 1 and papers_number - 1 papers are drawn WITH replacement;
        # duplicates collapse in the set, so fewer distinct papers may remain.
        draws_number = rn.randint(1, papers_number - 1)
        distinct_papers = set(np.random.choice(papers, draws_number).tolist())
        # A single paper is written as a bare id, several ones joined by ';'
        papers_written = ";".join(str(paper) for paper in distinct_papers)
        authors_writer.writerow([author, papers_written])
    print("{}/{} (100/100%)".format(authors_number, authors_number))

print("---------- AUTHORS GENERATION ENDED ----------")


In [None]:
# Info file generation
#
# Writes a one-row CSV summarising the size of the generated dataset.

print("---------- INFO GENERATION STARTED ----------")

# The single row is built directly: DataFrame.append() was removed in pandas 2.0
info_dataframe = pd.DataFrame(
    [
        {
            "Dataset": dataset_name,
            "Paper": papers_number,
            "Reader": readers_number,
            "Rating": ratings_number,
            "Author": authors_number
        }
    ],
    columns=["Dataset", "Paper", "Reader", "Rating", "Author"]
)
info_dataframe.to_csv(info_file_path, index=False)

print("---------- INFO GENERATION ENDED ----------")

In [None]:
# Stats file generation
#
# ratings.csv is stored in long format (one row per rating, with the columns
# Timestamp, Reader, Paper and Score), whereas the statistics below are defined
# on a readers x papers matrix. The matrix is therefore rebuilt first, with 0
# standing for "this reader did not rate this paper".

print("---------- STATS GENERATION STARTED ----------")

temp_ratings_dataframe = pd.read_csv(ratings_file_path)
temp_ratings_dataframe[temp_ratings_dataframe.columns] = temp_ratings_dataframe[temp_ratings_dataframe.columns].apply(pd.to_numeric)

# Rows are readers, columns are papers. Readers and papers without any rating
# are added back by the reindex so that they count as zero.
ratings_matrix = (
    temp_ratings_dataframe
    .pivot_table(index="Reader", columns="Paper", values="Score", aggfunc="last")
    .reindex(index=np.arange(readers_number), columns=np.arange(papers_number))
    .fillna(0)
)

# 1 where a rating exists, 0 elsewhere
rated_mask = (ratings_matrix > 0.0000001).astype(int)

print("---------- COMPUTING STATS FOR PAPERS ----------")

# Number of ratings received by each paper
sums_paper = rated_mask.sum(axis=0)

max_ratings_paper = sums_paper.max()
min_ratings_paper = sums_paper.min()
mean_ratings_paper = sums_paper.mean()

# Papers whose matrix column holds a single distinct value
paper_counter = int((ratings_matrix.nunique(axis=0) == 1).sum())

print("---------- COMPUTING STATS FOR READERS ----------")

# Number of ratings given by each reader
sums_reader = rated_mask.sum(axis=1)

max_ratings_reader = sums_reader.max()
min_ratings_reader = sums_reader.min()
mean_ratings_reader = sums_reader.mean()

# Readers whose matrix row holds a single distinct value
reader_counter = int((ratings_matrix.nunique(axis=1) == 1).sum())

# Writing stats to file

# The single row is built directly: DataFrame.append() was removed in pandas 2.0.
# Means are truncated to integers, as every other column is an integer count.
stats_dataframe = pd.DataFrame(
    [
        {
            "Dataset": dataset_name,
            "Max Number Rating Paper": int(max_ratings_paper),
            "Min Number Rating Paper": int(min_ratings_paper),
            "Mean Number Rating Paper": int(mean_ratings_paper),
            "Number Papers Unique Ratings": paper_counter,
            "Max Number Rating Reader": int(max_ratings_reader),
            "Min Number Rating Reader": int(min_ratings_reader),
            "Mean Number Rating Reader": int(mean_ratings_reader),
            "Number Readers Unique Rating": reader_counter
        }
    ],
    columns=[
        "Dataset",
        "Max Number Rating Paper",
        "Min Number Rating Paper",
        "Mean Number Rating Paper",
        "Number Papers Unique Ratings",
        "Max Number Rating Reader",
        "Min Number Rating Reader",
        "Mean Number Rating Reader",
        "Number Readers Unique Rating"
    ]
)
stats_dataframe.to_csv(stats_file_path, index=False)

print("---------- STATS GENERATION COMPLETED ----------")