In [32]:
import math as m
import os
import random as rn

import numpy as np
import pandas as pd
import seaborn as sb
from matplotlib import pyplot as plt

papers_number = 35
readers_number = 80
ratings_number = readers_number * papers_number
authors_number = 10

papers = np.arange(papers_number)
readers = np.arange(readers_number)
ratings = np.arange(ratings_number)
authors = np.arange(authors_number)

# Seed folder path

dataset_name = "seed_1"
seed_folder_path = f"data/seed/{dataset_name}/"
info_filename = f"{seed_folder_path}info.csv"
ratings_filename = f"{seed_folder_path}ratings.csv"
authors_filename = f"{seed_folder_path}authors.csv"

# Info file generation

info_dataframe = pd.DataFrame(columns=["Dataset", "Paper", "Reader", "Rating", "Author"])
info_dataframe = info_dataframe.append(
    {
        "Dataset": dataset_name.capitalize(), 
        "Paper": papers_number, 
        "Reader": readers_number, 
        "Rating": ratings_number, 
        "Author": authors_number
    }, ignore_index=True)
info_dataframe.to_csv(info_filename, index=False)

# Ratings file generation

ratings_dataframe = pd.DataFrame(columns=["Timestamp", "Reader", "Paper", "Score", "Author"])
tuples = []
print("---------- RANDOM RATINGS GENERATION STARTED ----------")
generated_ratings = 0
for reader in readers:
    for index, paper in enumerate(papers):
        percentage = round((100*generated_ratings/(readers_number*papers_number)),2) 
        if percentage % 5 == 0:
            print(f"{int(percentage)}/100%", end=" ")
        current_tuple = {
            "Reader": reader, 
            "Paper": paper, 
            "Score": round(rn.uniform(0, 1), 2), 
            "Author": np.random.choice(authors_number, 1)[0]
        }
        generated_ratings+=1
        tuples.append(current_tuple)
print("100/100%")
print("---------- RANDOM RATINGS GENERATION ENDED ----------")
print("---------- RATINGS SHUFFLING STARTED ----------")
rn.shuffle(tuples)
print("---------- RATINGS SHUFFLING ENDED ----------")
print("---------- RATINGS APPENDING STARTED ----------")
for index, current_tuple in enumerate(tuples):
    percentage = round((100*index/len(tuples)),2)
    if percentage % 5 == 0:
        print(f"{int(percentage)}/100%", end=" ")
    ratings_dataframe = ratings_dataframe.append(
        {
            "Timestamp": index,
            "Reader": current_tuple["Reader"],
            "Paper": current_tuple["Paper"], 
            "Score": current_tuple["Score"], 
            "Author": current_tuple["Author"]
        }, ignore_index=True)
print()
print("---------- RATINGS APPENDING ENDED ----------")
ratings_dataframe["Timestamp"] = ratings_dataframe["Timestamp"].astype(int)
ratings_dataframe["Reader"] = ratings_dataframe["Reader"].astype(int)
ratings_dataframe["Paper"] = ratings_dataframe["Paper"].astype(int)
ratings_dataframe["Author"] = ratings_dataframe["Author"].astype(int)
ratings_dataframe.to_csv(ratings_filename, index=False)

print("---------- RANDOM AUTHORS GENERATION STARTED ----------")
print("---------- RATINGS APPENDING STARTED ----------")
authors_dataframe = pd.DataFrame(columns=["Author", "Papers"])
for author in authors:
    # An author writes a number of paper between 1 and paper_fraction
    papers_fraction = int(round((100 * (5 / papers_number))))
    author_papers_number = rn.randint(1, papers_fraction)
    papers_written = np.random.choice(papers, author_papers_number).tolist()
    papers_written = set(papers_written)
    papers_written = map(str, list(papers_written))
    papers_written = ";".join(papers_written)
    authors_dataframe = authors_dataframe.append(
        {
            "Author": author,
            "Papers": papers_written
        }, ignore_index=True)
print("---------- RANDOM AUTHORS GENERATION ENDED ----------")
print("---------- RATINGS APPENDING ENDED ----------")
authors_dataframe.to_csv(authors_filename, index=False)
    

---------- RANDOM RATINGS GENERATION STARTED ----------
0/100% 5/100% 10/100% 15/100% 20/100% 25/100% 30/100% 35/100% 40/100% 45/100% 50/100% 55/100% 60/100% 65/100% 70/100% 75/100% 80/100% 85/100% 90/100% 95/100% 100/100%
---------- RANDOM RATINGS GENERATION ENDED ----------
---------- RATINGS SHUFFLING STARTED ----------
---------- RATINGS SHUFFLING ENDED ----------
---------- RATINGS APPENDING STARTED ----------
0/100% 

5/100% 10/100%

 15/100% 

20/100% 

25/100% 

30/100% 

35/100% 40/100%

 45/100% 

50/100% 55/100% 

60/100% 65/100% 

70/100% 75/100%

 80/100% 

85/100% 90/100% 

95/100% 
---------- RATINGS APPENDING ENDED ----------
---------- RANDOM AUTHORS GENERATION STARTED ----------
---------- RATINGS APPENDING STARTED ----------
---------- RANDOM AUTHORS GENERATION ENDED ----------
---------- RATINGS APPENDING ENDED ----------


In [33]:
# Summary

print("RANDOM INFO:       ", info_dataframe)
print("RANDOM RATINGS:  ", ratings_dataframe.head(10))
print("RANDOM AUTHORS:  ", authors_dataframe.head(10))

RANDOM INFO:          Dataset Paper Reader Rating Author
0  Seed_1    35     80   2800     10
RANDOM RATINGS:      Timestamp  Reader  Paper  Score  Author
0          0      49      9   0.27       7
1          1      60      8   0.33       0
2          2      15     16   0.23       7
3          3       2     22   0.63       1
4          4      49      5   0.35       0
5          5      69      1   0.13       9
6          6      60     29   0.56       1
7          7      74     20   0.05       9
8          8       6     10   0.19       3
9          9      77     24   0.99       0
RANDOM AUTHORS:     Author                             Papers
0      0                          1;18;12;4
1      1           2;6;17;18;25;26;27;28;31
2      2  32;33;1;9;14;15;17;18;19;20;25;27
3      3                      34;4;13;23;26
4      4                                 29
5      5                    2;34;6;28;29;31
6      6       32;0;34;3;4;8;10;12;20;21;23
7      7                          1;21;9;23
8