In [144]:
# libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import min, max, col, split, unix_timestamp, mean, median, log10, mode, to_date, lit
from pyspark.sql.types import StructType, StructField, StringType, FloatType

from collections import OrderedDict
import pandas as pd
import gc
import json

In [145]:
# configurations
#region
show_parts = False

rating_input = 'myRating.csv'
user_dataset_input = 'myProfile.csv'
user_dataset_embed_path = 'user_dataset_prep.csv'
anime_dataset_embed_path = 'anime_dataset_prep.csv'
user_dataset_og_path = 'dataset/full/users-details-2023.csv' # just to check the format
user_embed_col_path = 'user_embed_col.json'
anime_embed_col_path = 'anime_embed_col.json'
unified_embed_col_path = 'unified_embed_col.json'

spark = SparkSession\
    .builder \
    .appName("Spark SQL app")\
    .getOrCreate()
#endregion

# input necessities
#region
user_df = spark.read\
    .option("header", True)\
    .option("escape", '"')\
    .option("multiline", True)\
    .option("sep", ",")\
    .csv(user_dataset_input)

rating_schema = StructType([ 
    StructField('anime_id', StringType(), True), 
    StructField('rating', FloatType(), True),
]) 
rating_df = spark.read\
    .option("header", True)\
    .option("escape", '"')\
    .option("multiline", True)\
    .option("sep", ",")\
    .schema(rating_schema)\
    .csv(rating_input)

user_embed_df = spark.read\
    .option("header", True)\
    .option("escape", '"')\
    .option("multiline", True)\
    .option("sep", ",")\
    .csv(user_dataset_embed_path)\
    .limit(1) # one user only

anime_embed_df = spark.read\
    .option("header", True)\
    .option("escape", '"')\
    .option("multiline", True)\
    .option("sep", ",")\
    .csv(anime_dataset_embed_path)

user_embed_dict = json.load(open(user_embed_col_path, 'r'))
anime_embed_dict = json.load(open(anime_embed_col_path, 'r'))
unified_embed_dict = json.load(open(unified_embed_col_path, 'r'))
    
#endregion

In [146]:
# related functions
#region
def embed_onehot(dataFrame, colName: str, embed_dict: dict):
    unified_df = dataFrame.select(colName).dropDuplicates().toPandas()
    unified_df = unified_df.reindex(unified_df.columns.tolist() + list(embed_dict[colName]), axis=1, fill_value=0) # main difference
    input_list = unified_df[colName].tolist()
    for i in range(len(input_list)):
        if input_list[i] is not None:
            temp = input_list[i].split(", ")
            for j in temp:
                if j is not None and colName + "_" + j in list(embed_dict[colName]): # main difference
                    unified_df.at[i, colName+"_"+j] = 1
    unified_df = spark.createDataFrame(unified_df)
    dataFrame = dataFrame.join(unified_df, colName, 'left').drop(colName)

    # clean up mem
    del [[unified_df]]
    gc.collect()

    return dataFrame

def min_max(dataFrame, colName_in: str, colName_out: str, min_val: float, max_val: float):
    dataFrame = dataFrame.withColumn(colName_out, (col(colName_in)-min_val)/(max_val-min_val))                                                                                                                                                                                         #thien7170

    return dataFrame

def embed_minmax(dataFrame, colName: str, embed_dict: dict):
    min_val = embed_dict[colName][0]
    max_val = embed_dict[colName][1]
    dataFrame = dataFrame.withColumn("A1", (col(colName)-min_val)/(max_val - min_val))\
        .drop(colName).withColumnRenamed(existing="A1", new=colName)
    return dataFrame

def main_normalization(dataFrame, colName_in: str, colName_out: str, min_range: float = 0, max_range: float = 1):
    return min_max(dataFrame, colName_in, colName_out, min_range, max_range)
#endregion

In [147]:
# mean score
rating_df_mean = rating_df.select(mean('rating')).collect()[0][0]
rating_df = rating_df.withColumn('rating_m', col('rating')-rating_df_mean)\
    .drop('rating').withColumnRenamed(existing='rating_m', new='rating')
if show_parts:
    rating_df.show()
    user_df.show()

+--------+-------------------+
|anime_id|             rating|
+--------+-------------------+
|      46|-0.9090909090909083|
|     401|0.09090909090909172|
|      44|0.09090909090909172|
|      24|0.09090909090909172|
|     846|0.09090909090909172|
|    4015|0.09090909090909172|
|     517|0.09090909090909172|
|     400|0.09090909090909172|
|     861|0.09090909090909172|
|     793|0.09090909090909172|
|    3091|0.09090909090909172|
+--------+-------------------+

+---------------+-------------+------+--------------------+--------------------+--------------------+------------+----------+--------+---------+-------+-------+-------------+-------------+---------+----------------+
|         Mal ID|     Username|Gender|            Birthday|            Location|              Joined|Days Watched|Mean Score|Watching|Completed|On Hold|Dropped|Plan to Watch|Total Entries|Rewatched|Episodes Watched|
+---------------+-------------+------+--------------------+--------------------+--------------------+-

In [148]:
# other user embeddings
#region
user_df = user_df.replace({'UNKNOWN': None, 'Unknown': None})

user_df = user_df.drop('Username')

user_df = embed_onehot(user_df, 'Gender', user_embed_dict)

split_col = split(user_df['Birthday'], 'T', 2)
user_df = user_df.withColumn('Birthday_d', split_col.getItem(0))\
    .withColumn('Birthday_unix', unix_timestamp('Birthday_d', format='yyyy-mm-dd'))\
    .drop('Birthday', 'Birthday_d')\
    .withColumnRenamed('Birthday_unix', 'Birthday')
user_df = embed_minmax(user_df, 'Birthday', user_embed_dict)

user_df = user_df.drop('Location')

user_df = user_df.drop('Joined')

user_df = user_df.withColumnRenamed('Days Watched', 'Days_Watched')
user_df = user_df.withColumn('Days_Watched_log', log10(col('Days_Watched')+1.0))\
    .drop('Days_Watched')\
    .withColumnRenamed('Days_Watched_log', 'Days_Watched')

user_df = embed_minmax(user_df, 'Days_Watched', user_embed_dict)

user_df = user_df.drop('Watching', 'On Hold', 'Dropped', 'Plan to Watch', 'Total Entries')

user_df = user_df.drop('Rewatched')\
    .withColumnRenamed('Episodes Watched', 'Episodes_Watched')
a, b = 'Episodes_Watched', 'Completed'
user_df = user_df.withColumn(a + '_l', log10(col(a)+1.0))\
    .withColumn(b+ '_l', log10(col(b)+1.0))\
    .drop(a, b)\
    .withColumnRenamed(a+'_l', a)\
    .withColumnRenamed(b+'_l', b)

user_df = embed_minmax(user_df, b, user_embed_dict)
user_df = embed_minmax(user_df, a, user_embed_dict)
#endregion

In [149]:
if show_parts:
      print(list(user_df.columns))
      print(list(user_embed_df.columns))
      print(list(user_df.columns) == list(user_embed_df.columns))
      user_df.show()

['Mal ID', 'Mean Score', 'Gender_Male', 'Gender_Female', 'Gender_Non-Binary', 'Birthday', 'Days_Watched', 'Completed', 'Episodes_Watched']
['Mal ID', 'Mean Score', 'Gender_Male', 'Gender_Female', 'Gender_Non-Binary', 'Birthday', 'Days_Watched', 'Completed', 'Episodes_Watched']
True




+---------------+----------+-----------+-------------+-----------------+--------------------+-------------------+------------------+------------------+
|         Mal ID|Mean Score|Gender_Male|Gender_Female|Gender_Non-Binary|            Birthday|       Days_Watched|         Completed|  Episodes_Watched|
+---------------+----------+-----------+-------------+-----------------+--------------------+-------------------+------------------+------------------+
|901231748791623|      NULL|          1|            0|                0|1.043094347922247...|0.26448090787893097|0.2618443201266776|0.3352274810471495|
+---------------+----------+-----------+-------------+-----------------+--------------------+-------------------+------------------+------------------+



                                                                                

In [150]:
# anime embed
if show_parts:
    anime_embed_df.sort(col('anime_id').cast('int')).show()

[Stage 601:>                                                        (0 + 1) / 1]

+--------+------------------+-------------+--------------------+-------------+----------------+------------+--------------+-------------------+--------------+-------------+-------------+--------------+--------------------+---------------+------------+--------------+------------------+-------------+-----------------+----------------+-------------+--------------+-------+--------+------------+----------+--------+----------+-------------------+------------------+----------------------+-----------------------+--------------------+-----------------+------------------+----------------------------------+------------------+--------------+------------------+------------------+-------------------+-----------------------------------+------------------+----------------+---------------------+------------------+--------------------+-----------------+---------------+---------------------+------------------------+----------------+--------------+------------------+-------------+-------------------------

                                                                                

In [151]:
# join
unified_df = user_df.select('Mal ID').crossJoin(rating_df).select(['Mal ID', 'anime_id', 'rating'])
unified_df = unified_df.join(user_df, 'Mal ID', 'left').withColumnRenamed(existing='Mal ID',new='user_id')\
    .join(anime_embed_df, 'anime_id', 'left')
unified_df.show()

                                                                                

+--------+---------------+-------------------+----------+-----------+-------------+-----------------+--------------------+-------------------+------------------+------------------+------------------+-------------+--------------------+-------------+----------------+------------+--------------+-------------------+--------------+-------------+-------------+--------------+--------------------+---------------+------------+--------------+------------------+-------------+-----------------+----------------+-------------+--------------+-------+--------+------------+----------+--------+----------+--------------------+------------------+----------------------+-----------------------+--------------------+-----------------+------------------+----------------------------------+------------------+--------------+------------------+------------------+-------------------+-----------------------------------+------------------+----------------+---------------------+------------------+--------------------+

In [152]:
# whitespace -> underscore
to_change = {}
column_order = list(unified_df.columns)
for i in range(len(column_order)):
    if ' ' in column_order[i]:
        new_name = column_order[i].replace(' ', '_')

        to_change[column_order[i]] = new_name
        column_order[i] = new_name

unified_df = unified_df.withColumnsRenamed(to_change)

# remove illigible characters
import re
to_change = {}
for i in range(len(column_order)):
    new_name = re.sub("[!@#$%^&*().,']","", column_order[i]) 

    if new_name != column_order[i]:
        to_change[column_order[i]] = new_name
        column_order[i] = new_name

unified_df = unified_df.withColumnsRenamed(to_change)

In [153]:
unified_df = unified_df.drop('Mean_Score')
print(sorted(unified_df.columns))
print(sorted(unified_embed_dict['unified_embed']))
unified_df.select(unified_embed_dict['unified_embed']).show()

['Birthday', 'Completed', 'Days_Watched', 'Duration', 'Episodes', 'Episodes_Watched', 'Favorites', 'Gender_Female', 'Gender_Male', 'Gender_Non-Binary', 'Genres_Action', 'Genres_Adventure', 'Genres_Avant_Garde', 'Genres_Award_Winning', 'Genres_Boys_Love', 'Genres_Comedy', 'Genres_Drama', 'Genres_Ecchi', 'Genres_Erotica', 'Genres_Fantasy', 'Genres_Girls_Love', 'Genres_Gourmet', 'Genres_Hentai', 'Genres_Horror', 'Genres_Mystery', 'Genres_Romance', 'Genres_Sci-Fi', 'Genres_Slice_of_Life', 'Genres_Sports', 'Genres_Supernatural', 'Genres_Suspense', 'Licensors_ADV_Films', 'Licensors_Aniplex_of_America', 'Licensors_Bandai_Entertainment', 'Licensors_Central_Park_Media', 'Licensors_Crunchyroll', 'Licensors_Discotek_Media', 'Licensors_Funimation', 'Licensors_Geneon_Entertainment_USA', 'Licensors_Kitty_Media', 'Licensors_Media_Blasters', 'Licensors_Nozomi_Entertainment', 'Licensors_Sentai_Filmworks', 'Licensors_VIZ_Media', 'Members', 'Premiered', 'Producers_AT-X', 'Producers_Aniplex', 'Producers_A

                                                                                

+---------------+--------+-----------+-------------+-----------------+--------------------+-------------------+------------------+------------------+------------------+-------------+--------------------+-------------+----------------+------------+--------------+-------------------+--------------+-------------+-------------+--------------+--------------------+---------------+------------+--------------+------------------+-------------+-----------------+----------------+-------------+--------------+-------+--------+------------+----------+--------+----------+--------------------+------------------+----------------------+-----------------------+--------------------+-----------------+------------------+----------------------------------+------------------+--------------+------------------+------------------+-------------------+-----------------------------------+------------------+----------------+---------------------+------------------+--------------------+-----------------+-------------

In [None]:
spark.stop()