# Pre-processing dei dati

In questo notebook vengono normalizzati i dati: a ogni rating di un utente viene sottratto il rating medio dell'utente stesso, così da rendere gli utenti confrontabili tra loro.

In [2]:
# import delle librerie necessarie
import numpy as np
import pandas as pd

In [3]:
# Load the raw Jester joke ratings and preview the first five rows.
ratings_df = pd.read_csv(filepath_or_buffer='./data/jester_jokes_ratings.csv')
ratings_df.head(n=5)

Unnamed: 0,user_id,number_of_jokes_rated,joke_1,joke_2,joke_3,joke_4,joke_5,joke_6,joke_7,joke_8,...,joke_91,joke_92,joke_93,joke_94,joke_95,joke_96,joke_97,joke_98,joke_99,100
0,1,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,...,2.82,99.0,99.0,99.0,99.0,99.0,-5.63,99.0,99.0,99.0
1,2,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
2,3,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27,...,99.0,99.0,99.0,9.08,99.0,99.0,99.0,99.0,99.0,99.0
3,4,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21,...,99.0,99.0,99.0,0.53,99.0,99.0,99.0,99.0,99.0,99.0
4,5,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6


## Esempio di normalizzazione con 10 utenti e 8 barzellette

In [4]:
# Take the first 10 users and the first 10 columns
# (2 metadata columns + 8 joke columns).
# .copy() makes r_df an independent frame: a column is assigned into r_df
# further down, and doing that on a bare slice of ratings_df is chained
# assignment (SettingWithCopyWarning, ambiguous effect on ratings_df).
r_df = ratings_df.iloc[0:10, 0:10].copy()
r_df.head()

Unnamed: 0,user_id,number_of_jokes_rated,joke_1,joke_2,joke_3,joke_4,joke_5,joke_6,joke_7,joke_8
0,1,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17
1,2,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34
2,3,49,99.0,99.0,99.0,99.0,9.03,9.27,9.03,9.27
3,4,48,99.0,8.35,99.0,99.0,1.8,8.16,-2.82,6.21
4,5,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61


In [5]:
# 99 marks a rating that is not present, so it is replaced with NaN
# (this cell does it for the joke_1 column only)
r_df['joke_1'] = r_df['joke_1'].replace(to_replace=[99], value=np.nan)
r_df

Unnamed: 0,user_id,number_of_jokes_rated,joke_1,joke_2,joke_3,joke_4,joke_5,joke_6,joke_7,joke_8
0,1,74,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17
1,2,100,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34
2,3,49,,99.0,99.0,99.0,9.03,9.27,9.03,9.27
3,4,48,,8.35,99.0,99.0,1.8,8.16,-2.82,6.21
4,5,91,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61
5,6,100,-6.17,-3.54,0.44,-8.5,-7.09,-4.32,-8.69,-0.87
6,7,47,,99.0,99.0,99.0,8.59,-9.85,7.72,8.79
7,8,100,6.84,3.16,9.17,-6.21,-8.16,-1.7,9.27,1.41
8,9,100,-3.79,-3.54,-9.42,-6.89,-8.74,-0.29,-5.29,-8.93
9,10,72,3.01,5.15,5.15,3.01,6.41,5.15,8.93,2.52


In [6]:
# Subtract the user's own mean rating from each of their joke ratings,
# for the user at row position 1 (user_id 2 in the table shown above)
r_df.iloc[1, 2:] - r_df.iloc[1, 2:].mean()

joke_1    4.52875
joke_2    0.15875
joke_3    6.80875
joke_4    4.81875
joke_5   -1.93125
joke_6   -9.21125
joke_7   -0.28125
joke_8   -4.89125
Name: 1, dtype: float64

## Normalizzazione di tutti i rating degli utenti

In [7]:
# select all the joke columns of the example frame
joke_columns = r_df.columns[2:] # jokes start at positional index 2, i.e. the third column
joke_columns

Index(['joke_1', 'joke_2', 'joke_3', 'joke_4', 'joke_5', 'joke_6', 'joke_7',
       'joke_8'],
      dtype='object')

In [8]:
# si creano le funzioni per fare la normalizzazione come mostrato sopra

# Replace the rating value 99 with NaN.
def replace_99(ratings):
    """Replace every 99 in the rating columns with NaN.

    All columns from position 2 onward are processed; the first two
    columns are left as they are. The frame is modified in place.

    Args:
        ratings: DataFrame whose columns from position 2 onward hold ratings.

    Returns:
        The same ``ratings`` object that was passed in.
    """
    missing_marker = [99]
    for column in ratings.columns[2:]:
        cleaned = ratings[column].replace(missing_marker, np.nan)
        ratings[column] = cleaned
    return ratings

# Mean-centre the ratings of every user.
def normalization(ratings):
    """Subtract from each user's ratings that user's own mean rating.

    Every column from position 2 onward is treated as a rating column; the
    first two columns are left untouched. NaN entries are skipped when the
    mean is computed and remain NaN in the result.

    The frame is modified in place.

    Args:
        ratings: DataFrame with one row per user.

    Returns:
        The same ``ratings`` object, with its rating columns mean-centred
        row by row.
    """
    joke_columns = ratings.columns[2:]
    if joke_columns.empty:
        # No rating columns: nothing to centre.
        return ratings

    joke_ratings = ratings[joke_columns]
    # One vectorised pass instead of a Python loop over every row:
    # mean(axis=1) gives the per-row mean (NaN skipped by default) and
    # sub(..., axis=0) subtracts it row by row, matched on the row index.
    user_means = joke_ratings.mean(axis=1)
    ratings[joke_columns] = joke_ratings.sub(user_means, axis=0)
    return ratings

# Replace NaN with 0.
def replace_0(ratings):
    """Replace every NaN in the rating columns with 0.

    All columns from position 2 onward are processed; the first two
    columns are left as they are. The frame is modified in place.

    Args:
        ratings: DataFrame whose columns from position 2 onward hold ratings.

    Returns:
        The same ``ratings`` object that was passed in.
    """
    for column in ratings.columns[2:]:
        ratings[column] = ratings[column].replace(to_replace=[np.nan], value=0)
    return ratings

In [9]:
# Normalise all ratings with the functions defined above:
# first 99 -> NaN, then per-user mean-centring.
# NaNs are deliberately left in place (replace_0 is not applied).
# Both functions modify their argument in place, so ratings_df and
# normalized_ratings refer to the same, now normalised, frame.
normalized_ratings = normalization(replace_99(ratings_df))
normalized_ratings.head()

Unnamed: 0,user_id,number_of_jokes_rated,joke_1,joke_2,joke_3,joke_4,joke_5,joke_6,joke_7,joke_8,...,joke_91,joke_92,joke_93,joke_94,joke_95,joke_96,joke_97,joke_98,joke_99,100
0,1,74,-4.388108,12.221892,-6.228108,-4.728108,-4.088108,-5.068108,-6.418108,7.601892,...,6.251892,,,,,,-2.198108,,,
1,2,100,1.3337,-3.0363,3.6137,1.6237,-5.1263,-12.4063,-3.4763,-8.0863,...,0.0737,-7.6963,-3.0363,5.1137,-2.9363,-4.8863,0.3137,-2.4063,-7.0663,-1.6763
2,3,49,,,,,1.930612,2.170612,1.930612,2.170612,...,,,,1.980612,,,,,,
3,4,48,,5.691875,,,-0.858125,5.501875,-5.478125,3.551875,...,,,,-2.128125,,,,,,
4,5,91,5.247363,1.357363,-7.422637,-8.642637,-1.892637,-1.652637,3.787363,1.357363,...,1.937363,2.327363,1.017363,1.937363,2.477363,-1.702637,-0.142637,3.297363,-1.452637,-1.652637


In [10]:
# Save the normalised ratings. index=False (the documented boolean form,
# instead of relying on None being falsy) keeps the row index out of the CSV.
normalized_ratings.to_csv('data/normalized_ratings.csv', index=False)