# Normaliseren

In [1]:
import numpy as np
import pandas as pd

def missing(df):
    """Return the rows of `df` that contain at least one missing value.

    A row is selected when `isna()` is true for any of its columns; the
    result keeps the original index labels and column order.
    """
    has_gap = df.isna().any(axis=1)
    return df[has_gap]

def replace(df, s):
    """Return a new frame in which column `s.name` of `df` is replaced by `s`.

    The existing column is dropped (pandas raises `KeyError` if it is not
    present), `s` is joined back in on the index, and the columns are put
    back into the order they had in `df`. `df` itself is not modified.
    """
    original_order = df.columns
    without_old = df.drop(columns=s.name)
    with_new = without_old.join(s)
    return with_new.reindex(columns=original_order)

In [2]:
# Load the three `*_clean` parquet tables, in the order movies, ratings, users.
movies, ratings, users = (
    pd.read_parquet(path)
    for path in (
        'data/movies_clean.parquet',
        'data/ratings_clean.parquet',
        'data/users_clean.parquet',
    )
)

## Vijf klassen
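Ter voorbereiding van classificatie: elf klassen (ratings 0 t/m 10) is excessief, dus de ratings worden lineair teruggeschaald naar vijf klassen (1 t/m 5).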

In [3]:
# Rescale the ratings linearly onto five classes and round to the nearest
# integer: (r - 5) / 5 * 2 + 3 maps 0 -> 1, 5 -> 3 and 10 -> 5.
# The builtin `int` is used instead of `np.int`: that name was only an alias
# for `int`, was deprecated in NumPy 1.20 and removed in NumPy 1.24, so
# `np.int` raises AttributeError on current NumPy versions.
ratings = replace(ratings, ((ratings['rating'] - 5) / 5 * 2 + 3).round().astype(int))
ratings['rating'].describe()

count    857465.000000
mean          3.931807
std           0.841432
min           1.000000
25%           3.000000
50%           4.000000
75%           5.000000
max           5.000000
Name: rating, dtype: float64

## Doorlopende IDs
Ter voorbereiding van embedding: de IDs moeten aaneengesloten van 1 t/m N lopen, zonder gaten.

### Movies

In [4]:
# Number the movies 1..len(movies) in row order, then put the new
# `movie_id` column first.
movies = (
    movies
        .join(pd.Series(range(1, len(movies) + 1), index=movies.index, name='movie_id'))
        .loc[:, ['movie_id', 'imdb_id', 'title', 'year', 'genres']]
)
movies.head()

Unnamed: 0,movie_id,imdb_id,title,year,genres
0,1,8,Edison Kinetoscopic Record of a Sneeze,1894,"[documentary, short]"
1,2,10,La sortie des usines Lumière,1895,"[documentary, short]"
2,3,12,The Arrival of a Train,1896,"[documentary, short]"
3,4,25,The Oxford and Cambridge University Boat Race,1895,
4,5,91,Le manoir du diable,1896,"[short, horror]"


### Users

In [5]:
# Number of user rows; compare with the largest user_id shown next.
users.shape[0]

62798

In [6]:
# Largest existing user_id; a value above the row count means the IDs have gaps.
users.loc[:, 'user_id'].max()

67630

In [7]:
# Add a consecutive ID 1..len(users) in row order as `user_id_new`; the old
# `user_id` column is kept for now so it can still serve as a join key.
users = users.join(
    pd.Series(
        data=range(1, len(users) + 1),
        index=users.index,
        name='user_id_new',
    )
)
users.head()

Unnamed: 0,user_id,twitter_id,user_id_new
0,1,139564917,1
1,2,522540374,2
2,3,475571186,3
3,4,215022153,4
4,5,349681331,5


### Ratings

In [8]:
# Preview the ratings before their keys are swapped for the new IDs.
ratings.head(n=5)

Unnamed: 0,user_id,imdb_id,rating,date_time
0,1,114508,4,2013-10-05 21:00:50
1,2,208092,3,2020-04-09 21:01:12
2,2,358273,5,2020-01-15 03:10:27
3,2,10039344,3,2020-01-09 20:50:53
4,2,6199572,2,2020-05-14 18:54:43


In [9]:
# Look up the consecutive `movie_id` for every rating through its `imdb_id`
# (left merge, so every rating row is kept), then drop the IMDb key.
ratings = (
    ratings
        .merge(movies[['imdb_id', 'movie_id']], how='left', on='imdb_id')
        .drop('imdb_id', axis='columns')
)
ratings.head()

Unnamed: 0,user_id,rating,date_time,movie_id
0,1,4,2013-10-05 21:00:50,8897
1,2,3,2020-04-09 21:01:12,10930
2,2,5,2020-01-15 03:10:27,12938
3,2,3,2020-01-09 20:50:53,35241
4,2,2,2020-05-14 18:54:43,32579


In [10]:
# Look up the consecutive `user_id_new` for every rating through the old
# `user_id` (left merge, so every rating row is kept), then drop the old key.
ratings = (
    ratings
        .merge(users[['user_id', 'user_id_new']], how='left', on='user_id')
        .drop('user_id', axis='columns')
)
ratings.head()

Unnamed: 0,rating,date_time,movie_id,user_id_new
0,4,2013-10-05 21:00:50,8897,1
1,3,2020-04-09 21:01:12,10930,2
2,5,2020-01-15 03:10:27,12938,2
3,3,2020-01-09 20:50:53,35241,2
4,2,2020-05-14 18:54:43,32579,2


In [11]:
# Restore the original column order and let the new consecutive ID take
# over the `user_id` name.
ratings = (
    ratings
        .loc[:, ['user_id_new', 'movie_id', 'rating', 'date_time']]
        .rename(columns={'user_id_new': 'user_id'})
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,date_time
0,1,8897,4,2013-10-05 21:00:50
1,2,10930,3,2020-04-09 21:01:12
2,2,12938,5,2020-01-15 03:10:27
3,2,35241,3,2020-01-09 20:50:53
4,2,32579,2,2020-05-14 18:54:43


Verwijder tijdelijke kolom.

In [12]:
# Keep only the new consecutive ID (renamed to `user_id`) and the Twitter
# ID; the old `user_id` column is dropped here.
users = (
    users
        .loc[:, ['user_id_new', 'twitter_id']]
        .rename(columns={'user_id_new': 'user_id'})
)
users.head()

Unnamed: 0,user_id,twitter_id
0,1,139564917
1,2,522540374
2,3,475571186
3,4,215022153
4,5,349681331


# Bewaren

In [13]:
# Write the three normalized tables to parquet, in the order movies,
# ratings, users.
for frame, target in (
    (movies, 'data/movies_normalized.parquet'),
    (ratings, 'data/ratings_normalized.parquet'),
    (users, 'data/users_normalized.parquet'),
):
    frame.to_parquet(target)