In [48]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import pickle
def save_pickle(model, filename):
    """Serialize ``model`` with pickle and write it to ``filename``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.Pickler(sink).dump(model)
def load_pickle(filename):
    """Read ``filename`` in binary mode and return the object pickled inside it.

    Unpickling can execute arbitrary code, so only use this on files you trust.
    """
    with open(filename, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

In [213]:
# Load the result data written by the Preprocessing_1_IMDb notebook
input_pickle = "rating_movie_user.pickle"
df = load_pickle(input_pickle)

In this notebook, we will primarily focus on extracting new features from existing columns. The normalization, categorization, and embedding will be done in the model_ranking notebook.

### User Feature

In [214]:
# Encode gender as an integer flag: 1 where the value is "M", 0 for anything else.
# NOTE: the column is overwritten in place, so re-running this cell on the
# already-encoded column turns every value into 0.
is_male = df['gender'] == "M"
df['gender'] = is_male.astype('int64')

In [215]:
# Bucketize zip codes by the first two characters of their string form
df['zipcode_bucket'] = df['zip code'].map(lambda code: str(code)[:2])

### Movie Feature

We gather most of the additional movie features through the IMDb API in the *Preprocessing_1_IMDb* notebook. Here we only extract the release year and the first two genres of each movie, and rename a few columns.

In [220]:
# Parse the release dates once, then keep only the calendar year
release_dates = pd.DatetimeIndex(df['release_date'])
df['release_year'] = release_dates.year

In [221]:
# All genre indicator columns, in the order in which they are scanned
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's",
    'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
    'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Thriller', 'War', 'Western',
]

def get_top2_genre(cols):
    """Return the first two genres flagged with 1 in a row of genre indicators.

    ``cols`` is expected to hold one value per entry of ``genre_cols``, in the
    same order. "Top 2" means the first two flagged genres in ``genre_cols``
    order; there is no ranking by relevance.

    Returns a two-element ``pd.Series``; slots with no matching genre are None.
    """
    flagged_positions = np.asarray(cols == 1).nonzero()[0]
    first_two = [genre_cols[pos] for pos in flagged_positions[:2]]
    # Pad so that the result always has exactly two entries
    first_two.extend([None] * (2 - len(first_two)))
    return pd.Series(first_two)

In [222]:
# Derive the two genre columns row by row, then attach them to the frame
top2_genres = df[genre_cols].progress_apply(get_top2_genre, axis=1)
df[['movie_genre_1', 'movie_genre_2']] = top2_genres

100%|██████████| 99863/99863 [00:13<00:00, 7527.77it/s]


In [223]:
# Expose `rating_y` under the name IMDb_rating.
# This adds a copy of the column; `rating_y` itself stays in the frame.
imdb_scores = df['rating_y']
df['IMDb_rating'] = imdb_scores

In [235]:
# Split top2 cast into separate columns
def get_top2_cast(top2_list):
    """Split a cast list into exactly two slots.

    Parameters
    ----------
    top2_list : sequence or None/NaN
        Cast names for one movie. Only the first two entries are used.

    Returns
    -------
    pd.Series
        Two elements: the first and second cast member. A slot is None when
        the input has fewer entries, is empty, or is missing (None/NaN), so
        rows without cast information no longer raise.
    """
    try:
        names = list(top2_list)[:2]
    except TypeError:
        # None, NaN and other non-iterables carry no cast information
        names = []
    # Pad so every row yields two values, whatever the input length
    names.extend([None] * (2 - len(names)))
    return pd.Series(names)

# One row per rating: the first and second cast member as separate columns
cast_columns = df['top2_cast'].progress_apply(get_top2_cast)
df[['cast_1', 'cast_2']] = cast_columns

100%|██████████| 99863/99863 [00:07<00:00, 13714.01it/s]


### Rating Feature

In [236]:
# Expose `rating_x` under the name `rating`.
# This adds a copy of the column; `rating_x` itself stays in the frame.
user_ratings = df['rating_x']
df['rating'] = user_ratings

In [237]:
# Running avg / std / count of each user's ratings in timestamp order, i.e. without
# looking at later ratings.
# NOTE(review): the expanding window includes the current row, so each statistic
# already contains the rating on that row - confirm this is intended when `rating`
# is the prediction target.
df1 = df.sort_values(by=['userId', 'timestamp'], ascending=True).reset_index()

# One expanding window per user, shared by the three aggregations
per_user_ratings = df1.groupby(df1.userId)['rating'].expanding()
user_avg_rating = per_user_ratings.mean().reset_index()
user_std_rating = per_user_ratings.std().reset_index()
user_rating_count = per_user_ratings.count().reset_index()

# The aggregates come back grouped by userId in row order, which matches df1's sort
# order, so the fresh 0..n-1 index lines up with df1 row by row.
df1['user_avg_rating'] = user_avg_rating['rating']
# std is undefined for a user's first rating; use 0 there
df1['user_std_rating'] = user_std_rating['rating'].fillna(0)
df1['user_rating_count'] = user_rating_count['rating']

In [238]:
# Sanity check: for one user, the running statistics should evolve in timestamp order
user_stat_cols = ['userId', 'timestamp', 'rating', 'user_avg_rating', 'user_std_rating', 'user_rating_count']
df1.loc[df1.userId == 172, user_stat_cols].head(5)

Unnamed: 0,userId,timestamp,rating,user_avg_rating,user_std_rating,user_rating_count
16391,172,875536498,3,3.0,0.0,1.0
16392,172,875536591,1,2.0,1.414214,2.0
16393,172,875536721,2,2.0,1.0,3.0
16394,172,875537099,1,1.75,0.957427,4.0
16395,172,875537151,4,2.2,1.30384,5.0


In [239]:
# Running avg / std / count of each movie's ratings in timestamp order, i.e. without
# looking at later ratings.
# NOTE(review): the expanding window includes the current row, so each statistic
# already contains the rating on that row - confirm this is intended when `rating`
# is the prediction target.
df2 = df1.sort_values(by=['movieId', 'timestamp'], ascending=True).reset_index()

# One expanding window per movie, shared by the three aggregations
per_movie_ratings = df2.groupby(df2.movieId)['rating'].expanding()
movie_avg_rating = per_movie_ratings.mean().reset_index()
movie_std_rating = per_movie_ratings.std().reset_index()
movie_rating_count = per_movie_ratings.count().reset_index()

# The aggregates come back grouped by movieId in row order, which matches df2's sort
# order, so the fresh 0..n-1 index lines up with df2 row by row.
df2['movie_avg_rating'] = movie_avg_rating['rating']
# std is undefined for a movie's first rating; use 0 there
df2['movie_std_rating'] = movie_std_rating['rating'].fillna(0)
df2['movie_rating_count'] = movie_rating_count['rating']

In [240]:
# Sanity check: for one movie, the running statistics should evolve in timestamp order
movie_stat_cols = ['movieId', 'timestamp', 'rating', 'movie_avg_rating', 'movie_std_rating', 'movie_rating_count']
df2.loc[df2.movieId == 172, movie_stat_cols].head(5)

Unnamed: 0,movieId,timestamp,rating,movie_avg_rating,movie_std_rating,movie_rating_count
23857,172,874729901,5,5.0,0.0,1.0
23858,172,874782191,4,4.5,0.707107,2.0
23859,172,874785889,4,4.333333,0.57735,3.0
23860,172,874792435,5,4.5,0.57735,4.0
23861,172,874829883,5,4.6,0.547723,5.0


In [254]:
# Get the favorite movie genre of each user so far, in timestamp order, i.e. without
# looking at later ratings.
#
# The previous version filtered to liked rows inside groupby().apply() and then called
# .expanding().sum() on the result. That had two problems:
#   * the expanding sum ran over ALL users at once instead of restarting per user;
#   * the filtered result had fewer rows than df3 and was assigned by position after
#     reset_index(), so counts landed on the wrong rows and the tail became NaN.
# Both are avoided by masking the genre flags and using a per-user cumulative sum,
# which keeps df3's index.
#
# NOTE(review): the running count includes the current row, so a liked current movie
# already counts towards the favorite genre on its own row - confirm this is intended.
df3 = df2.sort_values(['userId', 'timestamp'], ascending=[True, True]).reset_index(drop=True)

# Keep the genre flags only on rows the user liked (rating >= 4); other rows contribute 0
liked = (df3['rating_x'] >= 4).astype(int)
liked_genre_flags = df3[genre_cols].mul(liked, axis=0)

# Per-user running number of liked movies in each genre, aligned with df3 row by row
liked_genre_counts = liked_genre_flags.groupby(df3['userId']).cumsum()
for g in genre_cols:
    df3[g + '_sum'] = liked_genre_counts[g]

movie_genres_sum = [g + '_sum' for g in genre_cols]
# find genre with highest count; ties resolve to the earliest entry of genre_cols
fav_genre = liked_genre_counts.idxmax(axis=1)
# A user with no liked movie so far has no favorite yet: label those rows 'unknown'
has_liked_any = liked_genre_counts.sum(axis=1) > 0
df3['user_fav_genre'] = fav_genre.where(has_liked_any, 'unknown')

In [256]:
# Sanity check: compare the favorite genre of one user with the genres of the movies they rated
fav_genre_cols = ['userId', 'timestamp', 'user_fav_genre', 'rating'] + genre_cols
df3.loc[df3.userId == 1, fav_genre_cols].head(5)

Unnamed: 0,userId,timestamp,user_fav_genre,rating,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,874965478,Comedy,5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,874965478,Action,5,0,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,0
2,1,874965518,Drama,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,1,874965556,Drama,4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
4,1,874965677,Drama,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [263]:
# Favorite movie of each user so far: the highest-rated movie among the rows seen up to
# and including the current one (ties go to the larger movieId).
# NOTE(review): the current row is included in the running maximum - confirm this is
# intended when `rating` is the prediction target.
df4 = df3.copy()

# Pack (rating, movieId) into one integer that sorts by rating first, then by movieId.
# The previous string concatenation let the id length dominate: rating 4 + movie 100
# became 4100, which beat rating 5 + movie 7 (57). Using a base larger than every
# movieId guarantees that the rating always decides.
movie_id_base = int(df4['movieId'].max()) + 1
df4['rating_and_movie'] = df4['rating'].astype(int) * movie_id_base + df4['movieId'].astype(int)

# for each user, the running max of 'rating_and_movie'; cummax keeps df4's index,
# so the values stay aligned with their rows
temp1 = df4.groupby(df4['userId'])['rating_and_movie'].cummax()

# strip the rating part and keep the movie id
df4['user_fav_movieId'] = (temp1 % movie_id_base).astype(int)

### Save features to disk

In [264]:
# Select the feature columns to keep, grouped by kind, and write them to disk
user_feature_cols = ['userId', 'age', 'gender', 'occupation', 'zipcode_bucket']
movie_feature_cols = [
    'movieId', 'movie_genre_1', 'movie_genre_2', 'IMDb_rating',
    'director', 'cast_1', 'cast_2', 'plot embedding', 'release_year',
]
rating_feature_cols = [
    'rating', 'user_avg_rating', 'user_std_rating', 'user_rating_count',
    'movie_avg_rating', 'movie_std_rating', 'movie_rating_count',
    'user_fav_genre', 'user_fav_movieId', 'timestamp',
]

result_df = df4[user_feature_cols + movie_feature_cols + rating_feature_cols]
save_pickle(result_df, 'data.pickle')