# ML4 : Implémentation d'un algorithme pour recommander des films à partir de ses propres recommandations d'autres films

## Libs and config

In [9]:
import numpy as np
import pandas as pd
import keras
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline

from collections import defaultdict
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers import Embedding, Input, Reshape
from keras.layers import Dot

from sklearn.metrics import mean_squared_error
from math import sqrt

from keras.layers import Add
from keras import regularizers
from keras.callbacks import EarlyStopping

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.manifold import TSNE


## Functions

In [10]:
def get_train_test_sets(data_path, train_prop = 0.9, random_state = None):
    """
    Build train and test sets and reindex userIds and movieIds from 0 with contiguous indexes.
    
    Input: 
        data_path : string : the path to the ratings file
        train_prop : float : The proportion of the training set 
        random_state : int or None : Seed forwarded to `train_test_split`. The default (None)
            leaves the split unseeded, so two calls may return different splits.
    
    Output:
        train : pandas.DataFrame : A dataframe with the columns of the ratings file, where
            the userId and movieId values have been replaced with new ids starting at 0. 
            Contains a random `train_prop` share of the entries of the input file.
        test : pandas.DataFrame : Same as `train`, contains the remaining entries.
        nb_users : int : Number of unique user ids
        nb_movies : int : Number of unique movie ids
        user_ids_map : dict : A mapping of original file userId to a new index starting at 0.
            Keys are the values of the original userId column, values are the new indexes.
        movie_ids_map : dict : Same as `user_ids_map` for the movieIds.
    """
    df = pd.read_csv(data_path)

    # np.unique returns the distinct ids in ascending order, so the new index of
    # an id is simply its rank among the distinct ids.
    users_old_index = np.unique(df['userId'])
    movie_old_index = np.unique(df['movieId'])
    nb_users = len(users_old_index)
    nb_movies = len(movie_old_index)

    # {original id: new contiguous index}
    user_ids_map = dict(zip(users_old_index, np.arange(nb_users)))
    movie_ids_map = dict(zip(movie_old_index, np.arange(nb_movies)))

    # Series.map does one lookup per row; DataFrame.replace with a dict this large
    # builds one boolean mask per key, which is far slower for the same result.
    df = df.assign(
        userId=df['userId'].map(user_ids_map),
        movieId=df['movieId'].map(movie_ids_map),
    )

    train, test = train_test_split(df, train_size=train_prop, random_state=random_state)
    return train, test, nb_users, nb_movies, user_ids_map, movie_ids_map

In [11]:
def get_ratings_of_user(df, userid, path_movies_info="../data/ml-latest-small/movies.csv", data_path='../data/ml-latest-small/ratings.csv'):
    """
    Return the ratings of one user with movie titles instead of movie ids.

    Input:
        df : pandas.DataFrame : Ratings with re-indexed `userId` and `movieId` columns
        userid : int : The (re-indexed) user id whose rows are selected
        path_movies_info : string : Path to the movies file (columns `movieId`, `title`)
        data_path : string : Path to the original ratings file, used to recover the
            original movie ids from the re-indexed ones

    Output:
        user_1_ratings : pandas.DataFrame : The rows of `df` for `userid`, where the
            `movieId` column is renamed `movieTitle` and holds the movie titles.
            Ids without a match in either mapping are left untouched.
    """
    movie_names = pd.read_csv(path_movies_info)
    movie_names_id_dict = dict(zip(movie_names['movieId'], movie_names['title'])) # {original movieId: title}

    # Re-indexed ids are the ranks of the sorted distinct original ids (the scheme
    # used when the ratings are re-indexed), so the inverse mapping can be rebuilt
    # from the ratings file alone, without recomputing a full train/test split.
    original_movie_ids = np.unique(pd.read_csv(data_path, usecols=['movieId'])['movieId'])
    new_to_original_id = dict(enumerate(original_movie_ids))

    user_1_ratings = (df.loc[df["userId"]==userid,:]
                    .replace(to_replace={'movieId' : new_to_original_id})
                    .replace(to_replace={'movieId' : movie_names_id_dict })
                    .rename(columns={'movieId' : 'movieTitle'})
                    )
    return user_1_ratings

In [12]:
def get_mf_bias_l2_reg_model(nb_users, nb_movies, k, lambda_):
    """
    Build a matrix factorization model with user and movie biases, and L2 regularization.

    The predicted rating is b_i + b_u + <p_u, q_i>, where p_u / q_i are the user / movie
    embeddings and b_u / b_i the user / movie biases.

    Input:
        nb_users : int : The number of unique users
        nb_movies : int : The number of unique movies
        k : int : The size of the embeddings
        lambda_ : float : The L2 penalty factor applied to the user and movie embeddings
            (the bias embeddings are created without regularizer)

    Output:
        model : keras.models.Model : A compiled keras model taking [user ids, movie ids]
            as inputs, with 'mse' as loss and metric and 'adam' as optimizer
    """
    # --- User side: id -> latent vector of size k
    user_input = Input(shape=(1,), dtype='int32', name = 'u__user_id')
    user_factors = Embedding(
        nb_users, k,
        name="p_u__user_embedding",
        embeddings_regularizer=regularizers.L2(l2=lambda_),
    )(user_input)
    user_factors = Reshape((k,), name="p_u__user_embedding_reshaped")(user_factors)

    # --- Movie side: id -> latent vector of size k
    movie_input = Input(shape=(1,), dtype='int32', name = 'i__movie_id')
    movie_factors = Embedding(
        nb_movies, k,
        name="q_i__movie_embedding",
        embeddings_regularizer=regularizers.L2(l2=lambda_),
    )(movie_input)
    movie_factors = Reshape((k,), name="q_i__movie_embedding_reshaped")(movie_factors)

    # --- Interaction term: dot product of the two latent vectors
    interaction = Dot(axes = 1)([user_factors, movie_factors])

    # --- Biases: one scalar per movie and one scalar per user
    movie_bias = Embedding(nb_movies, 1, name="b_i__movie_embedding")(movie_input)
    movie_bias = Reshape((1,), name="b_i__movie_embedding_reshaped")(movie_bias)

    user_bias = Embedding(nb_users, 1, name="b_u__user_embedding")(user_input)
    user_bias = Reshape((1,), name='b_u__user_embedding_reshaped')(user_bias)

    # --- Estimated rating = b_i + b_u + dot product
    total_bias = Add()([movie_bias, user_bias])
    estimated_rating = Add()([total_bias, interaction])

    # The model maps the (user id, movie id) pair to the estimated rating.
    model = Model(inputs=[user_input, movie_input], outputs=estimated_rating)

    # Mean squared error is both the training loss and the reported metric.
    model.compile(loss='mse', optimizer='adam', metrics=["mse"])

    return model

In [13]:
def grid_search(data, param_grid, get_model_function, nb_users, nb_movies, validation_size = 0.1):
    """
    Performs a grid search over every (k, lambda_) combination of `param_grid` and
    keeps the model with the lowest validation RMSE.

    Each candidate is trained for at most 500 epochs with a batch size of 512 and
    early stopping on `val_mse` (patience 5).
    
    Input:
        data : DataFrame : The training set to be split between training and validation sets
            (columns `userId`, `movieId` and `rating` are used)
        param_grid : dict : Dictionary containing the values of the hyper-parameters to grid-search,
            under the keys 'k' and 'lambda_'
        get_model_function : function : A function that returns the keras model to grid-search,
            called as get_model_function(nb_users, nb_movies, k, lambda_)
        nb_users : int : The number of unique users
        nb_movies : int : The number of unique movies
        validation_size : float : Proportion of the validation set
        
    Output:
        best_params : dict : A dictionary of the best hyper-parameters values
        best_score : float : The validation RMSE corresponding to the best hyper-parameters
        best_model : keras.Model : The model trained with the best hyper-parameters

    Raises:
        ValueError : if `param_grid` yields no (k, lambda_) combination
    """
    X_train = [data["userId"].to_numpy(), data["movieId"].to_numpy()]
    y_train = data["rating"].to_numpy()

    best_params = None
    best_score = float("inf")
    best_model = None

    for k in param_grid['k']:
        for lambda_ in param_grid['lambda_']:
            candidate = get_model_function(nb_users, nb_movies, k, lambda_)
            early_stopping = EarlyStopping(monitor='val_mse', patience=5, verbose=1, restore_best_weights=True)
            history = candidate.fit(X_train, y_train, epochs=500, batch_size=512, validation_split=validation_size, callbacks=[early_stopping])

            # Model selection must use the held-out split: the training 'mse' keeps
            # decreasing with capacity and would always favour the least regularized model.
            val_rmse = sqrt(min(history.history['val_mse']))

            # Strict comparison keeps the first candidate in case of ties; the first
            # candidate is always accepted so a result is returned even for a NaN score.
            if best_model is None or val_rmse < best_score:
                best_params = {'k' : k, 'lambda_' : lambda_}
                best_score = val_rmse
                best_model = candidate

    if best_model is None:
        raise ValueError("param_grid must contain at least one value for 'k' and for 'lambda_'")

    return best_params, best_score, best_model

In [14]:
def get_top5_for_user(model, user_id, dataset):
    """
    Returns a list of the 5 movies that have the highest predicted ratings among the
    unrated movies of user `user_id`, along with a list of their predicted ratings.
    
    Input :
        model : keras.models.Model : A trained matrix factorization model; only its
            `predict([user_ids, movie_ids])` method is used
        user_id : int : The user id to use
        dataset : DataFrame : The whole dataset, useful to find the movies 
            the user `user_id` has already rated
    
    Output :
        five_best_movie_ids : list : The (at most) five movie ids among unrated movies by
            user `user_id` that have the highest predicted ratings, in decreasing order
        five_best_ratings : list : The corresponding ratings, rounded to 2 decimals.
        Both lists are empty when the user has rated every movie of `dataset`.
    """
    # Ids of the movies already rated by the user
    rated_movies = dataset.loc[dataset['userId'] == user_id, "movieId"].to_numpy()

    # Ids of all the movies, in order of first appearance
    all_movies = np.asarray(dataset['movieId'].unique())

    # Vectorized set difference that keeps the order of `all_movies`
    unrated_movies_by_user = all_movies[~np.isin(all_movies, rated_movies)]
    if unrated_movies_by_user.size == 0:
        # Nothing left to recommend; avoids calling the model on empty inputs
        return [], []

    # One (user, movie) pair per unrated movie
    user_id_array = np.repeat(user_id, repeats=len(unrated_movies_by_user))
    predicted_ratings = np.asarray(model.predict([user_id_array, unrated_movies_by_user]))[:, 0]

    # Positions of the 5 highest predictions, best first
    top_positions = np.argsort(-predicted_ratings, kind="stable")[:5]

    five_best_movie_ids = unrated_movies_by_user[top_positions].astype(int).tolist()
    five_best_ratings = [round(rating, 2) for rating in predicted_ratings[top_positions].tolist()]

    return five_best_movie_ids, five_best_ratings

## Dataset

In [15]:
# Locations of the ratings and movies files
movie_names_path = "../data/ml-latest-small/movies.csv"
ratings_s_path = '../data/ml-latest-small/ratings.csv'

# Re-indexed train / test split, id counts and {original id: new id} maps
(train, test, nb_users, nb_movies,
 user_ids_map, movie_ids_map) = get_train_test_sets(data_path=ratings_s_path)

# Whole (re-indexed) dataset: train rows followed by test rows
dataset = pd.concat([train, test])

print("There are %i movies, %i users, and %i ratings" % (nb_movies, nb_users, len(dataset)))

There are 9724 movies, 610 users, and 100836 ratings


### Adding new entries

In [141]:
#loading movie data
movie_names = pd.read_csv(movie_names_path)


# Select 75 random movies. replace=False guarantees distinct rows: the default
# (sampling with replacement) can hand out the same movie several times.
random_movies = np.random.choice(a=movie_names['title'], size=min(75, len(movie_names)), replace=False)
random_movies = pd.DataFrame({'title' : random_movies})

# Export the selection to a csv
# NOTE(review): the export at the end of this cell writes to the same path and
# therefore overwrites this file - confirm whether a distinct file name was intended.
random_movies.to_csv('../movies_to_rate/movies_to_rate_231214.csv', index=False)


# Select and export 75 random movies released in 1990 or later.
# The year is taken from the first "(dddd" group found in the title; rows with a
# missing value (e.g. no year in the title) are dropped before the int conversion.
movie_year = movie_names['title'].str.extract(r'\((\d{4})', expand=False)
movies_recent = movie_names.assign(movie_year = movie_year).dropna().astype({'movie_year' : int})
movies_recent = movies_recent.loc[movies_recent['movie_year']>1989,:]

random_movies = np.random.choice(a=movies_recent['title'], size=min(75, len(movies_recent)), replace=False)
random_movies = pd.DataFrame({'title' : random_movies})

random_movies.to_csv('../movies_to_rate/movies_to_rate_231214.csv', index=False)

#rate at least 20 movies from 0 to 5




In [150]:
# Input files: existing ratings, hand-made ratings of the new users, movie titles
ratings_s_path =  '../data/ml-latest-small/ratings.csv'
path_new_data="../data/rated_movies_221214_1545.csv"
movie_names_path = "../data/ml-latest-small/movies.csv"

# Existing ratings
data_path = ratings_s_path
df = pd.read_csv(data_path)

# New ratings: the 'Film' column is renamed 'title', every other column is turned
# into (userName, rating) rows by the melt
df_new_data = pd.read_csv(path_new_data).rename(columns={'Film' : 'title'})
df_new_data = pd.melt(df_new_data, id_vars=['title'], var_name='userName', value_name='rating')

# Give each distinct userName a fresh userId, numbered right after the highest
# userId of the existing ratings
liste_user_name = df_new_data.userName.unique().tolist()
highest_existing_user_id = np.max(df.userId)

new_user_id = {
    'userName': list(liste_user_name),
    'userId': [highest_existing_user_id + offset for offset in range(1, len(liste_user_name) + 1)],
}

new_user_id



Unnamed: 0,title,userName,rating
0,Let the Right One In (Låt den rätte komma in) ...,alex,
1,Lolita (1962),alex,
2,Good Copy Bad Copy (2007),alex,
3,Blazing Saddles (1974),alex,
4,Hare-um Scare-um (1939),alex,
...,...,...,...
670,With Honors (1994),Greg,
671,Chasers (1994),Greg,
672,"Young Poisoner's Handbook, The (1995)",Greg,
673,"World's Fastest Indian, The (2005)",Greg,


### Splitting dataset

In [16]:
# Model inputs are [user ids, movie ids]; the target is the rating.
X_train = [train[column].to_numpy() for column in ("userId", "movieId")]
y_train = train["rating"].to_numpy()

X_test = [test[column].to_numpy() for column in ("userId", "movieId")]
y_test = test["rating"].to_numpy()

In [17]:
movie_names = pd.read_csv(movie_names_path)
# {movieId of the movies file: title}
movie_names_id_dict = movie_names.set_index('movieId')['title'].to_dict()



## Ratings of user x

In [19]:
movie_names_path = "../data/ml-latest-small/movies.csv"
ratings_s_path = '../data/ml-latest-small/ratings.csv'

# Ratings of the user with (re-indexed) id 0, shown with movie titles
get_ratings_of_user(
    df=dataset,
    userid=0,
    path_movies_info=movie_names_path,
    data_path=ratings_s_path,
)

Unnamed: 0,userId,movieTitle,rating,timestamp
144,0,"Few Good Men, A (1992)",4.0,964982989
230,0,Transformers: The Movie (1986),4.0,964982903
42,0,Independence Day (a.k.a. ID4) (1996),3.0,964984086
217,0,Hook (1991),4.0,964981775
145,0,Rush Hour (1998),4.0,964982310
...,...,...,...,...
148,0,I Still Know What You Did Last Summer (1998),2.0,964983546
125,0,"Honey, I Shrunk the Kids (1989)",4.0,964981747
219,0,Gladiator (2000),5.0,964980668
103,0,L.A. Confidential (1997),5.0,964982951


## Model fitting

In [20]:
# Hyper-parameters: embedding size and L2 penalty factor
k = 30
lambda_ = 0.0005

mf_bias_reg_model = get_mf_bias_l2_reg_model(nb_users=nb_users, nb_movies=nb_movies, k=k, lambda_=lambda_)

# Stop once val_mse has not improved for 5 epochs
early_stopping = EarlyStopping(
    monitor='val_mse',
    patience=5,
    verbose=1,
    restore_best_weights=True,
)

# 10% of the training rows are held out for validation
history = mf_bias_reg_model.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=500,
    validation_split=0.1,
    callbacks=[early_stopping],
)

mf_bias_reg_model.predict(X_test)

2023-12-14 13:11:03.534907: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4150 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1660, pci bus id: 0000:61:00.0, compute capability: 7.5


Epoch 1/500


2023-12-14 13:11:05.360823: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-12-14 13:11:05.886973: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f96bb5e4080 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-12-14 13:11:05.887018: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1660, Compute Capability 7.5
2023-12-14 13:11:05.902824: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-12-14 13:11:05.936075: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1702555866.162842   44018 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78/500
Epoch 7

array([[4.007229 ],
       [3.709824 ],
       [3.7847826],
       ...,
       [3.6980538],
       [2.8511043],
       [2.2512484]], dtype=float32)

## Model gridsearch

In [None]:
# Candidate values: embedding sizes and L2 penalty factors
ks = [15, 30]
lambdas_ = [0.0002, 0.00005, 0.00002]

param_grid = {'k': ks, 'lambda_': lambdas_}

best_params, best_score, best_model = grid_search(
    data=train,
    param_grid=param_grid,
    get_model_function=get_mf_bias_l2_reg_model,
    nb_users=nb_users,
    nb_movies=nb_movies,
    validation_size=0.1,
)

print('Best hyper-parameters : ' + str(best_params))
print('Best validation RMSE : ' + str(best_score))

# RMSE of the selected model on the test set
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = sqrt(test_mse)

print("Best model test RMSE : %s " % test_rmse)

## Best model

In [51]:
# Train on the whole dataset (train + test rows)
X = [dataset[column].to_numpy() for column in ("userId", "movieId")]
y = dataset["rating"].to_numpy()

# Hard-coded hyper-parameters, to avoid re-running the grid search
best_params = {'k':30, 'lambda_':0.00005}

best_model = get_mf_bias_l2_reg_model(nb_users, nb_movies, **best_params)

# Stop once val_mse has not improved for 10 epochs
early_stopping = EarlyStopping(
    monitor='val_mse',
    patience=10,
    verbose=1,
    restore_best_weights=True,
)

best_model.fit(
    X,
    y,
    batch_size=512,
    epochs=500,
    validation_split=0.1,
    callbacks=[early_stopping],
)



Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.src.callbacks.History at 0x7f9696929d30>

### Best movies ratings for a user list

In [None]:
liste_users_selected = [0,5,10]

# One small frame per user, concatenated once at the end: growing a frame by
# concatenating onto an empty DataFrame inside the loop degrades the column
# dtypes (hence the integer casts that used to be needed afterwards).
top5_frames = []
for user_id in liste_users_selected:
    five_best_movie_ids, five_best_ratings = get_top5_for_user(model = best_model, user_id=user_id, dataset = dataset)
    top5_frames.append(pd.DataFrame({
        # Repeat the user id once per returned movie (there may be fewer than 5)
        "userId" : np.repeat(user_id, repeats=len(five_best_movie_ids)).astype(int),
        "best_movies_ids" : five_best_movie_ids,
        "best_movies_ratings" : five_best_ratings
    }))

if top5_frames:
    df = pd.concat(top5_frames, ignore_index=True)
else:
    # No user selected: keep the expected columns so the cell still displays a table
    df = pd.DataFrame({"userId" : [], "best_movies_ids" : [], "best_movies_ratings" : []})

df = df.astype({"userId" : int, "best_movies_ids" : int})

df

## Embeddings comprehension with PCA and t-SNE

In [52]:
# Embedding weights of all users: one row per (re-indexed) user id, one column per embedding dimension
users_embeddings = pd.DataFrame(best_model.get_layer(name="p_u__user_embedding").get_weights()[0])

# Embedding weights of all movies: one row per (re-indexed) movie id, one column per embedding dimension
movies_embeddings = pd.DataFrame(best_model.get_layer(name="q_i__movie_embedding").get_weights()[0])

# Titles keyed by the movieId of the movies file
movie_names = pd.read_csv("../data/ml-latest-small/movies.csv")
movie_names_id_dict = dict(zip(movie_names['movieId'], movie_names['title']))

# Inverse of movie_ids_map: {re-indexed movie id: original movieId}
new_to_old_movie_id = {new_id: old_id for old_id, new_id in movie_ids_map.items()}


def _projection_with_titles(coords, axis_names):
    """
    Build a frame [movieId, <axis 1>, <axis 2>, movieTitle] from a 2-D projection.

    The row position of `coords` is used as the re-indexed movie id. Only the first
    two columns of `coords` are kept, named after `axis_names`. The title is looked
    up through the original movie id and is NaN when no title is found.
    """
    frame = (pd.DataFrame(
        {axis_names[0] : coords[:,0],
         axis_names[1] : coords[:,1]})
        .reset_index(names='movieId')
    )
    # Two dictionary lookups per row instead of two large dict-based replace calls
    frame['movieTitle'] = frame['movieId'].map(new_to_old_movie_id).map(movie_names_id_dict)
    return frame


# Scale the movie embeddings
scaler=StandardScaler()
movies_embeddings_scaled = scaler.fit_transform(movies_embeddings)

# PCA: keep the first two principal components for display
pca = PCA()
pca_res = pca.fit_transform(movies_embeddings_scaled)
pca_res_2 = _projection_with_titles(pca_res, ("pc1", "pc2"))

# t-SNE in two dimensions
tsne = TSNE(n_components=2)
tsne_res = tsne.fit_transform(movies_embeddings_scaled)
tsne_res_2 = _projection_with_titles(tsne_res, ("c1", "c2"))



### Display movies by genre

In [53]:
def get_column_genre_select(df:pd.DataFrame, genre_selected: str, path_movies_info: str = "../data/ml-latest-small/movies.csv") -> pd.DataFrame:
    """
    Flag the movies of `df` that belong to a given genre.

    Input:
        df : pandas.DataFrame : A dataframe with a `movieTitle` column
        genre_selected : string : The genre to flag, e.g. 'Adventure'
        path_movies_info : string : Path to the movies file (columns `title` and
            `genres`, the genres of a movie being separated by '|')

    Output:
        df_temp : pandas.DataFrame : A copy of `df` with an extra `genre_select` column
            holding `genre_selected` for the movies whose title is listed under that
            genre in the movies file, and "Autres" for all the other rows.
    """
    movie_names = pd.read_csv(path_movies_info)

    # One row per (movie, genre) pair
    genre_lists = movie_names['genres'].str.split('|')
    movies_genres = movie_names.assign(genres=genre_lists).explode('genres')

    # Titles of the movies tagged with the selected genre
    titles_of_genre = movies_genres.loc[movies_genres['genres'] == genre_selected, 'title']

    # Default label for every row, overridden for the movies of the selected genre
    df_temp = df.assign(genre_select="Autres")
    df_temp.loc[df_temp['movieTitle'].isin(titles_of_genre), 'genre_select'] = genre_selected

    return df_temp

In [54]:
def display_movie_genres_repartition(selected_genre):
    """
    Show the PCA and t-SNE projections of the movies, highlighting one genre.

    Reads the module-level frames `pca_res_2` and `tsne_res_2`, draws one scatter
    plot per projection (PCA on top, tSNE below) and displays the figure. Movies of
    `selected_genre` are drawn opaque in "#AA3377", the others in a translucent grey.

    Input:
        selected_genre : string : The genre to highlight
    """
    # Both projections side by side (the t-SNE frame only contributes its coordinates)
    tsne_coordinates = tsne_res_2.drop(columns=['movieId', 'movieTitle'])
    projections = pd.concat([pca_res_2, tsne_coordinates], axis=1)
    projections = get_column_genre_select(projections, selected_genre)

    # Per-point marker style, driven by the genre flag
    color_map = {"Autres" : "#BBBBBB", selected_genre : "#AA3377"}
    opacity_map = {"Autres" : 0.4, selected_genre : 1}
    marker_colors = projections['genre_select'].map(color_map)
    marker_opacities = projections['genre_select'].map(opacity_map)

    fig = make_subplots(rows=2, cols=1, subplot_titles=("PCA","tSNE"))

    # One trace per projection: (x column, y column), stacked in rows 1 and 2
    axes_per_row = [('pc1', 'pc2'), ('c1', 'c2')]
    for row_number, (x_column, y_column) in enumerate(axes_per_row, start=1):
        scatter = go.Scatter(
            x=projections[x_column],
            y=projections[y_column],
            mode='markers',
            hoverinfo='text',
            hovertext=projections['movieTitle'],
            marker = dict(
                color=marker_colors,
                size=3,
                opacity=marker_opacities
            )
        )
        fig.append_trace(scatter, row=row_number, col=1)

    fig.update_layout(
        height=1000,
        width=600,
        title_text=f"Representation of movies from genre : {selected_genre}",
        showlegend=False,
        template='plotly_white',
    )
    fig.show()

In [55]:
# Plot the PCA and t-SNE projections with the 'Adventure' movies highlighted
display_movie_genres_repartition('Adventure')

In [None]:
# Wide format: the movies file without its 'genres' column, plus one column per
# genre position obtained by splitting the 'genres' string on '|'
genre_columns = movie_names['genres'].str.split(pat='|', expand=True)
movie_identity = movie_names.drop(columns='genres')
movies_genres = pd.concat([movie_identity, genre_columns], axis=1)

# Long format: one row per (movie, genre) pair, empty genre slots dropped
movies_genres = (
    movies_genres
    .melt(id_vars=['movieId', 'title'], value_name='genres')
    .drop(columns='variable')
    .dropna()
)

# Distinct genres, in order of first appearance in the long table
liste_genres = movies_genres['genres'].unique()


# Uncomment to draw the projections for every genre:
# for genre in liste_genres : 
#     display_movie_genres_repartition(genre)