# Movies recommandation model

## 1. Imports

### 1.1 Libraries

In [1]:
# builtin
import os, time, sys, random

# data
import pandas as pd
import numpy as np
import requests
import math

# viz
import seaborn as sns
import matplotlib.pyplot as plt

# ML
from gensim.models import Word2Vec
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# other
import warnings
warnings.filterwarnings("ignore")

### 1.2 Download and options

In [2]:
sns.set()

### 1.3 Loading data

In [3]:
# CSV

#Env Perso
#df = pd.read_csv(r"C:\Users\derou\OneDrive\Bureau\DATA\PORTFOLIO\Recommandation de films\df_movies_txt_cleaned.csv")

# Env Vinci
df = pd.read_csv(r"C:\Users\melvin.derouk\Desktop\Data formation\Movies-Recommandations\df_movies_txt_cleaned.csv")

## 2. Modèle de recommandation

### 5.1 Bag of words

In [None]:
#Bag of words
model = Word2Vec(df['clean_synopsis'], vector_size=300, window=5, min_count=1, sg=1)

In [None]:
model.train(df['clean_synopsis'], total_examples=len(df), epochs=10)

(3265098, 3372070)

In [None]:
similar_words = model.wv.most_similar("")
similar_words

[('scream', 0.929614245891571),
 ('squar', 0.9047838449478149),
 ('décrit', 0.8986225128173828),
 ('icon', 0.8969197869300842),
 ('préquel', 0.8961989283561707),
 ('classif', 0.8959721326828003),
 ('théâtral', 0.8935500979423523),
 ('oscaris', 0.8910315036773682),
 ('compil', 0.8889420628547668),
 ('encyclopedi', 0.8863794207572937)]

### 5.2 Vectorisation 

In [None]:
df['clean_synopsis_str'] = df['clean_synopsis'].apply(lambda x: ' '.join(x))

In [None]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['clean_synopsis_str'])

In [None]:
#count = CountVectorizer()
#count_matrix = count.fit_transform(df['clean_synopsis_str'])

In [None]:
# Calcul de la similarité cosinus
cosine_sim = cosine_similarity(count_matrix, count_matrix)
#cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
indices = pd.Series(df.index, index=df['Titre']).to_dict()

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Obtenez l'index du film qui correspond au titre
    idx = df.index[df['Titre'] == title].tolist()[0]

    # Obtenez les scores de similarité pour ce film par rapport à tous les autres
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Triez les films en fonction des scores de similarité
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Obtenez les scores des 10 films les plus similaires
    sim_scores = sim_scores[1:11]

    # Obtenez les indices de films
    movie_indices = [i[0] for i in sim_scores]

    # Retournez les titres des 10 films les plus similaires
    return df['Titre'].iloc[movie_indices]

In [None]:
# Obtenir des recommandations pour un film donné
recommendations = get_recommendations("Coco")
print(recommendations)

1267                                     Pinocchio
8426                              Le Dernier Wagon
383                                       Parasite
3666                 À la Poursuite de Ricky Baker
3002    L'enfant, la taupe, le renard et le cheval
4906                The Boyfriend : Pourquoi lui ?
2858                               Armageddon Time
1425                   Je veux manger ton pancréas
4013                          Jun, la voix du cœur
7619                                Le Ruban blanc
Name: Titre, dtype: object


### Vectorisation binaires des genres

In [None]:
# Vectorisation binaires des genres
print(len(df))
print(df['Genre'].apply(lambda x: isinstance(x, str)).sum())

In [None]:
df['Genre'] = df['Genre'].apply(lambda x: eval(x) if isinstance(x, str) else x)
print(df['Genre'].apply(lambda x: isinstance(x, list)).sum())

In [None]:
mlb = MultiLabelBinarizer()
genre_binarized = mlb.fit_transform(df['Genre'])

# Créer un DataFrame avec les résultats
genre_df = pd.DataFrame(genre_binarized, columns=mlb.classes_)

In [None]:
genre_df.index = df.index

In [None]:
df = pd.concat([df.drop('Genre', axis=1), genre_df], axis=1)
df