# Movie recommendation model

## 1. Imports

### 1.1 Libraries

In [1]:
# builtin
import os, time, sys, random

# data
import pandas as pd
import numpy as np
import requests
import math

# viz
import seaborn as sns
import matplotlib.pyplot as plt

# ML
from gensim.models import Word2Vec
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# other
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
from pandarallel import pandarallel
import joblib

### 1.2 Download and options

In [2]:
# Apply seaborn's default theme to every matplotlib figure drawn below.
# `sns.set()` is only an alias of `set_theme()` that seaborn marks for possible
# removal, so the preferred spelling is used here.
sns.set_theme()

### 1.3 Loading data

In [6]:
# Load the two CSV inputs: the preprocessed movie table and the genre table.
#
# The folder holding both files is taken from the MOVIES_DATA_DIR environment
# variable so the notebook can run on another machine without editing this
# cell. When the variable is not set, the original workstation folder is used,
# which keeps the previous behaviour unchanged.
DATA_DIR = os.environ.get(
    "MOVIES_DATA_DIR",
    r"C:\Users\melvin.derouk\Desktop\Data formation\Movies-Recommandations",
)

df = pd.read_csv(os.path.join(DATA_DIR, "df_movies_preprocess.csv"))
genre_df = pd.read_csv(os.path.join(DATA_DIR, "genres_binarized.csv"))

## 2. Featurization

In [7]:
# Modelling view of the movie table: four descriptive columns followed by
# every column name found in the genre table.
base_columns = ['Titre', 'Age du film', 'clean_synopsis_str', 'Note']
features = df[base_columns + genre_df.columns.tolist()]

In [8]:
# Column-wise preprocessing pipeline.
#
# Text vectorizers need a 1-D sequence of strings, so their column selector
# must be a plain string. A list selector (as used for the scaler, which wants
# a 2-D input) would hand TfidfVectorizer a one-column frame instead of the
# titles themselves.
preprocessor = ColumnTransformer(
    transformers=[
        ('Titre', TfidfVectorizer(), 'Titre'),
        ('clean_synopsis', TfidfVectorizer(), 'clean_synopsis_str'),
        ('Note', MinMaxScaler(), ['Note'])
    ],
    # Columns not named above are forwarded unchanged.
    remainder='passthrough'
)

In [9]:
# Multipliers applied to individual feature blocks before stacking. Scaling a
# block up or down changes how much it contributes to the cosine similarity
# computed later.
TITLE_WEIGHT = 1.5
NOTE_WEIGHT = 0.5

# Genre columns read from df (presumably 0/1 indicator flags - TODO confirm).
genre_columns = [
    'Action', 'Animation', 'Aventure', 'Comédie', 'Crime', 'Documentaire',
    'Drame', 'Familial', 'Fantastique', 'Guerre', 'Histoire', 'Horreur',
    'Musique', 'Mystère', 'Romance', 'Science-Fiction', 'Thriller',
    'Téléfilm', 'Western',
]

# TF-IDF of the titles, up-weighted.
title_vectorizer = TfidfVectorizer()
titre_tfidf = title_vectorizer.fit_transform(df['Titre']) * TITLE_WEIGHT

# TF-IDF of the cleaned synopsis text, left at weight 1.
synopsis_vectorizer = TfidfVectorizer()
synopsis_tfidf = synopsis_vectorizer.fit_transform(df['clean_synopsis_str'])

# Rating rescaled by MinMaxScaler (default range [0, 1]), then down-weighted.
note_scaler = MinMaxScaler()
note_scaled = note_scaler.fit_transform(df[['Note']]) * NOTE_WEIGHT

# Genre columns as a plain NumPy array, one row per movie.
genre_matrix = df[genre_columns].to_numpy()

In [10]:
# Concatenate the feature blocks column-wise into a single sparse matrix
# (one row per movie, one column per feature).
feature_blocks = [titre_tfidf, synopsis_tfidf, note_scaled, genre_matrix]
features_vectorized = hstack(feature_blocks)

In [11]:
# Cosine similarity between every pair of rows of the stacked feature matrix.
# NOTE(review): with the default settings the result is a dense n x n array,
# so memory grows quadratically with the number of movies - confirm it fits.
cosine_sim = cosine_similarity(X=features_vectorized)

In [12]:
# Show the matrix summary (shape, dtype, stored elements, sparse format).
features_vectorized

<8893x20783 sparse matrix of type '<class 'numpy.float64'>'
	with 370673 stored elements in COOrdinate format>

## 3. Recommendation function

In [15]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['Titre']``. When several rows carry
        the same title, the first one is used as the query movie.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` is assumed to describe the
        ``i``-th row of ``df``. Defaults to the module-level ``cosine_sim``
        as it existed when this function was defined.

    Returns
    -------
    recommendations : pandas.Series
        Titles of the (at most) 10 most similar movies, most similar first.
    movie_scores : list
        Similarity scores, aligned with ``recommendations``.

    Raises
    ------
    IndexError
        If no row of ``df`` has this exact title.
    """
    # Work with row *positions*: the similarity matrix is positional, and
    # index labels only coincide with positions for a default RangeIndex.
    matches = np.flatnonzero((df['Titre'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No movie titled {title!r} found in df['Titre']")
    idx = int(matches[0])

    sim_scores = sorted(enumerate(cosine_sim[idx]), key=lambda x: x[1], reverse=True)
    # Remove the query movie by position instead of assuming it sorts first:
    # a tied or duplicate row could otherwise be dropped in its place and the
    # query movie recommended to itself.
    sim_scores = [pair for pair in sim_scores if pair[0] != idx][:10]

    movie_indices = [pos for pos, _ in sim_scores]
    movie_scores = [score for _, score in sim_scores]
    recommendations = df['Titre'].iloc[movie_indices]
    return recommendations, movie_scores

# Example: fetch the recommendations for one specific movie and print each
# recommended title with its similarity score rounded to two decimals.
recommendations, scores = get_recommendations("300", cosine_sim=cosine_sim)
for movie_title, similarity in zip(recommendations, scores):
    print(f"{movie_title}: {similarity.round(2)}")

The Last Kingdom : Sept rois doivent mourir: 0.5
Kingdom 2 : En terre lointaine: 0.49
Les Canons de Navarone: 0.49
Eastern Condors: 0.49
Les Douze salopards: 0.49
Quand les aigles attaquent: 0.49
Troie: 0.47
Rambo III: 0.47
Rambo II : La Mission: 0.46
Kingdom: 0.46
