# Système de recommandation

In [1]:
import ast

import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
from tqdm import tqdm

In [2]:
# Register tqdm's progress_apply helpers on pandas objects.
# NOTE(review): no progress_apply call is visible in this part of the notebook.
tqdm.pandas()

In [3]:
# Load the filtered film table and discard the index column that was written
# into the CSV when it was saved.
df_titre = pd.read_csv("df_film_filtre.csv")
df_titre = df_titre.drop(columns="Unnamed: 0")

# Feature frame: everything except the display-only columns.
df = df_titre.drop(columns=["poster_path", "titre"])

## Séparer les colonnes numériques et catégorielles

In [4]:
# Numeric columns are captured before the list columns are parsed; they feed
# the numeric branch of the preprocessor.
col_num = df.select_dtypes('number').columns

# The CSV stores each list as a string. ast.literal_eval only accepts Python
# literals, so unlike eval() it cannot execute code embedded in the file.
df["genre"] = df["genre"].apply(ast.literal_eval)
df["production_countries"] = df["production_countries"].apply(ast.literal_eval)

# Pipeline

On binarise les listes de genres (une colonne 0/1 par genre) afin que l'ordre des genres dans la liste n'ait plus d'importance.

In [5]:
# One 0/1 indicator column per genre; the fitted binarizer is kept because its
# classes_ attribute provides the column names.
multilabelb = MultiLabelBinarizer()
features_cat = multilabelb.fit_transform(df["genre"])
df_cat = pd.DataFrame(data=features_cat, columns=multilabelb.classes_)

In [6]:
# Same encoding for production countries: one 0/1 indicator column per country.
multilabelb_pays = MultiLabelBinarizer()
features_cat_pays = multilabelb_pays.fit_transform(df["production_countries"])
df_cat_pays = pd.DataFrame(data=features_cat_pays, columns=multilabelb_pays.classes_)

In [8]:
# Names of the indicator columns produced by the two binarizers.
col_genre = [*multilabelb.classes_]
col_pays = [*multilabelb_pays.classes_]

In [10]:
# Swap the raw list columns for their indicator columns (joined on the index).
# This rebinds df_titre, so the cell cannot be re-run without reloading the CSV.
df_titre = (
    df_titre
    .join(df_cat)
    .drop(columns='genre')
    .join(df_cat_pays)
    .drop(columns='production_countries')
)

Cette étape transforme les colonnes numériques : imputation des valeurs manquantes par la moyenne, puis standardisation.

In [11]:
# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaling", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [13]:
# Preprocessor: numeric columns go through the impute + scale pipeline, the
# genre and country indicator columns are passed through untouched.
passthrough_cols = [*col_genre, *col_pays]
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, col_num),
        ("unchanged", 'passthrough', passthrough_cols),
    ]
)

In [17]:
# Fit on df_titre: it is the frame that holds the genre/country indicator
# columns selected by the "unchanged" transformer (df only has the raw list
# columns, so fitting on it fails on a fresh kernel), and it is the frame that
# gets transformed afterwards.
preprocessor.fit(df_titre)

In [19]:
# Apply the fitted preprocessor and wrap the resulting matrix in a DataFrame
# (columns are positional integers until they are renamed).
features_transformed = preprocessor.transform(df_titre)
data = pd.DataFrame(features_transformed)

## Ajouter un poids aux distances

In [21]:
# Output column names of the fitted preprocessor; the names used further down
# ("num__popularity", "unchanged__Animation") show the "<transformer>__<column>" form.
col_names = preprocessor.get_feature_names_out()

In [22]:
# Label the transformed matrix so feature groups can be selected by name.
data.columns = col_names

### Cat

In [23]:
# Feature names containing "Real".
# NOTE(review): col_real is not used anywhere in the visible part of the notebook.
col_real = list(filter(lambda name: "Real" in name, col_names))

In [24]:
# Genre column names as they appear after the preprocessor, i.e. prefixed with
# the "unchanged" transformer label. Built from the binarizer's classes rather
# than from col_genre itself, so re-running this cell does not stack the prefix.
# col_genre is rebound here: the preprocessor cell, which needs the raw names,
# must have been run before this one.
col_genre = [f"unchanged__{genre}" for genre in multilabelb.classes_]

### Dico

Dictionnaire qui associe à chaque groupe de colonnes le poids par lequel ses valeurs seront multipliées.

In [25]:
# Genre columns that receive an extra weight on top of the generic genre weight.
col_genre_princ = [
    "unchanged__Animation",
]

In [26]:
# Feature-group weights: label -> (columns, multiplier).
# A column listed in several groups is multiplied once per group, so any column
# present in both col_genre and col_genre_princ ends up scaled by 2 * 1.3.
dico_weight = {
    "pop": (["num__popularity"], 0.7),
    "nb_votes": (["num__nb_votes"], 0.6),
    "note": (["num__note"], 0.8),
    "duree": (["num__temps_minutes"], 0.5),
    "genre": (col_genre, 2),
    "genresPrincipaux": (col_genre_princ, 1.3),
}

In [27]:
# Scale each feature group by its weight so that it counts more (or less) in
# the distances computed by the nearest-neighbour search.
# NOTE: `data` is modified in place, so re-running this cell compounds the
# weights; rebuild `data` from the preprocessor before running it again.
for nom_f, (liste_cols, pond) in tqdm(dico_weight.items()):
    # One vectorised multiplication per group, replacing a per-column loop
    # that also printed a nested progress bar for every group.
    data[liste_cols] *= pond

100%|██████████| 1/1 [00:00<00:00, 211.78it/s]
100%|██████████| 1/1 [00:00<00:00, 466.86it/s]
100%|██████████| 1/1 [00:00<00:00, 703.03it/s]
100%|██████████| 1/1 [00:00<00:00, 508.59it/s]
100%|██████████| 26/26 [00:00<00:00, 864.05it/s]
100%|██████████| 1/1 [00:00<00:00, 547.49it/s]
100%|██████████| 6/6 [00:00<00:00, 85.33it/s]


### Attribuer une distance

In [31]:
# 6 neighbours per film: the result table built from this search names its
# columns source, r1..r5 - presumably the first neighbour is the film itself,
# leaving 5 recommendations (TODO confirm; ties could reorder them).
knn = NearestNeighbors(n_neighbors= 6)

In [32]:
# Fit the neighbour search on the weighted feature matrix.
knn.fit(data)

In [34]:
# Query with the same rows the model was fitted on: `dis` receives the
# distances and `index` the row positions of the neighbours of every film.
dis,index = knn.kneighbors(data)

In [42]:
# Recommendation table indexed by film title: one column per neighbour, holding
# the neighbour's row position in the feature matrix.
reco_columns = ["source", "r1", "r2", "r3", "r4", "r5"]
reco_no_real = pd.DataFrame(
    data=index,
    index=df_titre["titre"],
    columns=reco_columns,
)

Unnamed: 0_level_0,source,r1,r2,r3,r4,r5
titre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
The Empress,0,913,1132,1143,538,58206
The Great Adventure,1,249,1660,287,1921,866
The Corbett-Fitzsimmons Fight,2,77025,8078,62404,4828,10591
The Glorious Adventure,3,2120,6285,1080,18006,16742
The Film Parade,4,2012,61934,58795,759,77365
...,...,...,...,...,...,...
Chubby Café,107799,97175,107517,88985,52465,62076
"2025: Blood, White & Blue",107800,82747,82758,94269,82756,93501
Rosa pietra stella,107801,70693,93953,93517,94504,89827
The Scrapper,107802,81885,99432,80270,104164,84286
