# Système de recommandation

In [1]:
import ast

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, StandardScaler

In [2]:
# Full film table, keeping the title and poster path; only the "Unnamed: 0"
# column (presumably an index saved by an earlier to_csv — TODO confirm) is dropped.
df_titre = (
    pd.read_csv("df_film_filtre.csv")
    .drop(columns="Unnamed: 0")
)

In [3]:
# Modelling frame: same CSV, without the "Unnamed: 0" column and without the
# two columns that are not used as features (poster path and title).
df = (
    pd.read_csv("df_film_filtre.csv")
    .drop(columns=["Unnamed: 0", "poster_path", "titre"])
)

In [4]:
# The CSV stores each genre collection as the text of a Python literal.
# ast.literal_eval parses literals only, so a crafted cell in the CSV cannot
# execute arbitrary code the way eval() would.
df["genre"] = df["genre"].apply(ast.literal_eval)

In [5]:
# The CSV stores each country collection as the text of a Python literal.
# ast.literal_eval parses literals only, so a crafted cell in the CSV cannot
# execute arbitrary code the way eval() would.
df["production_countries"] = df["production_countries"].apply(ast.literal_eval)

## Séparer les colonnes num et cat

Non supervisé

In [6]:
# Turn the genre collection of each film into one 0/1 indicator column per genre.
multilabelb = MultiLabelBinarizer()
features_cat = multilabelb.fit_transform(df["genre"])
df_cat = pd.DataFrame(data=features_cat, columns=multilabelb.classes_)

In [7]:
# Turn the production-country collection of each film into one 0/1 indicator
# column per country.
multilabelb_pays = MultiLabelBinarizer()
features_cat_pays = multilabelb_pays.fit_transform(df["production_countries"])
df_cat_pays = pd.DataFrame(data=features_cat_pays, columns=multilabelb_pays.classes_)

In [8]:
# Names of the indicator columns produced by the two binarizers above.
col_genre, col_pays = multilabelb.classes_, multilabelb_pays.classes_

In [9]:
# Swap each raw list column for its indicator columns (rows are aligned on the index).
df = (
    df.join(df_cat)
    .drop(columns="genre")
    .join(df_cat_pays)
    .drop(columns="production_countries")
)

In [10]:
# Same replacement on the frame that keeps the title and poster path.
df_titre = (
    df_titre.join(df_cat)
    .drop(columns="genre")
    .join(df_cat_pays)
    .drop(columns="production_countries")
)

In [18]:
# Numeric features to impute and standardise.
# After the joins above, `df` also holds the 0/1 genre and country indicator
# columns, which are numeric too. They are excluded here because the
# preprocessor already forwards them untouched through its "unchanged"
# transformer; keeping them in col_num would feed every indicator to the model
# twice (once standardised, once raw).
colonnes_numeriques = df.select_dtypes("number").columns
colonnes_indicatrices = list(df_cat.columns) + list(df_cat_pays.columns)
col_num = colonnes_numeriques[~colonnes_numeriques.isin(colonnes_indicatrices)]

# Single text column to one-hot encode (presumably the director — TODO confirm).
col_cat = ["Real"]

C'est l'étape qui va transformer les colonnes numériques (imputation, standardisation, etc.).

In [19]:
# Numeric branch: replace missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaling", StandardScaler()),
])

In [20]:
# Categorical branch: one-hot encoding; handle_unknown="ignore" means a category
# not seen during fit does not raise at transform time.
categorical_transformer = Pipeline([
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

On a les transformations cat et num ; le préprocesseur réunit les deux.

In [21]:
#processeur
preprocessor = ColumnTransformer(
    transformers= [("num", numeric_transformer, col_num),
                   ("cat", categorical_transformer, col_cat),
                   ("unchanged", 'passthrough', list(col_genre) + list(col_pays))
                   ]
)

In [22]:
# Full pipeline: preprocessing followed by the model to fit, a Euclidean
# nearest-neighbour search returning 6 neighbours per query.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", NearestNeighbors(n_neighbors=6, metric="euclidean")),
])

In [23]:
# Fit the preprocessor on df, then the nearest-neighbour model on the transformed features.
pipeline.fit(df)

In [24]:
# Display the modelling frame (original features plus genre and country indicators).
df

Unnamed: 0,année,temps_minutes,note,nb_votes,Real,popularity,Unnamed: 7,Action,Adventure,Animation,...,WS,XC,XG,XI,XK,YE,YU,ZA,ZM,ZW
0,1917,50.000000,5.1,26,Alice Guy,1.341,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1918,50.000000,6.6,24,Alice Guy,0.840,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1897,100.000000,5.2,538,Enoch J. Rector,0.958,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1922,70.000000,6.6,49,J. Stuart Blackton,0.841,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1933,55.000000,7.2,46,J. Stuart Blackton,0.872,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107799,2019,94.787143,6.2,10,Qia Li,0.883,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
107800,2022,135.000000,2.2,235,Josh E. Williams,3.534,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
107801,2020,94.000000,6.1,129,Marcello Sannino,1.218,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
107802,2021,87.000000,4.3,1464,Bari Kang,3.510,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Preprocess one film (row position 33724) with the fitted preprocessor and
# store the result as a dense one-row frame.
ligne_film = df_titre.iloc[[33724]]
data = pd.DataFrame(pipeline["preprocessor"].transform(ligne_film).toarray())

## Ajouter un poids aux distances

In [26]:
# Output feature names of the fitted preprocessor, in the order of the transformed columns.
col_names = pipeline["preprocessor"].get_feature_names_out()

In [27]:
# Label the transformed columns with the preprocessor's feature names.
data = data.set_axis(col_names, axis=1)

### Cat

In [28]:
# Feature names carry the ColumnTransformer prefixes: "cat__Real_<value>" for the
# one-hot encoded "Real" column and "unchanged__<name>" for the passed-through
# indicators. No feature name contains the word "genre", so the genre features
# are recovered by matching the part after the prefix against the genre classes
# learned by the binarizer.
prefixe_passthrough = "unchanged__"
genres_connus = set(multilabelb.classes_)

col_real = [nom for nom in col_names if nom.startswith("cat__Real_")]
# Note: this rebinds col_genre from raw genre names to preprocessed feature names.
col_genre = [
    nom
    for nom in col_names
    if nom.startswith(prefixe_passthrough)
    and nom[len(prefixe_passthrough):] in genres_connus
]

### Dico

Dictionnaire pour attribuer un nouveau poids à chaque groupe de colonnes.

In [29]:
# Weight per feature group: label -> (preprocessed column names, multiplicative factor).
dico_weight = {
    "pop": (["num__popularity"], 1.8),
    "nb_votes": (["num__nb_votes"], 2),
    "note": (["num__note"], 1.5),
    "duree": (["num__temps_minutes"], 2),
    "Real": (col_real, 0.5),
    "genre": (col_genre, 0.2),
}

# Multiply every column of each group by the factor of that group.
for liste_cols, pond in dico_weight.values():
    data[liste_cols] *= pond

### Attribuer une distance

In [33]:
# Preprocess every film and store the result as a dense frame.
# NOTE(review): this rebinds `data`, so the weighted single-film frame built
# earlier is discarded and the neighbour search that follows runs on unweighted
# features — confirm this is intended.
features_tous_films = pipeline["preprocessor"].transform(df_titre)
data = pd.DataFrame(features_tous_films.toarray())

In [34]:
# Query the fitted nearest-neighbour model: `dis` holds the distances and
# `index` the row positions of the neighbours of each film.
modele_voisins = pipeline["model"]
dis, index = modele_voisins.kneighbors(data)

In [40]:
# Save the neighbour positions, one row per film title. The first column is the
# closest neighbour (normally the film itself — TODO confirm with duplicates),
# the other five are the recommendations.
colonnes_reco = ["source", "r1", "r2", "r3", "r4", "r5"]
resultat = pd.DataFrame(index, index=df_titre["titre"], columns=colonnes_reco)
resultat.to_csv("resultat.csv")

In [44]:
# Save the columns needed to present a recommendation (title, year, "Real", poster path).
colonnes_export = ["titre", "année", "Real", "poster_path"]
df_titre[colonnes_export].to_csv("df_reco.csv")

Faire un dictionnaire avec {titre:posterpath}


In [35]:
# Show the full row of each neighbour found for the first film.
for position in index[0]:
    display(df_titre.iloc[[position]])

Unnamed: 0,titre,année,temps_minutes,note,nb_votes,Real,popularity,poster_path,Unnamed: 9,Action,...,WS,XC,XG,XI,XK,YE,YU,ZA,ZM,ZW
0,The Empress,1917,50.0,5.1,26,Alice Guy,1.341,/5kqaWShpShGdE49DNEOgLESjVsQ.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,titre,année,temps_minutes,note,nb_votes,Real,popularity,poster_path,Unnamed: 9,Action,...,WS,XC,XG,XI,XK,YE,YU,ZA,ZM,ZW
1,The Great Adventure,1918,50.0,6.6,24,Alice Guy,0.84,/aPHkaRR2ggfYe6LqeQfSk0QRpgL.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,titre,année,temps_minutes,note,nb_votes,Real,popularity,poster_path,Unnamed: 9,Action,...,WS,XC,XG,XI,XK,YE,YU,ZA,ZM,ZW
913,Pillars of Society,1916,50.0,5.1,29,Raoul Walsh,0.6,/1NiMl5yfmSPCGcF8w8j3UXjPlX9.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,titre,année,temps_minutes,note,nb_votes,Real,popularity,poster_path,Unnamed: 9,Action,...,WS,XC,XG,XI,XK,YE,YU,ZA,ZM,ZW
1132,Redeeming Love,1916,50.0,5.0,26,William Desmond Taylor,0.805,/9x5dAeJL9LgRkHdw2XnXh75o7IL.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,titre,année,temps_minutes,note,nb_votes,Real,popularity,poster_path,Unnamed: 9,Action,...,WS,XC,XG,XI,XK,YE,YU,ZA,ZM,ZW
1143,The Wood Nymph,1916,50.0,5.0,14,Paul Powell,0.627,,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,titre,année,temps_minutes,note,nb_votes,Real,popularity,poster_path,Unnamed: 9,Action,...,WS,XC,XG,XI,XK,YE,YU,ZA,ZM,ZW
538,The Temple of Dusk,1918,50.0,5.2,14,James Young,0.6,/14W2F8BsXmqVW6S5XCwjZoQgnlR.jpg,0,0,...,0,0,0,0,0,0,0,0,0,0


## Matrice