In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np

# Load the dataset variant that has the fewest features.
df = pd.read_csv(filepath_or_buffer='movies_clean_v2.csv')
# Preview the first 20 rows as the cell output.
df.head(n=20)


Unnamed: 0,acteurs,budget,compositeur,date,entrees_premiere_semaine,franchise,genre,pays,producteur,realisateur,remake,studio,titre,season,coeff_studio,scoring_acteurs,scoring_acteurs&realisateur
0,"Sam Worthington, Sigourney Weaver",315000000,James Horner,2009-12-16,0,Franchise,Science Fiction,Etats-Unis,,James Cameron,,20th Century Fox,Avatar,Hiver,3,0.0,0.275862
1,"Daniel Radcliffe, Emma Watson, Gary Oldman, Al...",130000000,John Williams,2004-06-02,0,Franchise,Fantasy,Etats-Unis,,Alfonso Cuaron,,Warner Bros.,Harry Potter et le prisonnier d'Azkaban,Été,3,0.724138,0.724138
2,"Adam Driver, Benicio Del Toro, Joseph Gordon-L...",200000000,John Williams,2017-12-13,0,Franchise,Fantasy,Etats-Unis,Kathleen Kennedy,Rian Johnson,,Walt Disney Pictures,Star Wars: Les derniers Jedi,Hiver,3,0.0,0.0
3,"Chris Evans, Chris Hemsworth, Josh Brolin, Rob...",295000000,Alan Silvestri,2018-04-25,0,Franchise,Comicbook,Etats-Unis,Kevin Feige,Russo (brothers),,Walt Disney Pictures,Avengers: Infinity War,Printemps,3,0.689655,0.689655
4,"Sam Worthington, Sigourney Weaver, Kate Winslet",350000000,,2022-12-14,0,Franchise,Science Fiction,Etats-Unis,,James Cameron,,Walt Disney Pictures,Avatar : la voie de l'eau,Hiver,3,0.448276,0.724138
5,"Orlando Bloom, Viggo Mortensen, Ian McKellen, ...",94000000,Howard Shore,2003-12-17,0,Franchise,Fantasy,Etats-Unis,,Peter Jackson,,Metropolitan,Le Seigneur des anneaux: Le Retour du roi,Hiver,0,1.034483,1.034483
6,"Richard Anconina, José Garcia, Gad Elmaleh",12560000,,2001-02-07,0,Franchise,Comédie,France,,Thomas Gilou,,Warner Bros.,La Vérité si je mens! 2,Hiver,3,0.034483,0.034483
7,"Daniel Radcliffe, Emma Watson, Kenneth Branagh...",100000000,John Williams,2002-12-04,0,Franchise,Fantasy,Etats-Unis,,Chris Columbus,,Warner Bros.,Harry Potter et la chambre des secrets,Hiver,3,0.551724,0.551724
8,"Dany Boon, Kad Merad, Alice Pol, Valérie Bonne...",31680587,,2014-02-26,0,,Comédie,France,,Dany Boon,,Pathé,Supercondriaque,Hiver,2,0.344828,0.551724
9,"Vin Diesel, Dwayne Johnson, Paul Walker, Luke ...",160000000,Brian Tyler,2013-05-22,0,Franchise,Aventure - Action,Etats-Unis,Neal H. Moritz,Justin Lin,,Universal,Fast & Furious 6,Printemps,3,0.172414,0.172414


In [4]:
# Build the modelling frame as a copy of `df` without the raw
# 'acteurs', 'realisateur' and 'studio' columns; `df` itself is left untouched.
# NOTE(review): presumably the coeff_studio / scoring_* columns stand in for
# these raw text columns -- confirm.
df_ml = df.drop(columns=['acteurs', 'realisateur', 'studio'])
df_ml

Unnamed: 0,budget,compositeur,date,entrees_premiere_semaine,franchise,genre,pays,producteur,remake,titre,season,coeff_studio,scoring_acteurs,scoring_acteurs&realisateur
0,315000000,James Horner,2009-12-16,0,Franchise,Science Fiction,Etats-Unis,,,Avatar,Hiver,3,0.000000,0.275862
1,130000000,John Williams,2004-06-02,0,Franchise,Fantasy,Etats-Unis,,,Harry Potter et le prisonnier d'Azkaban,Été,3,0.724138,0.724138
2,200000000,John Williams,2017-12-13,0,Franchise,Fantasy,Etats-Unis,Kathleen Kennedy,,Star Wars: Les derniers Jedi,Hiver,3,0.000000,0.000000
3,295000000,Alan Silvestri,2018-04-25,0,Franchise,Comicbook,Etats-Unis,Kevin Feige,,Avengers: Infinity War,Printemps,3,0.689655,0.689655
4,350000000,,2022-12-14,0,Franchise,Science Fiction,Etats-Unis,,,Avatar : la voie de l'eau,Hiver,3,0.448276,0.724138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4536,40000000,,2011-04-06,0,,Aventure - Action,Etats-Unis,,Remake,Le Flingueur (2011),Printemps,0,0.034483,0.034483
4537,49515804,,2006-10-04,0,,Drame,Allemagne,,,Le Parfum : histoire d'un meurtrier,Automne,0,0.000000,0.000000
4538,27800000,,2018-02-28,0,,Comédie,France,,,La Ch’tite famille,Hiver,2,0.310345,0.517241
4539,2700000,,2020-07-01,0,,Comédie,France,,,Les Parfums,Été,0,0.000000,0.000000


In [5]:
def _flag_to_binary(value, positive_label):
    """Encode one cell of a label-or-missing flag column as 1 / 0 / NaN.

    Parameters
    ----------
    value : object
        Raw cell value (a label string or a missing value).
    positive_label : str
        Label that marks the flag as set (e.g. 'Franchise').

    Returns
    -------
    int or float
        1 when ``value == positive_label``, 0 when ``value`` is missing
        (``pd.isna``), and NaN for any other value (unrecognised labels are
        not coerced to 0).
    """
    if value == positive_label:
        return 1
    if pd.isna(value):
        return 0
    return np.nan


# The raw flags are read from `df` (which the cells shown here never modify)
# instead of from `df_ml`, and the drop tolerates already-removed columns, so
# re-running this cell no longer raises KeyError on 'franchise' / 'remake'.
# The assigned Series are aligned on the index that `df_ml` shares with `df`.
df_ml = (
    df_ml
    .assign(
        franchise_binary=df['franchise'].apply(_flag_to_binary, args=('Franchise',)),
        remake_binary=df['remake'].apply(_flag_to_binary, args=('Remake',)),
    )
    .drop(columns=['franchise', 'remake'], errors='ignore')
)
df_ml



Unnamed: 0,budget,compositeur,date,entrees_premiere_semaine,genre,pays,producteur,titre,season,coeff_studio,scoring_acteurs,scoring_acteurs&realisateur,franchise_binary,remake_binary
0,315000000,James Horner,2009-12-16,0,Science Fiction,Etats-Unis,,Avatar,Hiver,3,0.000000,0.275862,1,0
1,130000000,John Williams,2004-06-02,0,Fantasy,Etats-Unis,,Harry Potter et le prisonnier d'Azkaban,Été,3,0.724138,0.724138,1,0
2,200000000,John Williams,2017-12-13,0,Fantasy,Etats-Unis,Kathleen Kennedy,Star Wars: Les derniers Jedi,Hiver,3,0.000000,0.000000,1,0
3,295000000,Alan Silvestri,2018-04-25,0,Comicbook,Etats-Unis,Kevin Feige,Avengers: Infinity War,Printemps,3,0.689655,0.689655,1,0
4,350000000,,2022-12-14,0,Science Fiction,Etats-Unis,,Avatar : la voie de l'eau,Hiver,3,0.448276,0.724138,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4536,40000000,,2011-04-06,0,Aventure - Action,Etats-Unis,,Le Flingueur (2011),Printemps,0,0.034483,0.034483,0,1
4537,49515804,,2006-10-04,0,Drame,Allemagne,,Le Parfum : histoire d'un meurtrier,Automne,0,0.000000,0.000000,0,0
4538,27800000,,2018-02-28,0,Comédie,France,,La Ch’tite famille,Hiver,2,0.310345,0.517241,0,0
4539,2700000,,2020-07-01,0,Comédie,France,,Les Parfums,Été,0,0.000000,0.000000,0,0
