In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
import xgboost
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
import pandas as pd
df = pd.read_csv('movies_clean_v3.csv')

df = df.drop(['acteurs', 'scoring_acteurs', 'realisateur', 'studio'], axis=1)

# Gestion des valeurs manquantes pour les colonnes numériques et catégorielles
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.drop('entrees_premiere_semaine')
numeric_transformer = SimpleImputer(strategy='median')

categorical_features = df.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Préparation du préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Modèle de régression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', CatBoostRegressor())])

# Séparation des données
X = df.drop('entrees_premiere_semaine', axis=1)
y = df['entrees_premiere_semaine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Entraînement du modèle
model.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluation du modèle
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mae
r2 = r2_score(y_test, y_pred)
print(r2)
print(rmse)


Learning rate set to 0.050198
0:	learn: 415309.6729666	total: 68.9ms	remaining: 1m 8s
1:	learn: 407273.4358949	total: 87.2ms	remaining: 43.5s
2:	learn: 398730.4552154	total: 105ms	remaining: 34.9s
3:	learn: 391955.9875126	total: 118ms	remaining: 29.5s
4:	learn: 385210.0539889	total: 129ms	remaining: 25.6s
5:	learn: 379194.7082572	total: 139ms	remaining: 23s
6:	learn: 373603.5964207	total: 152ms	remaining: 21.6s
7:	learn: 367797.1687815	total: 162ms	remaining: 20.1s
8:	learn: 362398.2171638	total: 172ms	remaining: 18.9s
9:	learn: 358237.3166352	total: 182ms	remaining: 18.1s
10:	learn: 353674.7103892	total: 193ms	remaining: 17.4s
11:	learn: 349820.9572467	total: 204ms	remaining: 16.8s
12:	learn: 345939.2238673	total: 214ms	remaining: 16.3s
13:	learn: 341941.3846526	total: 225ms	remaining: 15.8s
14:	learn: 338200.9127459	total: 235ms	remaining: 15.4s
15:	learn: 335200.6535141	total: 245ms	remaining: 15s
16:	learn: 331519.8350518	total: 255ms	remaining: 14.8s
17:	learn: 329126.8822712	tota

In [12]:
X_train.tail(10)

Unnamed: 0,budget,compositeur,date,franchise,genre,pays,producteur,remake,titre,season,coeff_studio,scoring_acteurs&realisateur
2135,28000000,,2007-09-12,,Romance,Etats-Unis,,Remake,Le Goût de la vie,Automne,3,0.0
2599,5290000,,2004-11-24,,Drame,France,,,Holy Lola,Automne,2,0.0
705,15570,,2022-09-14,,Drame,France,,,A propos de Joan,Automne,2,0.62069
3468,3986834,,2013-10-09,,Comédie dramatique,France,,,La Vie d'Adèle,Automne,0,0.310345
4373,185000000,,2022-03-02,,Comicbook,Etats-Unis,,,The Batman,Printemps,3,0.206897
1033,0,,2022-01-26,,Drame,Belgique,,,Un monde,Hiver,0,0.0
3264,14000000,,2006-01-18,,Drame,Etats-Unis,,,Le Secret de Brokeback Mountain,Hiver,2,0.172414
1653,20000000,,2010-10-06,,Horreur,Etats-Unis,,,Laisse-moi entrer,Automne,0,0.034483
2607,40000000,,2005-08-10,,Comédie,Etats-Unis,,,Serial noceurs,Été,0,0.068966
2732,16500000,,1996-02-28,,Drame,Etats-Unis,,,Raison et sentiments,Hiver,3,0.62069


In [5]:
import pickle

pickle.dump(model, open('model.pkl', 'wb'))