In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
import xgboost
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
import pandas as pd
df = pd.read_csv('movies_clean_v3.csv')

df = df.drop(['acteurs', 'scoring_acteurs', 'realisateur', 'studio'], axis=1)

# Gestion des valeurs manquantes pour les colonnes numériques et catégorielles
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.drop('entrees_premiere_semaine')
numeric_transformer = SimpleImputer(strategy='median')

categorical_features = df.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Préparation du préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Modèle de régression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', CatBoostRegressor())])

# Séparation des données
X = df.drop('entrees_premiere_semaine', axis=1)
y = df['entrees_premiere_semaine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Entraînement du modèle
model.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluation du modèle
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mae
r2 = r2_score(y_test, y_pred)
print(r2)
print(rmse)


ImportError: cannot import name 'root_mean_squared_error' from 'sklearn.metrics' (/home/utilisateur/anaconda3/lib/python3.9/site-packages/sklearn/metrics/__init__.py)

In [1]:
X_train.tail(10)
X_train.dtypes
#X_train.coeff_studio.unique()

NameError: name 'X_train' is not defined

In [5]:
import pickle

pickle.dump(model, open('model.pkl', 'wb'))