In [11]:
import pandas as pd 

df = pd.read_csv('df_model_sans_meteo2.csv',index_col=0 )

In [12]:
from sklearn.model_selection import train_test_split

X = df.drop(['box_office_first_week'],axis=1)
y = df['box_office_first_week']
X_train,X_test, y_train,y_test = train_test_split(X,y, train_size=0.9,shuffle=True, random_state=42)

In [13]:
X.columns

Index(['duration', 'nationality', 'views', 'budget', 'season', 'is_holiday',
       'proportion_stars_actors', 'proportion_stars_producers',
       'proportion_stars_director', 'distributor_avg_frequency',
       'genre_action', 'genre_animation', 'genre_arts martiaux',
       'genre_aventure', 'genre_biopic', 'genre_bollywood', 'genre_comédie',
       'genre_comédie dramatique', 'genre_comédie musicale', 'genre_divers',
       'genre_drame', 'genre_epouvante-horreur', 'genre_erotique',
       'genre_espionnage', 'genre_expérimental', 'genre_famille',
       'genre_fantastique', 'genre_guerre', 'genre_historique',
       'genre_judiciaire', 'genre_musical', 'genre_policier', 'genre_péplum',
       'genre_romance', 'genre_science fiction', 'genre_sport event',
       'genre_thriller', 'genre_western'],
      dtype='object')

In [17]:
# Importer les bibliothèques nécessaires
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
from optuna.samplers import TPESampler
import pandas as pd
import numpy as np

# Définir les paramètres du préprocesseur
numerical_features = ['duration', 'views', 'budget','proportion_stars_actors', 'proportion_stars_producers', 'proportion_stars_director', 'distributor_avg_frequency']
categorical_features = ['nationality', 'season', 'is_holiday']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder="passthrough"
)

# Définir la fonction objectif pour Optuna
def objective(trial):
    # Définir les hyperparamètres à optimiser
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 200, 300]),
        'criterion': trial.suggest_categorical('criterion', ['poisson', 'friedman_mse', 'squared_error', 'absolute_error']),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'random_state': 42
    }

    # Créer le modèle ExtraTreesRegressor avec les hyperparamètres
    model = ExtraTreesRegressor(**params)

    # Créer le pipeline avec le préprocesseur et le modèle
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Entraîner le modèle
    pipeline.fit(X_train, y_train)

    # Faire des prédictions sur les données de validation
    y_pred = pipeline.predict(X_test)

    # Calculer la métrique d'évaluation (par exemple, le R2 score)
    score = r2_score(y_test, y_pred)

    return score

# Créer l'étude Optuna
sampler = TPESampler(seed=42)  # Make the sampler behave in a deterministic way.
study = optuna.create_study(sampler=sampler, direction='maximize')

# Lancer l'optimisation des hyperparamètres
study.optimize(objective, n_trials=100)

# Obtenir les meilleurs hyperparamètres trouvés
best_params = study.best_params

# Créer le pipeline final avec les meilleurs hyperparamètres
best_model = ExtraTreesRegressor(**best_params)

pipeline_final = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model)
])

# Entraîner le modèle final sur l'ensemble des données d'entraînement
pipeline_final.fit(X_train, y_train)

# Faire des prédictions sur de nouvelles données
y_pred = pipeline_final.predict(X_test)

# Calculer la métrique d'évaluation finale (par exemple, le R2 score)
final_r2 = r2_score(y_test, y_pred)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Meilleurs hyperparamètres trouvés :", best_params)
print("R2 Score sur les données de test :", final_r2)
print(final_rmse)



[I 2023-07-31 11:14:03,090] A new study created in memory with name: no-name-bb155890-f1f8-44ac-84f5-76603077363c
[I 2023-07-31 11:14:03,777] Trial 0 finished with value: 0.35414310816754224 and parameters: {'n_estimators': 200, 'criterion': 'poisson', 'max_depth': 14, 'min_samples_split': 13, 'min_samples_leaf': 8, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 0.35414310816754224.
[I 2023-07-31 11:14:04,566] Trial 1 finished with value: 0.38471698312233193 and parameters: {'n_estimators': 300, 'criterion': 'squared_error', 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 1 with value: 0.38471698312233193.
[I 2023-07-31 11:14:04,787] Trial 2 finished with value: 0.24798014233678967 and parameters: {'n_estimators': 100, 'criterion': 'friedman_mse', 'max_depth': 4, 'min_samples_split': 15, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True}. Best is trial 1 with value: 0.384716

In [17]:
# Importer les bibliothèques nécessaires
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
from optuna.samplers import TPESampler
import pandas as pd
import numpy as np

# Définir les paramètres du préprocesseur
numerical_features = ['duration', 'views', 'budget','proportion_stars_actors', 'proportion_stars_producers', 'proportion_stars_director', 'distributor_avg_frequency']
categorical_features = ['nationality', 'season', 'is_holiday']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder="passthrough"
)

best_params = {'n_estimators': 100, 'criterion': 'squared_error', 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': None, 'bootstrap': False}
# Créer le modèle avec les meilleurs hyperparamètres
model = ExtraTreesRegressor(
    n_estimators=best_params['n_estimators'],
    criterion=best_params['criterion'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    bootstrap=best_params['bootstrap']
)

pipe = Pipeline([
    ('preprocessor',preprocessor),
    ('model', model)
])



In [20]:
import mlflow

experiment_id = mlflow.set_experiment("cinema").experiment_id

run_name = "extratree_std_onehot"

with mlflow.start_run(experiment_id=experiment_id, run_name=run_name) as run:
    # Log the baseline model to MLflow
    pipe.fit(X_train, y_train)
    
    
    mlflow.sklearn.log_model(pipe, run_name)

    
    model_uri = mlflow.get_artifact_uri(run_name)

    # Log des paramètres
    mlflow.log_params({'n_estimators': 100,'criterion': 'squared_error','max_depth': 14, 'min_samples_split': 3,'min_samples_leaf': 3,
    'max_features': None,
    'bootstrap': False})
    
    
    # Log des Tags
    mlflow.set_tag("model","extra_tree")
    mlflow.set_tag("scaler","StandardScaler")
    mlflow.set_tag("encoder",'OneHotEncoder')
    mlflow.set_tag('df','sans_meteo')
    mlflow.set_tag('budget','numeric')

    eval_data = X_test
    eval_data["label"] = y_test

    # Evaluate the logged model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="regressor",
        evaluators=["default"],
   )

2023/07/31 11:31:39 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.


In [8]:
import pickle 
with open('modele.pkl', 'wb') as fichier_modele:
    pickle.dump(pipe, fichier_modele)
