In [9]:
import pandas as pd 

df = pd.read_csv('df_model.csv',index_col=0 )

In [10]:
from sklearn.model_selection import train_test_split

X = df.drop(['box_office_first_week','genre','Lille', 'Bordeaux', 'Lyon', 'Paris'],axis=1)
y = df['box_office_first_week']
X_train,X_test, y_train,y_test = train_test_split(X,y, train_size=0.9,shuffle=True, random_state=42)

In [11]:
X.columns

Index(['duration', 'nationality', 'press_eval', 'viewers_eval', 'views',
       'budget', 'Temperature Moyenne', 'season', 'is_holiday',
       'proportion_stars_actors', 'proportion_stars_producers',
       'proportion_stars_director', 'distributor_avg_frequency',
       'genre_action', 'genre_animation', 'genre_arts martiaux',
       'genre_aventure', 'genre_biopic', 'genre_bollywood', 'genre_comédie',
       'genre_comédie dramatique', 'genre_comédie musicale', 'genre_divers',
       'genre_drame', 'genre_epouvante-horreur', 'genre_erotique',
       'genre_espionnage', 'genre_expérimental', 'genre_famille',
       'genre_fantastique', 'genre_guerre', 'genre_historique',
       'genre_judiciaire', 'genre_musical', 'genre_policier', 'genre_péplum',
       'genre_romance', 'genre_science fiction', 'genre_sport event',
       'genre_thriller', 'genre_western'],
      dtype='object')

In [12]:
# Importer les bibliothèques nécessaires
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
from optuna.samplers import TPESampler
import pandas as pd
import numpy as np

# Définir les paramètres du préprocesseur
numerical_features = ['duration', 'press_eval', 'views', 'budget', 'Temperature Moyenne', 'proportion_stars_actors', 'proportion_stars_producers', 'proportion_stars_director', 'distributor_avg_frequency']
categorical_features = ['nationality', 'season', 'is_holiday']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder="passthrough"
)

# Définir la fonction objectif pour Optuna
def objective(trial):
    # Définir les hyperparamètres à optimiser
    params = {
        'iterations': 1000,
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-3, 10.0, log=True),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'thread_count': 4,
        'verbose': False
    }
    
    # Instancier le modèle CatBoostRegressor avec les hyperparamètres spécifiés
    model = CatBoostRegressor(**params, random_state=42)

    # Créer le pipeline avec le préprocesseur et le modèle
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Entraîner le modèle
    pipeline.fit(X_train, y_train)

    # Faire des prédictions sur les données de validation
    y_pred = pipeline.predict(X_test)

    # Calculer la métrique d'évaluation (par exemple, le R2 score)
    score = r2_score(y_test, y_pred)

    return score

# Créer l'étude Optuna
sampler = TPESampler(seed=42)  # Make the sampler behave in a deterministic way.
study = optuna.create_study(sampler=sampler, direction='maximize')

# Lancer l'optimisation des hyperparamètres
study.optimize(objective, n_trials=100)

# Obtenir les meilleurs hyperparamètres trouvés
best_params = study.best_params

# Créer le pipeline final avec les meilleurs hyperparamètres
best_model = CatBoostRegressor(**best_params)

pipeline_final = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model)
])

# Entraîner le modèle final sur l'ensemble des données d'entraînement
pipeline_final.fit(X_train, y_train)

# Faire des prédictions sur de nouvelles données
y_pred = pipeline_final.predict(X_test)

# Calculer la métrique d'évaluation finale (par exemple, le R2 score)
final_r2 = r2_score(y_test, y_pred)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Meilleurs hyperparamètres trouvés :", best_params)
print("R2 Score sur les données de test :", final_r2)
print(final_rmse)



[I 2023-07-26 16:14:08,624] A new study created in memory with name: no-name-65b7bd1b-0fec-4391-adc4-05c34e6b3a1a


[I 2023-07-26 16:14:22,945] Trial 0 finished with value: 0.6501751940654332 and parameters: {'learning_rate': 0.03574712922600244, 'depth': 10, 'l2_leaf_reg': 0.8471801418819978, 'random_strength': 0.24810409748678125, 'border_count': 66}. Best is trial 0 with value: 0.6501751940654332.
[I 2023-07-26 16:14:24,060] Trial 1 finished with value: 0.7142810843410028 and parameters: {'learning_rate': 0.01699897838270077, 'depth': 3, 'l2_leaf_reg': 2.9154431891537547, 'random_strength': 0.2537815508265665, 'border_count': 190}. Best is trial 1 with value: 0.7142810843410028.
[I 2023-07-26 16:14:37,514] Trial 2 finished with value: 0.6069017425822751 and parameters: {'learning_rate': 0.010725209743171997, 'depth': 10, 'l2_leaf_reg': 2.1368329072358767, 'random_strength': 0.0070689749506246055, 'border_count': 72}. Best is trial 1 with value: 0.7142810843410028.
[I 2023-07-26 16:14:39,164] Trial 3 finished with value: 0.6759703054667504 and parameters: {'learning_rate': 0.018659959624904916, 'd

0:	learn: 278061.7779950	total: 4.43ms	remaining: 4.42s
1:	learn: 276776.6019832	total: 8.55ms	remaining: 4.27s
2:	learn: 275430.5599942	total: 13.3ms	remaining: 4.43s
3:	learn: 274217.7816561	total: 17.2ms	remaining: 4.28s
4:	learn: 273051.3994417	total: 21.1ms	remaining: 4.19s
5:	learn: 271816.9456656	total: 25.4ms	remaining: 4.21s
6:	learn: 270509.2404306	total: 30.4ms	remaining: 4.31s
7:	learn: 269169.8620462	total: 34.6ms	remaining: 4.29s
8:	learn: 268117.2872756	total: 38.6ms	remaining: 4.25s
9:	learn: 266961.4324976	total: 42.9ms	remaining: 4.25s
10:	learn: 265937.1524996	total: 47.9ms	remaining: 4.31s
11:	learn: 264896.7747651	total: 53ms	remaining: 4.36s
12:	learn: 263668.0092808	total: 57ms	remaining: 4.33s
13:	learn: 262747.8547996	total: 58.1ms	remaining: 4.09s
14:	learn: 261620.6066192	total: 63.4ms	remaining: 4.16s
15:	learn: 260551.3477194	total: 69.2ms	remaining: 4.25s
16:	learn: 259532.7905161	total: 74.7ms	remaining: 4.32s
17:	learn: 258488.1825516	total: 80.3ms	remai

In [15]:
# Importer les bibliothèques nécessaires
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error
import pandas as pd
import numpy as np

# Définir les paramètres du préprocesseur
numerical_features = ['duration', 'press_eval', 'views', 'budget', 'Temperature Moyenne', 'proportion_stars_actors', 'proportion_stars_producers', 'proportion_stars_director', 'distributor_avg_frequency']
categorical_features = ['nationality', 'season', 'is_holiday']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder="passthrough"
)

best_params = {'learning_rate': 0.010122511979877855, 'depth': 8, 'l2_leaf_reg': 0.0019004975570217878, 'random_strength': 3.525824187643962, 'border_count': 175}
# Créer le modèle avec les meilleurs hyperparamètres
model = CatBoostRegressor(
    **best_params
)

pipe = Pipeline([
    ('preprocessor',preprocessor),
    ('model', model)
])

# Entraîner le modèle final sur l'ensemble des données d'entraînement
# pipe.fit(X_train, y_train)

# # Faire des prédictions sur de nouvelles données
# y_pred = pipeline_final.predict(X_test)

# # Calculer la métrique d'évaluation finale (par exemple, le R2 score)
# final_score = r2_score(y_test, y_pred)
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# print(rmse, final_score)

In [16]:
import mlflow

experiment_id = mlflow.set_experiment("cinema").experiment_id

run_name = "catboost_std_onehot"

with mlflow.start_run(experiment_id=experiment_id, run_name=run_name) as run:
    # Log the baseline model to MLflow
    pipe.fit(X_train, y_train)
    
    
    mlflow.sklearn.log_model(pipe, run_name)

    
    model_uri = mlflow.get_artifact_uri(run_name)

    # Log des paramètres
    mlflow.log_params({'learning_rate': 0.010122511979877855, 'depth': 8, 'l2_leaf_reg': 0.0019004975570217878, 'random_strength': 3.525824187643962, 'border_count': 175})
    
    
    # Log des Tags
    mlflow.set_tag("model","catboost")
    mlflow.set_tag("scaler","StandardScaler")
    mlflow.set_tag("encoder",'OneHotEncoder')
    mlflow.set_tag('df','avec_meteo')

    eval_data = X_test
    eval_data["label"] = y_test

    # Evaluate the logged model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="regressor",
        evaluators=["default"],
   )

0:	learn: 278061.7779950	total: 4.57ms	remaining: 4.56s
1:	learn: 276776.6019832	total: 8.58ms	remaining: 4.28s
2:	learn: 275430.5599942	total: 13.1ms	remaining: 4.36s
3:	learn: 274217.7816561	total: 17.6ms	remaining: 4.39s
4:	learn: 273051.3994417	total: 21.9ms	remaining: 4.36s
5:	learn: 271816.9456656	total: 26.4ms	remaining: 4.37s
6:	learn: 270509.2404306	total: 30.7ms	remaining: 4.36s
7:	learn: 269169.8620462	total: 35.8ms	remaining: 4.44s
8:	learn: 268117.2872756	total: 39.8ms	remaining: 4.38s
9:	learn: 266961.4324976	total: 44ms	remaining: 4.36s
10:	learn: 265937.1524996	total: 48ms	remaining: 4.32s
11:	learn: 264896.7747651	total: 52.5ms	remaining: 4.32s
12:	learn: 263668.0092808	total: 56.9ms	remaining: 4.32s
13:	learn: 262747.8547996	total: 58.1ms	remaining: 4.09s
14:	learn: 261620.6066192	total: 62.4ms	remaining: 4.1s
15:	learn: 260551.3477194	total: 66.3ms	remaining: 4.08s
16:	learn: 259532.7905161	total: 71.8ms	remaining: 4.15s
17:	learn: 258488.1825516	total: 76.4ms	remain

2023/07/26 16:22:51 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
