In [1]:
import pandas as pd 

df = pd.read_csv('df_model_sans_meteo2.csv',index_col=0 )

In [2]:
from sklearn.model_selection import train_test_split

X = df.drop(['box_office_first_week','genre'],axis=1)
y = df['box_office_first_week']
X_train,X_test, y_train,y_test = train_test_split(X,y, train_size=0.9,shuffle=True, random_state=42)

In [3]:
X.columns

Index(['duration', 'nationality', 'press_eval', 'viewers_eval', 'views',
       'budget', 'season', 'is_holiday', 'proportion_stars_actors',
       'proportion_stars_producers', 'proportion_stars_director',
       'distributor_avg_frequency', 'genre_action', 'genre_animation',
       'genre_arts martiaux', 'genre_aventure', 'genre_biopic',
       'genre_bollywood', 'genre_comédie', 'genre_comédie dramatique',
       'genre_comédie musicale', 'genre_divers', 'genre_drame',
       'genre_epouvante-horreur', 'genre_erotique', 'genre_espionnage',
       'genre_expérimental', 'genre_famille', 'genre_fantastique',
       'genre_guerre', 'genre_historique', 'genre_judiciaire', 'genre_musical',
       'genre_policier', 'genre_péplum', 'genre_romance',
       'genre_science fiction', 'genre_sport event', 'genre_thriller',
       'genre_western'],
      dtype='object')

In [4]:
# Importer les bibliothèques nécessaires
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
from optuna.samplers import TPESampler
import pandas as pd
import numpy as np

# Définir les paramètres du préprocesseur
numerical_features = ['duration', 'press_eval', 'views', 'budget','proportion_stars_actors', 'proportion_stars_producers', 'proportion_stars_director', 'distributor_avg_frequency']
categorical_features = ['nationality', 'season', 'is_holiday']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder="passthrough"
)
def objective(trial):
    # Définir les hyperparamètres à optimiser
    params = {
        'iterations': 1000,  # Nombre d'itérations (vous pouvez augmenter cela)
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'thread_count': 4,
        'verbose': False
    }
    
    # Instancier le modèle CatBoostRegressor avec les hyperparamètres spécifiés
    model = CatBoostRegressor(**params, random_state=42)
    

    # Créer le pipeline avec le préprocesseur et le modèle
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Entraîner le modèle
    pipeline.fit(X_train, y_train)

    # Faire des prédictions sur les données de validation
    y_pred = pipeline.predict(X_test)

    # Calculer la métrique d'évaluation (par exemple, le R2 score)
    score = r2_score(y_test, y_pred)

    return score

# Créer l'étude Optuna
sampler = TPESampler(seed=42)  # Make the sampler behave in a deterministic way.
study = optuna.create_study(sampler=sampler, direction='maximize')

# Lancer l'optimisation des hyperparamètres
study.optimize(objective, n_trials=100)

# Obtenir les meilleurs hyperparamètres trouvés
best_params = study.best_params

# Créer le pipeline final avec les meilleurs hyperparamètres
best_model = CatBoostRegressor(**best_params)

pipeline_final = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model)
])

# Entraîner le modèle final sur l'ensemble des données d'entraînement
pipeline_final.fit(X_train, y_train)

# Faire des prédictions sur de nouvelles données
y_pred = pipeline_final.predict(X_test)

# Calculer la métrique d'évaluation finale (par exemple, le R2 score)
final_r2 = r2_score(y_test, y_pred)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Meilleurs hyperparamètres trouvés :", best_params)
print("R2 Score sur les données de test :", final_r2)
print(final_rmse)



[I 2023-07-26 15:02:13,889] A new study created in memory with name: no-name-ff96fe29-0344-4394-a531-e338158411ca
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
[I 2023-07-26 15:02:25,543] Trial 0 finished with value: 0.6652627871910658 and parameters: {'learning_rate': 0.03574712922600244, 'depth': 10, 'l2_leaf_reg': 0.8471801418819978, 'random_strength': 0.24810409748678125, 'border_count': 66}. Best is trial 0 with value: 0.6652627871910658.
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
  'random_strength': trial.suggest_loguniform('random_strength', 1e-3, 10.0),
[I 2023-07-26 15:02:26,596] Trial 1 finished with value: 0.6655200188290804 and parameters: {'learning_rate': 0.01699897838270077, 'depth': 3, 'l2_leaf_reg'

0:	learn: 266949.6283048	total: 2.33ms	remaining: 2.32s
1:	learn: 255360.5339214	total: 4.88ms	remaining: 2.44s
2:	learn: 246571.6734912	total: 7.22ms	remaining: 2.4s
3:	learn: 239184.3155529	total: 9.63ms	remaining: 2.4s
4:	learn: 230933.0696324	total: 11.8ms	remaining: 2.36s
5:	learn: 224076.0710797	total: 15ms	remaining: 2.48s
6:	learn: 217259.4586800	total: 17.5ms	remaining: 2.48s
7:	learn: 211420.2269282	total: 19.9ms	remaining: 2.46s
8:	learn: 206143.2579247	total: 22.4ms	remaining: 2.46s
9:	learn: 201706.4671104	total: 25.2ms	remaining: 2.5s
10:	learn: 198239.7810476	total: 27.4ms	remaining: 2.47s
11:	learn: 194354.9458537	total: 30.2ms	remaining: 2.49s
12:	learn: 191329.9850967	total: 32.6ms	remaining: 2.48s
13:	learn: 188852.8464933	total: 34.5ms	remaining: 2.43s
14:	learn: 185984.5445840	total: 37ms	remaining: 2.43s
15:	learn: 184010.2408058	total: 39.6ms	remaining: 2.44s
16:	learn: 181806.2826624	total: 43.4ms	remaining: 2.51s
17:	learn: 179874.1463286	total: 46.3ms	remainin

In [None]:
# Importer les bibliothèques nécessaires
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
from optuna.samplers import TPESampler
import pandas as pd
import numpy as np

# Définir les paramètres du préprocesseur
numerical_features = ['duration', 'press_eval', 'views', 'budget', 'proportion_stars_actors', 'proportion_stars_producers', 'proportion_stars_director', 'distributor_avg_frequency']
categorical_features = ['nationality', 'season', 'is_holiday']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder="passthrough"
)

best_params = {
    'n_estimators': 100,
    'criterion': 'squared_error',
    'max_depth': 14,
    'min_samples_split': 3,
    'min_samples_leaf': 3,
    'max_features': None,
    'bootstrap': False
}
# Créer le modèle avec les meilleurs hyperparamètres
model = ExtraTreesRegressor(
    n_estimators=best_params['n_estimators'],
    criterion=best_params['criterion'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    bootstrap=best_params['bootstrap']
)

pipe = Pipeline([
    ('preprocessor',preprocessor),
    ('model', model)
])



In [None]:
import mlflow

experiment_id = mlflow.set_experiment("cinema").experiment_id

run_name = "extratree_std_onehot"

with mlflow.start_run(experiment_id=experiment_id, run_name=run_name) as run:
    # Log the baseline model to MLflow
    pipe.fit(X_train, y_train)
    
    
    mlflow.sklearn.log_model(pipe, run_name)

    
    model_uri = mlflow.get_artifact_uri(run_name)

    # Log des paramètres
    mlflow.log_params({'n_estimators': 100,'criterion': 'squared_error','max_depth': 14, 'min_samples_split': 3,'min_samples_leaf': 3,
    'max_features': None,
    'bootstrap': False})
    
    
    # Log des Tags
    mlflow.set_tag("model","extra_tree")
    mlflow.set_tag("scaler","StandardScaler")
    mlflow.set_tag("encoder",'OneHotEncoder')
    mlflow.set_tag('df','sans_meteo')

    eval_data = X_test
    eval_data["label"] = y_test

    # Evaluate the logged model
    result = mlflow.evaluate(
        model_uri,
        eval_data,
        targets="label",
        model_type="regressor",
        evaluators=["default"],
   )

2023/07/26 14:02:27 INFO mlflow.models.evaluation.base: Evaluating the model with the default evaluator.
