In [1]:
import pandas as pd 

df = pd.read_csv('df_model.csv',index_col=0 )
from sklearn.model_selection import train_test_split

X = df.drop(['box_office_first_week','genre','Lille', 'Bordeaux', 'Lyon', 'Paris'],axis=1)
y = df['box_office_first_week']
X_train,X_test, y_train,y_test = train_test_split(X,y, train_size=0.9,shuffle=True, random_state=42)

In [2]:
# Importer les bibliothèques nécessaires
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score
import optuna
from optuna.samplers import TPESampler
import pandas as pd

# Définir les paramètres du préprocesseur
numerical_features = ['duration', 'press_eval', 'viewers_eval', 'views', 'budget', 'Temperature Moyenne', 'proportion_stars_actors', 'proportion_stars_producers', 'proportion_stars_director', 'distributor_avg_frequency']
categorical_features = ['nationality', 'season', 'is_holiday']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder="passthrough"
)

# Effectuer le prétraitement sur les données d'entraînement pour obtenir les noms de colonnes après l'encodage one-hot
X_train_transformed = preprocessor.fit_transform(X_train)

# Obtenir les noms de colonnes après l'encodage one-hot
ohe = preprocessor.named_transformers_['cat']
ohe_feature_names = ohe.get_feature_names_out(input_features=categorical_features)

# Combiner les noms de colonnes numériques avec ceux de l'encodage one-hot
all_feature_names = numerical_features + list(ohe_feature_names)

# Définir la fonction objective pour Optuna
def objective(trial):
    # Définir les hyperparamètres à optimiser pour la feature selection
    feature_selection_params = {
        'max_features': trial.suggest_int('max_features', 1, len(all_feature_names)),
        'feature_importance_threshold': trial.suggest_float('feature_importance_threshold', 0.0, 0.1)
    }
    
    # Créer le modèle ExtraTreesRegressor avec les autres hyperparamètres
    params = {
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 200, 300, 400, 500]),
        'criterion': trial.suggest_categorical('criterion', ['poisson', 'friedman_mse', 'squared_error', 'absolute_error']),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'random_state': 42
    }
    model = ExtraTreesRegressor(**params)

    # Créer le pipeline avec le préprocesseur et le modèle
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Sélectionner les colonnes du DataFrame d'entraînement en fonction des features sélectionnées
    selected_features = all_feature_names[:feature_selection_params['max_features']]
    X_train_selected = X_train_transformed[:, [all_feature_names.index(feat) for feat in selected_features]]

    # Entraîner le modèle sur les features sélectionnées
    model.fit(X_train_selected, y_train)

    # Faire des prédictions sur les données de validation
    X_test_transformed = preprocessor.transform(X_test)
    X_test_selected = X_test_transformed[:, [all_feature_names.index(feat) for feat in selected_features]]
    y_pred = model.predict(X_test_selected)

    # Calculer la métrique d'évaluation (par exemple, le R2 score)
    score = r2_score(y_test, y_pred)

    return score

# Créer l'étude Optuna
sampler = TPESampler(seed=42)  # Make the sampler behave in a deterministic way.
study = optuna.create_study(sampler=sampler, direction='maximize')

# Lancer l'optimisation des hyperparamètres
study.optimize(objective, n_trials=100)

# Obtenir les meilleurs hyperparamètres trouvés
best_params = study.best_params

# Créer le modèle final avec les meilleurs hyperparamètres
best_model = ExtraTreesRegressor(**best_params)

pipeline_final = Pipeline([
    ('preprocessor', preprocessor),
    ('model', best_model)
])

# Entraîner le modèle final sur l'ensemble des données d'entraînement
pipeline_final.fit(X_train, y_train)

# Faire des prédictions sur de nouvelles données
y_pred = pipeline_final.predict(X_test)

# Calculer la métrique d'évaluation finale (par exemple, le R2 score)
final_score = r2_score(y_test, y_pred)

print("Meilleurs hyperparamètres trouvés :", best_params)
print("R2 Score sur les données de test :", final_score)




[I 2023-07-25 15:33:29,423] A new study created in memory with name: no-name-8c92f3f1-e0b0-46a1-a0c8-ac1033f8683f
[W 2023-07-25 15:33:29,427] Trial 0 failed with parameters: {'max_features': 29, 'feature_importance_threshold': 0.09507143064099162, 'n_estimators': 100, 'criterion': 'poisson', 'max_depth': 15, 'min_samples_split': 17, 'min_samples_leaf': 3} because of the following error: ValueError('Cannot set different distribution kind to the same parameter name.').
Traceback (most recent call last):
  File "/home/apprenant/miniconda3/envs/machine-learning/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_68732/3208189870.py", line 51, in objective
    'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
  File "/home/apprenant/miniconda3/envs/machine-learning/lib/python3.10/site-packages/optuna/trial/_trial.py", line 405, in suggest_categorical
    return self._sugges

ValueError: Cannot set different distribution kind to the same parameter name.

In [9]:
# Importer les bibliothèques nécessaires
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score, mean_squared_error
import optuna
from optuna.samplers import TPESampler
import pandas as pd
import numpy as np

# Définir les paramètres du préprocesseur
numerical_features = ['duration', 'press_eval', 'views', 'budget', 'Temperature Moyenne', 'proportion_stars_actors', 'proportion_stars_producers', 'proportion_stars_director', 'distributor_avg_frequency']
categorical_features = ['nationality', 'season', 'is_holiday']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder="passthrough"
)

model = ExtraTreesRegressor()

pipe = Pipeline([
    ('preprocessor',preprocessor),
    ('model', model)
])

# Entraîner le modèle final sur l'ensemble des données d'entraînement
pipe.fit(X_train, y_train)

# Faire des prédictions sur de nouvelles données
y_pred = pipe.predict(X_test)

# Calculer la métrique d'évaluation finale (par exemple, le R2 score)
final_score = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(rmse, final_score)


171911.18665790447 0.725774124615053


In [6]:
final_score

0.7310790986550693