In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np

# chargement jeu de donnée avec le moins de features
df = pd.read_csv('movies_clean_final2.csv')
df.head(20)



Unnamed: 0,acteurs,budget,compositeur,duree,entrees_premiere_semaine,franchise,genre,pays,producteur,realisateur,remake,salles_premiere_semaine,studio,titre,scoring_acteurs,scoring_acteurs_realisateurs,season,coeff_studio,year
0,"Dany Boon, Kad Merad, Michel Galabru, Jérôme C...",12547392,,106,4378720,,Comédie,France,Claude Berri,Dany Boon,Remake,793,Pathé,Bienvenue chez les Ch'tis,0.275862,0.482759,Hiver,2,2008
1,"Christian Clavier, Jamel Debbouze, Gérard Depa...",50300000,,107,3685097,Franchise,Comédie,France,Claude Berri,Alain Chabat,,945,Pathé,Astérix et Obélix: Mission Cléopatre,1.310345,1.344828,Hiver,2,2002
2,"Ewan McGregor, Natalie Portman, Samuel L. Jackson",113000000,John Williams,146,3303005,Franchise,Fantasy,Etats-Unis,George Lucas,George Lucas,,938,20th Century Fox,La Revanche des Sith,0.448276,0.448276,Printemps,3,2005
3,"Chris Evans, Chris Hemsworth, Robert Downey Jr...",356000000,Alan Silvestri,181,3426471,Franchise,Comicbook,Etats-Unis,Kevin Feige,Russo (brothers),,633,Walt Disney Pictures,Avengers: Endgame,0.931034,0.931034,Printemps,3,2019
4,"Sam Worthington, Sigourney Weaver, Kate Winslet",350000000,,192,2739848,Franchise,Science Fiction,Etats-Unis,,James Cameron,,762,Walt Disney Pictures,Avatar : la voie de l'eau,0.448276,0.724138,Hiver,3,2022
5,"Benedict Cumberbatch, Tom Holland, Willem Dafo...",200000000,,148,2867515,Franchise,Comicbook,Etats-Unis,Kevin Feige,Jon Watts,,882,Sony Pictures,Spider-Man: No Way Home,0.0,0.0,Hiver,3,2021
6,"Daniel Radcliffe, Emma Watson, Helena Bonham C...",200000000,,153,2882397,Franchise,Fantasy,Etats-Unis,,David Yates,,949,Warner Bros.,Harry Potter et le Prince de sang-mêlé,0.482759,0.482759,Été,3,2009
7,"Orlando Bloom, Viggo Mortensen, Ian McKellen, ...",94000000,Howard Shore,201,2852784,Franchise,Fantasy,Etats-Unis,,Peter Jackson,,993,Metropolitan,Le Seigneur des anneaux: Le Retour du roi,1.034483,1.034483,Hiver,0,2003
8,"Richard Anconina, José Garcia, Gad Elmaleh",12560000,,105,2830489,Franchise,Comédie,France,,Thomas Gilou,,827,Warner Bros.,La Vérité si je mens! 2,0.034483,0.034483,Hiver,3,2001
9,"Gérard Depardieu, Christian Clavier, Roberto B...",41900000,,109,2718443,Franchise,Comédie,France,Claude Berri,Claude Zidi,,780,Pathé,Astérix et Obélix contre César,0.517241,0.517241,Hiver,2,1999


In [2]:
df.shape

(3792, 19)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
import pandas as pd
df = pd.read_csv('movies_clean_final2.csv')

df = df.drop(['acteurs', 'scoring_acteurs', 'realisateur', 'titre'], axis=1)

# Gestion des valeurs manquantes pour les colonnes numériques et catégorielles
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.drop('entrees_premiere_semaine')
numeric_transformer = SimpleImputer(strategy='median')

categorical_features = df.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Préparation du préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    #     ('poly', PolynomialFeatures(degree=2, include_bias=False), numeric_features)
    ])

# Modèle de régression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=100, random_state=0))])

# Séparation des données
X = df.drop('entrees_premiere_semaine', axis=1)
y = df['entrees_premiere_semaine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Entraînement du modèle
model.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluation du modèle
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
print(mae)
print(r2)
print(rmse)

128319.3575889328
0.7519012677070012
247292.37257577275


In [4]:
feature_names = model[:-1].get_feature_names_out(X.columns)
feature_names
len(feature_names)


190

In [5]:
coefficients = model[-1].feature_importances_
result = list(zip(feature_names, coefficients))
result

[('num__budget', 0.050418481290162284),
 ('num__duree', 0.025999528540838816),
 ('num__salles_premiere_semaine', 0.6934885485697208),
 ('num__scoring_acteurs_realisateurs', 0.03175556613358896),
 ('num__coeff_studio', 0.0036150319766448877),
 ('num__year', 0.061647089275761374),
 ('cat__compositeur_- Junkie XL', 0.00013817391540543633),
 ('cat__compositeur_Alan Silvestri', 0.001389069480538742),
 ('cat__compositeur_Alexandre Desplat', 0.0018704199361998278),
 ('cat__compositeur_Brian Tyler', 0.002363948016501756),
 ('cat__compositeur_Danny Elfman', 0.0009759048708343151),
 ('cat__compositeur_Ennio Morricone', 7.32446525450476e-06),
 ('cat__compositeur_Eric Serra', 0.0017001215431199446),
 ('cat__compositeur_Hans Zimmer', 0.0019232986953777401),
 ('cat__compositeur_Howard Shore', 0.00041628631249805513),
 ('cat__compositeur_James Horner', 8.841806492219165e-05),
 ('cat__compositeur_James Newton Howard', 0.0005914045171668029),
 ('cat__compositeur_Jerry Goldsmith', 4.1083425472999775e-05

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, r2_score
import pandas as pd
df = pd.read_csv('movies_clean_final2.csv')

df = df.drop(['acteurs', 'scoring_acteurs', 'realisateur', 'titre', 'season', 'studio', 'compositeur', 'producteur'], axis=1)

# Gestion des valeurs manquantes pour les colonnes numériques et catégorielles
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.drop('entrees_premiere_semaine')
numeric_transformer = SimpleImputer(strategy='median')

categorical_features = df.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Préparation du préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    #     ('poly', PolynomialFeatures(degree=2, include_bias=False), numeric_features)
    ])

# Modèle de régression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(n_estimators=100, random_state=0))])

# Séparation des données
X = df.drop('entrees_premiere_semaine', axis=1)
y = df['entrees_premiere_semaine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Entraînement du modèle
model.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluation du modèle
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
print(mae)
print(r2)
print(rmse)


130583.11869565219
0.7525548813753369
246966.41308912687


In [7]:
feature_names = model[:-1].get_feature_names_out(X.columns)
feature_names
len(feature_names)


56

In [8]:
coefficients = model[-1].feature_importances_
result = list(zip(feature_names, coefficients))
result

[('num__budget', 0.06726888946890146),
 ('num__duree', 0.03896811110107842),
 ('num__salles_premiere_semaine', 0.7093471504678701),
 ('num__scoring_acteurs_realisateurs', 0.04400284663332199),
 ('num__coeff_studio', 0.006771001868861111),
 ('num__year', 0.0736403592023163),
 ('cat__franchise_Franchise', 0.007240196290191954),
 ('cat__franchise_missing', 0.005951776336230578),
 ('cat__genre_Animation', 0.0035396342751897168),
 ('cat__genre_Aventure - Action', 0.0031440621835093032),
 ('cat__genre_Catastrophe', 0.0009556865260238798),
 ('cat__genre_Comicbook', 0.0038069279152924676),
 ('cat__genre_Comédie', 0.00405032549438643),
 ('cat__genre_Comédie dramatique', 0.0006185093030902756),
 ('cat__genre_Court-métrage', 1.514952464727042e-08),
 ('cat__genre_Documentaire', 6.177113335894321e-05),
 ('cat__genre_Drame', 0.002114110772440697),
 ('cat__genre_Fantasy', 0.00771508945848083),
 ('cat__genre_Film familial', 0.0012013333243301079),
 ('cat__genre_Guerre', 0.00015945297645112152),
 ('cat

In [9]:
new_films = pd.DataFrame({
    'titre': ['NOUS, LES LEROY'],  # Assurez-vous que les valeurs soient dans des listes
    'budget': [None],
    'genre': ['Comédie'],
    'pays': ['France'],
    'duree' : [103],
    #'acteurs': [['George Clooney', 'CA', 'NJ']],  # Cela dépend de comment vous avez traité cette caractéristique
    'producteur': ['missing'],
    # 'realisateur': ['Florent Bernard'],
    'compositeur': ['missing'],
    'studio': ['Apollo Films'],
    'coeff_studio': [1],
    'year': [2024],
    'season': ['Printemps'],
    'scoring_acteurs_realisateurs': [0.5],
    'remake': ['missing'],
    'franchise': ['missing'],
    'salles_premiere_semaine' : [414]
})
# faire des prédictions avec le modèle optimisé
new_y_pred = model.predict(new_films)

# ajouter les prédictions à new_data
new_films['pred_entrees'] = new_y_pred
new_films['pred_entrees_dans_cine_jour'] = (new_y_pred/2000)/7
print(new_films)

             titre budget    genre    pays  duree producteur compositeur  \
0  NOUS, LES LEROY   None  Comédie  France    103    missing     missing   

         studio  coeff_studio  year     season  scoring_acteurs_realisateurs  \
0  Apollo Films             1  2024  Printemps                           0.5   

    remake franchise  salles_premiere_semaine  pred_entrees  \
0  missing   missing                      414     172324.08   

   pred_entrees_dans_cine_jour  
0                    12.308863  


In [10]:
new_films = pd.DataFrame({
    'titre': ['S.O.S. Fantômes : La Menace de glace'],  # Assurez-vous que les valeurs soient dans des listes
    'budget': [None],
    'genre': ['Action'],
    'pays': ['Etats-Unis'],
    'duree' : [116],
    #'acteurs': [['George Clooney', 'CA', 'NJ']],  # Cela dépend de comment vous avez traité cette caractéristique
    'producteur': ['missing'],
    'realisateur': ['Gil Kenan'],
    'compositeur': ['missing'],
    'studio': ['Sony Pictures'],
    'coeff_studio': [2],
    'year': [2024],
    'season': ['Printemps'],
    'scoring_acteurs_realisateurs': [0.5],
    'remake': ['missing'],
    'franchise': ['Franchise'],
    'salles_premiere_semaine' : [670]
})
# faire des prédictions avec le modèle optimisé
new_y_pred = model.predict(new_films)

# ajouter les prédictions à new_data
new_films['pred_entrees'] = new_y_pred
new_films['pred_entrees_dans_cine_jour'] = (new_y_pred/2000)/7
print(new_films)

                                  titre budget   genre        pays  duree  \
0  S.O.S. Fantômes : La Menace de glace   None  Action  Etats-Unis    116   

  producteur realisateur compositeur         studio  coeff_studio  year  \
0    missing   Gil Kenan     missing  Sony Pictures             2  2024   

      season  scoring_acteurs_realisateurs   remake  franchise  \
0  Printemps                           0.5  missing  Franchise   

   salles_premiere_semaine  pred_entrees  pred_entrees_dans_cine_jour  
0                      670     591069.74                    42.219267  


In [11]:
new_films = pd.DataFrame({
    'titre': ['Civil War', 'Hopeless', 'SPY x FAMILY CODE: White', 'Borgo', 'LaRoy'], # Assurez-vous que les valeurs soient dans des listes
    'budget': [None, None, None, None, None],
    'genre': ['Action', 'Policier', 'Animation', 'Drame', 'Policier'],
    'pays': ['Etats-Unis', 'Coree', 'Japon', 'France', 'Etats-Unis'],
    'duree' : [109, 124, 110, 118, 112],
    #'acteurs': [['George Clooney', 'CA', 'NJ']],  # Cela dépend de comment vous avez traité cette caractéristique
    'producteur': ['missing', 'missing', 'missing', 'missing', 'missing'],
    'realisateur': ['Alex Garland', 'Chang-hoon Kim', 'Kazuhiro Furuhashi', 'Stéphane Demoustier', 'Shane Atkinson'],
    'compositeur': ['missing', 'missing', 'missing', 'missing', 'missing'],
    'studio': ['Metropolitan', 'Bac Films', 'Sony Pictures', 'Le Pacte', 'ARP Selection'],
    'coeff_studio': [2, 1, 3, 1, 0],
    'year': [2024, 2024, 2024, 2024, 2024],
    'season': ['Printemps', 'Printemps', 'Printemps', 'Printemps', 'Printemps'],
    'scoring_acteurs_realisateurs': [0.5, 0, 0, 0, 0],
    'remake': ['missing', 'missing', 'missing', 'missing', 'missing'],
    'franchise': ['missing', 'missing', 'missing', 'missing', 'missing'],
    'salles_premiere_semaine' : [680, 41, 346, 380, 146]
})
# faire des prédictions avec le modèle optimisé
new_y_pred = model.predict(new_films)

# ajouter les prédictions à new_data
new_films['pred_entrees'] = new_y_pred
new_films['pred_entrees_dans_cine_jour'] = (new_y_pred/2000)/7
display(new_films)

Unnamed: 0,titre,budget,genre,pays,duree,producteur,realisateur,compositeur,studio,coeff_studio,year,season,scoring_acteurs_realisateurs,remake,franchise,salles_premiere_semaine,pred_entrees,pred_entrees_dans_cine_jour
0,Civil War,,Action,Etats-Unis,109,missing,Alex Garland,missing,Metropolitan,2,2024,Printemps,0.5,missing,missing,680,543935.55,38.852539
1,Hopeless,,Policier,Coree,124,missing,Chang-hoon Kim,missing,Bac Films,1,2024,Printemps,0.0,missing,missing,41,28079.21,2.005658
2,SPY x FAMILY CODE: White,,Animation,Japon,110,missing,Kazuhiro Furuhashi,missing,Sony Pictures,3,2024,Printemps,0.0,missing,missing,346,121721.95,8.694425
3,Borgo,,Drame,France,118,missing,Stéphane Demoustier,missing,Le Pacte,1,2024,Printemps,0.0,missing,missing,380,141583.7,10.113121
4,LaRoy,,Policier,Etats-Unis,112,missing,Shane Atkinson,missing,ARP Selection,0,2024,Printemps,0.0,missing,missing,146,42437.65,3.031261


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
import pandas as pd
df = pd.read_csv('movies_clean_final2.csv')

df = df.drop(['acteurs', 'scoring_acteurs', 'realisateur', 'studio'], axis=1)

# Gestion des valeurs manquantes pour les colonnes numériques et catégorielles
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.drop('entrees_premiere_semaine')
numeric_transformer = SimpleImputer(strategy='median')

categorical_features = df.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Préparation du préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        # ('poly', PolynomialFeatures(degree=1, include_bias=False), numeric_features)  # Ajouter cette ligne pour les fonctionnalités polynomiales
    ])

# Modèle de régression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', CatBoostRegressor())])

# Séparation des données
X = df.drop('entrees_premiere_semaine', axis=1)
y = df['entrees_premiere_semaine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Entraînement du modèle
model.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluation du modèle
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mae
r2 = r2_score(y_test, y_pred)
print(r2)
print(rmse)



Learning rate set to 0.048788
0:	learn: 416482.1028476	total: 158ms	remaining: 2m 37s
1:	learn: 404944.1205715	total: 167ms	remaining: 1m 23s
2:	learn: 393458.6344834	total: 176ms	remaining: 58.6s
3:	learn: 383354.2785887	total: 185ms	remaining: 46.1s
4:	learn: 373625.7247013	total: 194ms	remaining: 38.7s
5:	learn: 364799.2958064	total: 204ms	remaining: 33.8s
6:	learn: 356056.6909422	total: 214ms	remaining: 30.3s
7:	learn: 348079.5770804	total: 223ms	remaining: 27.6s
8:	learn: 340785.8814057	total: 232ms	remaining: 25.6s
9:	learn: 334314.5880784	total: 241ms	remaining: 23.9s
10:	learn: 327310.1794592	total: 250ms	remaining: 22.5s
11:	learn: 321214.7227200	total: 259ms	remaining: 21.3s
12:	learn: 315762.6016698	total: 268ms	remaining: 20.3s
13:	learn: 310337.5661347	total: 277ms	remaining: 19.5s
14:	learn: 305162.6500415	total: 285ms	remaining: 18.7s
15:	learn: 300669.8829660	total: 294ms	remaining: 18.1s
16:	learn: 296129.8763633	total: 303ms	remaining: 17.5s
17:	learn: 291770.1612732	

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_absolute_error
import pandas as pd
df = pd.read_csv('movies_clean_final2.csv')

df = df.drop(['acteurs', 'scoring_acteurs', 'realisateur', 'studio'], axis=1)

# Gestion des valeurs manquantes pour les colonnes numériques et catégorielles
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.drop('entrees_premiere_semaine')
numeric_transformer = SimpleImputer(strategy='median')

categorical_features = df.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Préparation du préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Modèle de régression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', xgboost.XGBRegressor())])

# Séparation des données
X = df.drop('entrees_premiere_semaine', axis=1)
y = df['entrees_premiere_semaine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

parameters = {'regressor__nthread':[4], #when use hyperthread, xgboost may become slower
              'regressor__objective':['reg:linear'],
              'regressor__learning_rate': [.03, 0.05, .07], #so called `eta` value
              'regressor__max_depth': [5, 6, 7],
              'regressor__min_child_weight': [4],
              'regressor__silent': [1],
              'regressor__subsample': [0.7],
              'regressor__colsample_bytree': [0.7],
              'regressor__n_estimators': [500]}


# Créer une instance de GridSearchCV
grid_search = GridSearchCV(model, parameters, cv=5, n_jobs=5, verbose=True)

# Effectuer la recherche sur la grille
grid_search.fit(X_train, y_train)

# Meilleurs hyperparamètres trouvés
best_params = grid_search.best_params_

# Meilleur score obtenu sur l'ensemble d'entraînement
best_score = grid_search.best_score_

# Meilleur modèle
best_model = grid_search.best_estimator_

# Utiliser le meilleur modèle pour prédire les étiquettes sur l'ensemble de test
y_pred = best_model.predict(X_test)

print("Meilleurs hyperparamètres:", best_params)
print("Meilleur score sur l'ensemble d'entraînement:", best_score)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.



KeyboardInterrupt: 


KeyboardInterrupt



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
import pandas as pd
df = pd.read_csv('movies_clean_final2.csv')

df = df.drop(['acteurs', 'scoring_acteurs', 'realisateur', 'studio'], axis=1)

# Gestion des valeurs manquantes pour les colonnes numériques et catégorielles
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.drop('entrees_premiere_semaine')
numeric_transformer = SimpleImputer(strategy='median')

categorical_features = df.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Préparation du préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('poly', PolynomialFeatures(degree=2, include_bias=False), numeric_features)  # Ajouter cette ligne pour les fonctionnalités polynomiales
    ])

# Modèle de régression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', Lasso(alpha=0.1))])

# Séparation des données
X = df.drop('entrees_premiere_semaine', axis=1)
y = df['entrees_premiere_semaine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Entraînement du modèle
model.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluation du modèle
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mae
r2 = r2_score(y_test, y_pred)
print(r2)
print(rmse)

0.5823497278784961
320852.0608012206


  model = cd_fast.sparse_enet_coordinate_descent(


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Lasso

from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
import pandas as pd
df = pd.read_csv('movies_clean_final2.csv')

df = df.drop(['acteurs', 'scoring_acteurs', 'realisateur', 'studio'], axis=1)

# Gestion des valeurs manquantes pour les colonnes numériques et catégorielles
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.drop('entrees_premiere_semaine')
numeric_transformer = SimpleImputer(strategy='median')

categorical_features = df.select_dtypes(include=['object']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Préparation du préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        # ('poly', PolynomialFeatures(degree=2, include_bias=False), numeric_features)  # Ajouter cette ligne pour les fonctionnalités polynomiales
    ])

# Modèle de régression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', KNeighborsRegressor(n_jobs=-1))])

# Séparation des données
X = df.drop('entrees_premiere_semaine', axis=1)
y = df['entrees_premiere_semaine']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Entraînement du modèle
model.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = model.predict(X_test)

# Évaluation du modèle
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
mae
r2 = r2_score(y_test, y_pred)
print(r2)
print(rmse)

0.36475791048545014
395701.85074606515


In [None]:
from pycaret.datasets import get_data
import pandas as pd 
df = df = pd.read_csv('movies_clean_final.csv')

df = df.drop(['acteurs', 'scoring_acteurs', 'realisateur', 'studio'], axis=1)

df.to_csv('df_pycaret.csv')

dataset = get_data('df_pycaret')
#check the shape of data
dataset.shape


Unnamed: 0.1,Unnamed: 0,budget,compositeur,date,duree,entrees_premiere_semaine,franchise,genre,pays,producteur,remake,salles_premiere_semaine,titre,scoring_acteurs_realisateurs,season,coeff_studio
0,0,12547392,,2008-02-27,106,4378720,,Comédie,France,Claude Berri,Remake,793,Bienvenue chez les Ch'tis,0.482759,Hiver,2
1,1,50300000,,2002-01-30,107,3685097,Franchise,Comédie,France,Claude Berri,,945,Astérix et Obélix: Mission Cléopatre,1.344828,Hiver,2
2,2,113000000,John Williams,2005-05-18,146,3303005,Franchise,Fantasy,Etats-Unis,George Lucas,,938,La Revanche des Sith,0.448276,Printemps,3
3,3,356000000,Alan Silvestri,2019-04-24,181,3426471,Franchise,Comicbook,Etats-Unis,Kevin Feige,,633,Avengers: Endgame,0.931034,Printemps,3
4,4,350000000,,2022-12-14,192,2739848,Franchise,Science Fiction,Etats-Unis,,,762,Avatar : la voie de l'eau,0.724138,Hiver,3


(4549, 16)

In [None]:
from pycaret.regression import *

s = setup(data = dataset, target = 'entrees_premiere_semaine', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,entrees_premiere_semaine
2,Target type,Regression
3,Original data shape,"(4549, 16)"
4,Transformed data shape,"(4549, 52)"
5,Transformed train set shape,"(3184, 52)"
6,Transformed test set shape,"(1365, 52)"
7,Numeric features,5
8,Categorical features,10
9,Rows with missing values,100.0%


In [None]:
best = compare_models()
print(best)

KNeighborsRegressor(n_jobs=-1)
