In [1]:
import pandas as pd 


df_cleaned = pd.read_csv('df_cleaned.csv',index_col=0)

In [2]:
from sklearn.model_selection import train_test_split

X = df_cleaned.drop(['box_office_first_week','title'],axis=1)
y = df_cleaned['box_office_first_week']
X_train,X_test, y_train,y_test = train_test_split(X,y, train_size=0.8,shuffle=True, random_state=42)

In [3]:
X.columns

Index(['release_date', 'genre', 'duration', 'director', 'producers', 'cast',
       'nationality', 'distributor', 'press_eval', 'viewers_eval', 'views',
       'budget', 'Temperature Moyenne', 'Lille', 'Bordeaux', 'Lyon', 'Paris',
       'saison', 'year'],
      dtype='object')

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

categorical_transformer = OneHotEncoder()
categorical_features = ['nationality', 'distributor', 'saison']

numerical_transformer = StandardScaler()
numerical_features = ['duration', 'press_eval', 'viewers_eval', 'views', 'budget', 'Temperature Moyenne', 'Lille', 'Bordeaux', 'Lyon', 'Paris', 'year']

# Fonction personnalisée pour appliquer MultiLabelBinarizer sur les colonnes spécifiées
def apply_mlb_on_columns(column):
    mlb = MultiLabelBinarizer()
    result = mlb.fit_transform(column)
    return pd.DataFrame(result, columns=mlb.classes_)

# Colonnes qui nécessitent MultiLabelBinarizer
multi_label_binarizer_features = ['genre', 'director', 'producers', 'cast']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
        ('mlb', apply_mlb_on_columns, multi_label_binarizer_features),  # Utilisation de la fonction personnalisée
    ],
    remainder="passthrough"
)

model = RandomForestRegressor(n_estimators=100, random_state=42)

pipe = Pipeline([
     ('preprocessor', preprocessor),
     ('model', model)
])

# Convertir les listes en tuples pour que MultiLabelBinarizer puisse les traiter correctement
X_train['genre'] = X_train['genre'].apply(tuple)
X_train['director'] = X_train['director'].apply(tuple)
X_train['producers'] = X_train['producers'].apply(tuple)
X_train['cast'] = X_train['cast'].apply(tuple)

# Convertir les listes en tuples pour les données de test également
X_test['genre'] = X_test['genre'].apply(tuple)
X_test['director'] = X_test['director'].apply(tuple)
X_test['producers'] = X_test['producers'].apply(tuple)
X_test['cast'] = X_test['cast'].apply(tuple)

pipe.fit(X_train, y_train)

# Faire des prédictions sur le test set
y_pred = pipe.predict(X_test)

# Évaluer les performances du modèle
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error:", mse)
print("R-squared:", r2)




TypeError: All estimators should implement fit and transform, or can be 'drop' or 'passthrough' specifiers. '<function apply_mlb_on_columns at 0x7f282cb797e0>' (type <class 'function'>) doesn't.