In [1]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Load the movie table and discard rows without a target value, since a row
# with no 'audience_rating' cannot be used for supervised training or scoring.
file_path = 'Rotten_Tomatoes_Movies3.csv'
df = pd.read_csv(file_path).dropna(subset=['audience_rating'])

# Target is the 'audience_rating' column; every other column is a candidate feature.
y = df['audience_rating']
X = df.drop(columns=['audience_rating'])

# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Column groups are derived from the dtypes of X so each group receives its
# own preprocessing. 'number' matches every numeric dtype rather than only
# int64/float64: a column stored as e.g. int32 or float32 would otherwise fall
# into neither list and be silently discarded, because ColumnTransformer drops
# any column that is not assigned to a transformer by default.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' lets categories that were not seen
# during fit pass through transform without raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each transformer to its own column group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Full model: preprocessing followed by a seeded random-forest regressor.
# Keeping both steps in one Pipeline means the preprocessing is re-fitted on
# whatever data the pipeline is fitted on.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Baseline check: 5-fold cross-validated R^2 of the untuned pipeline, computed
# on the training split only so the test split stays untouched.
cv_scores = cross_val_score(
    estimator=model_pipeline,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
print("Cross-Validation R^2 Scores:", cv_scores)
print("Mean Cross-Validation R^2 Score:", cv_scores.mean())

# Candidate hyper-parameters for the pipeline step named 'regressor'
# (3 x 3 x 3 = 27 combinations); the 'regressor__' prefix addresses that step.
param_grid = dict(
    regressor__n_estimators=[50, 100, 200],
    regressor__max_depth=[None, 10, 20],
    regressor__min_samples_split=[2, 5, 10],
)

# Exhaustive search with 3-fold cross-validation on the training split, scored
# by negated mean squared error and run on all available cores.
grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Take the best pipeline found by the grid search and score it on the
# held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse, r2 = (
    mean_squared_error(y_true=y_test, y_pred=y_pred),
    r2_score(y_true=y_test, y_pred=y_pred),
)

print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

# Persist the whole fitted pipeline (preprocessing + regressor) to disk.
joblib.dump(value=best_model, filename='audience_rating_pipeline.joblib')
print("Pipeline saved as 'audience_rating_pipeline.joblib'")


Cross-Validation R^2 Scores: [0.47675172 0.49452793 0.47921292 0.50577859 0.50847733]
Mean Cross-Validation R^2 Score: 0.4929496981147029
Best Parameters: {'regressor__max_depth': 20, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 200}
Mean Squared Error: 207.68653618765592
R^2 Score: 0.4967798727013619
Pipeline saved as 'audience_rating_pipeline.joblib'
