In [None]:

# ## Predicting Audience Ratings: Full Pipeline with Explanation

# ### Step 1: Import Libraries
# Import necessary Python libraries for data manipulation, model training, evaluation, and visualization.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# ### Step 2: Load the Dataset
# Read the spreadsheet named by `file_path` (a path relative to the working directory) into a DataFrame.
file_path = 'Rotten_Tomatoes_Movies3.xls'  # Update if necessary
data = pd.read_excel(io=file_path)

# Print a structural summary (columns, dtypes, non-null counts) followed by the first five rows.
data.info()
first_rows = data.head(n=5)
print(first_rows)

# ### Step 3: Handle Missing Values
# Report per-column null counts, then fill missing `audience_rating` values with the
# `tomatometer_rating` of the same row.
target = 'audience_rating'
print("Missing values:\n", data.isnull().sum())

# Assign the filled column back instead of calling `fillna(..., inplace=True)` on `data[target]`:
# the chained in-place form operates on an intermediate Series and is not guaranteed to update
# `data` (pandas copy-on-write).
# NOTE(review): this imputes the label from a feature; confirm that is the intended modelling choice.
data[target] = data[target].fillna(data['tomatometer_rating'])

# Rows where both ratings are missing still have no label. Estimators reject NaN targets, so
# those rows are dropped here, before features and target are separated, to keep them aligned.
data = data.dropna(subset=[target])

# ### Step 4: Define Target and Features
# The target is `audience_rating`; every other column is treated as a feature.
features = data.drop(columns=[target])
target_data = data[target]

# ### Step 5: Encode Categorical Variables
# Label-encode every object-dtype column into integer codes. Values are cast to `str` first so a
# column that mixes strings with numbers, dates or NaN still encodes (LabelEncoder requires
# uniformly typed input); NaN therefore becomes its own 'nan' category.
# The fitted encoders are kept per column so the same mapping can be re-applied later.
categorical_cols = features.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    features[col] = le.fit_transform(features[col].astype(str))
    label_encoders[col] = le

# Datetime columns are matched by neither the object nor the numeric selection; express them as
# fractional days since 1970-01-01 (NaT -> NaN) so the estimators receive plain numbers.
for col in features.select_dtypes(include=['datetime']).columns:
    features[col] = (features[col] - pd.Timestamp('1970-01-01')) / pd.Timedelta(days=1)

# ### Step 6: Split Data into Training and Testing Sets
# Hold out 20% of the rows for testing. The split happens BEFORE any statistics are learned, so
# the imputation values and the scaler below never see the test rows (no data leakage).
X_train, X_test, y_train, y_test = train_test_split(features, target_data, test_size=0.2, random_state=42)

# ### Step 7: Impute and Scale Numerical Features
# Fill missing numeric values with the training-split medians (several estimators reject NaN),
# then standardize with a scaler fitted on the training split only and apply that same
# transform to the test split.
numerical_cols = features.select_dtypes(include='number').columns
train_medians = X_train[numerical_cols].median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# ### Step 8: Define Models
# Registry of candidate regressors keyed by the display name used in reports and plots.
# Insertion order (Linear Regression, Random Forest, XGBoost) is the order they are trained in;
# the stochastic learners share one fixed seed.
seed = 42
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(random_state=seed)),
    ('XGBoost', XGBRegressor(random_state=seed)),
])

# ### Step 9: Train and Evaluate Models
# For every candidate: estimate generalization with 5-fold cross-validated R² on the training
# split, fit on the whole training split, then score the held-out test split.
results = {}
predictions_dict = {}
for name, model in models.items():
    pipeline = Pipeline([
        ('model', model)
    ])
    # cross_val_score fits clones of the pipeline, so `model` itself is still unfitted afterwards.
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    cv_mean, cv_std = scores.mean(), scores.std()

    # Pipeline does not clone its final step, so this call fits `model` (the object stored in
    # `models`) in place.
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    predictions_dict[name] = predictions

    # Held-out performance metrics
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    results[name] = {'R²': r2, 'MSE': mse, 'MAE': mae, 'CV R²': cv_mean, 'CV R² std': cv_std}

    print(f"{name} Performance:")
    print(f"CV R² (5-fold): {cv_mean:.4f} ± {cv_std:.4f}")
    print(f"R²: {r2}, MSE: {mse}, MAE: {mae}\n")

# ### Step 10: Compare Models
# Pick the winner by mean cross-validated R² rather than by test-set R²: selecting on the test
# split would make the test metrics reported for the chosen model optimistically biased.
best_model_name = max(results, key=lambda x: results[x]['CV R²'])
best_model = models[best_model_name]
print(f"Best Model: {best_model_name}")

# ### Step 11: Visualize Model Comparisons
# One bar chart per metric, with models ordered from highest to lowest R².
metrics_df = pd.DataFrame(results).T.sort_values(by='R²', ascending=False)

# Metric column -> chart title; dict order fixes the order the figures are drawn in (R², MSE, MAE).
chart_titles = {
    'R²': 'R² Scores by Model',
    'MSE': 'Mean Squared Error by Model',
    'MAE': 'Mean Absolute Error by Model',
}
for metric, chart_title in chart_titles.items():
    plt.figure(figsize=(10, 6))
    sns.barplot(x=metrics_df.index, y=metric, data=metrics_df, palette='viridis')
    plt.title(chart_title)
    plt.ylabel(metric)
    plt.xlabel('Model')
    plt.show()

# ### Step 12: Validate the Best Model
# `best_model` is the same estimator object that Step 9 already fitted on the training split
# (the Pipeline there fits its final step in place), so it is scored directly here instead of
# being trained a second time on identical data.
best_predictions = best_model.predict(X_test)

# Scatter Plot of Predictions vs Actual Values; the dashed diagonal marks perfect predictions.
lowest, highest = y_test.min(), y_test.max()
plt.figure(figsize=(10, 6))
plt.scatter(y_test, best_predictions, alpha=0.7, label='Predictions')
plt.plot([lowest, highest], [lowest, highest], color='red', linestyle='--', label='Ideal Fit')
plt.xlabel('Actual Ratings')
plt.ylabel('Predicted Ratings')
plt.title(f'{best_model_name} Predictions vs Actuals')
plt.legend()
plt.show()

# ### Step 13: Save Predictions and Model
# Write actual vs. predicted ratings for the test split to CSV (row index omitted) and serialize
# the fitted estimator with joblib.
import joblib

output = y_test.to_frame(name='Actual').assign(Predicted=best_predictions)
output.to_csv('predictions.csv', index=False)

# NOTE(review): only the estimator is persisted; the label encoders and scaler fitted during
# preprocessing are not written, so the pickle alone cannot score raw, unprocessed rows.
joblib.dump(best_model, 'best_model.pkl')

# ### Summary
# This notebook demonstrates a full pipeline for predicting `audience_rating` using multiple models. Model performances are compared using metrics and visualizations, the best model is selected, and its predictions are validated. The predictions and trained model are saved for future use.


In [None]:
# ## Predicting Audience Ratings: Full Pipeline with Explanation (extended comparison: adds Gradient Boosting and Support Vector Regressor)

# ### Step 1: Import Libraries
# Import necessary Python libraries for data manipulation, model training, evaluation, and visualization.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import seaborn as sns

# ### Step 2: Load the Dataset
# Read the spreadsheet named by `file_path` (a path relative to the working directory) into a DataFrame.
file_path = 'Rotten_Tomatoes_Movies3.xls'  # Update if necessary
data = pd.read_excel(io=file_path)

# Print a structural summary (columns, dtypes, non-null counts) followed by the first five rows.
data.info()
first_rows = data.head(n=5)
print(first_rows)

# ### Step 3: Handle Missing Values
# Report per-column null counts, then fill missing `audience_rating` values with the
# `tomatometer_rating` of the same row.
target = 'audience_rating'
print("Missing values:\n", data.isnull().sum())

# Assign the filled column back instead of calling `fillna(..., inplace=True)` on `data[target]`:
# the chained in-place form operates on an intermediate Series and is not guaranteed to update
# `data` (pandas copy-on-write).
# NOTE(review): this imputes the label from a feature; confirm that is the intended modelling choice.
data[target] = data[target].fillna(data['tomatometer_rating'])

# Rows where both ratings are missing still have no label. Estimators reject NaN targets, so
# those rows are dropped here, before features and target are separated, to keep them aligned.
data = data.dropna(subset=[target])

# ### Step 4: Define Target and Features
# The target is `audience_rating`; every other column is treated as a feature.
features = data.drop(columns=[target])
target_data = data[target]

# ### Step 5: Encode Categorical Variables
# Label-encode every object-dtype column into integer codes. Values are cast to `str` first so a
# column that mixes strings with numbers, dates or NaN still encodes (LabelEncoder requires
# uniformly typed input); NaN therefore becomes its own 'nan' category.
# The fitted encoders are kept per column so the same mapping can be re-applied later.
categorical_cols = features.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    features[col] = le.fit_transform(features[col].astype(str))
    label_encoders[col] = le

# Datetime columns are matched by neither the object nor the numeric selection; express them as
# fractional days since 1970-01-01 (NaT -> NaN) so the estimators receive plain numbers.
for col in features.select_dtypes(include=['datetime']).columns:
    features[col] = (features[col] - pd.Timestamp('1970-01-01')) / pd.Timedelta(days=1)

# ### Step 6: Split Data into Training and Testing Sets
# Hold out 20% of the rows for testing. The split happens BEFORE any statistics are learned, so
# the imputation values and the scaler below never see the test rows (no data leakage).
X_train, X_test, y_train, y_test = train_test_split(features, target_data, test_size=0.2, random_state=42)

# ### Step 7: Impute and Scale Numerical Features
# Fill missing numeric values with the training-split medians (several estimators reject NaN),
# then standardize with a scaler fitted on the training split only and apply that same
# transform to the test split.
numerical_cols = features.select_dtypes(include='number').columns
train_medians = X_train[numerical_cols].median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# ### Step 8: Define Models
# Registry of candidate regressors keyed by the display name used in reports and plots.
# Insertion order (Linear Regression, Random Forest, Gradient Boosting, Support Vector Regressor,
# XGBoost) is the order they are trained in; the seeded learners share one fixed seed.
seed = 42
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(random_state=seed)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=seed)),
    ('Support Vector Regressor', SVR()),
    ('XGBoost', XGBRegressor(random_state=seed)),
])

# ### Step 9: Train and Evaluate Models
# For every candidate: estimate generalization with 5-fold cross-validated R² on the training
# split, fit on the whole training split, then score the held-out test split.
results = {}
predictions_dict = {}
for name, model in models.items():
    pipeline = Pipeline([
        ('model', model)
    ])
    # cross_val_score fits clones of the pipeline, so `model` itself is still unfitted afterwards.
    scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='r2')
    cv_mean, cv_std = scores.mean(), scores.std()

    # Pipeline does not clone its final step, so this call fits `model` (the object stored in
    # `models`) in place.
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)
    predictions_dict[name] = predictions

    # Held-out performance metrics
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    results[name] = {'R²': r2, 'MSE': mse, 'MAE': mae, 'CV R²': cv_mean, 'CV R² std': cv_std}

    print(f"{name} Performance:")
    print(f"CV R² (5-fold): {cv_mean:.4f} ± {cv_std:.4f}")
    print(f"R²: {r2}, MSE: {mse}, MAE: {mae}\n")

# ### Step 10: Compare Models
# Pick the winner by mean cross-validated R² rather than by test-set R²: selecting on the test
# split would make the test metrics reported for the chosen model optimistically biased.
best_model_name = max(results, key=lambda x: results[x]['CV R²'])
best_model = models[best_model_name]
print(f"Best Model: {best_model_name}")

# ### Step 11: Visualize Model Comparisons
# One bar chart per metric, with models ordered from highest to lowest R².
metrics_df = pd.DataFrame(results).T.sort_values(by='R²', ascending=False)

# Metric column -> chart title; dict order fixes the order the figures are drawn in (R², MSE, MAE).
chart_titles = {
    'R²': 'R² Scores by Model',
    'MSE': 'Mean Squared Error by Model',
    'MAE': 'Mean Absolute Error by Model',
}
for metric, chart_title in chart_titles.items():
    plt.figure(figsize=(10, 6))
    sns.barplot(x=metrics_df.index, y=metric, data=metrics_df, palette='viridis')
    plt.title(chart_title)
    plt.ylabel(metric)
    plt.xlabel('Model')
    plt.show()

# ### Step 12: Validate the Best Model
# `best_model` is the same estimator object that Step 9 already fitted on the training split
# (the Pipeline there fits its final step in place), so it is scored directly here instead of
# being trained a second time on identical data.
best_predictions = best_model.predict(X_test)

# Scatter Plot of Predictions vs Actual Values; the dashed diagonal marks perfect predictions.
lowest, highest = y_test.min(), y_test.max()
plt.figure(figsize=(10, 6))
plt.scatter(y_test, best_predictions, alpha=0.7, label='Predictions')
plt.plot([lowest, highest], [lowest, highest], color='red', linestyle='--', label='Ideal Fit')
plt.xlabel('Actual Ratings')
plt.ylabel('Predicted Ratings')
plt.title(f'{best_model_name} Predictions vs Actuals')
plt.legend()
plt.show()

# ### Step 13: Save Predictions and Model
# Write actual vs. predicted ratings for the test split to CSV (row index omitted) and serialize
# the fitted estimator with joblib.
# NOTE(review): these are the same output paths the earlier cell writes, so running both cells
# overwrites the earlier cell's artifacts.
import joblib

output = y_test.to_frame(name='Actual').assign(Predicted=best_predictions)
output.to_csv('predictions.csv', index=False)

# NOTE(review): only the estimator is persisted; the label encoders and scaler fitted during
# preprocessing are not written, so the pickle alone cannot score raw, unprocessed rows.
joblib.dump(best_model, 'best_model.pkl')

# ### Summary
# This notebook demonstrates a full pipeline for predicting `audience_rating` using multiple models. Model performances are compared using metrics and visualizations, the best model is selected, and its predictions are validated. The predictions and trained model are saved for future use.
