In [3]:
# Recreate the Jupyter notebook with the same process as before

from nbformat import v4 as nbf

# Create a new Jupyter Notebook
nb = nbf.new_notebook()

# List of cells to add to the notebook
cells = []

# 1. Title and Introduction
cells.append(nbf.new_markdown_cell("# 🎉 Building and Validating a Predictive Model for Audience Ratings"))
cells.append(nbf.new_markdown_cell("""
## **Project Overview**
**Objective**:  
The goal of this project is to predict the **'audience_rating'** of movies using various machine learning models. The process involves handling missing data, training models, comparing performance, and selecting the best-performing model.

**Steps to Follow**:
1. **Data Cleaning**: Handle missing values in 'audience_rating'.  
2. **Data Preparation**: Feature selection, encoding, and splitting data into training and testing sets.  
3. **Model Training**: Train multiple machine learning models.  
4. **Prediction & Evaluation**: Predict and evaluate the model on unseen test data.  
5. **Visualization**: Show the performance metrics before and after predictions.  
6. **Saving & Reuse**: Save the best model and load it for future use.  
"""))

# 2. Import Libraries
cells.append(nbf.new_code_cell("""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import time
"""))

# 3. Load Data
cells.append(nbf.new_code_cell("""
# Load the data
df = pd.read_excel('/mnt/data/Rotten_Tomatoes_Movies3.xlsx')  # Update with the correct path if needed
df.head()
"""))

# 4. Data Cleaning
cells.append(nbf.new_markdown_cell("## **Data Cleaning**"))
cells.append(nbf.new_code_cell("""
# Check for NaN values
print("Number of NaN values in 'audience_rating':", df['audience_rating'].isnull().sum())

# Option 1: Replace NaN with the mean value
df_mean_imputed = df.copy()
df_mean_imputed['audience_rating'].fillna(df_mean_imputed['audience_rating'].mean(), inplace=True)

# Option 2: Replace NaN with 'tomatometer_rating'
df_tomato_imputed = df.copy()
df_tomato_imputed['audience_rating'].fillna(df_tomato_imputed['tomatometer_rating'], inplace=True)
"""))

# 5. Data Preparation
cells.append(nbf.new_markdown_cell("## **Data Preparation**"))
cells.append(nbf.new_code_cell("""
# Select features and target
features = ['tomatometer_rating', 'genre', 'runtime', 'release_year', 'director']  # Example features
target = 'audience_rating'

# One-hot encode categorical features
df_encoded = pd.get_dummies(df, columns=['genre', 'director'])

# Split data into train and test sets
X = df_encoded.drop(columns=[target])
y = df_encoded[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
"""))

# 6. Model Training
cells.append(nbf.new_markdown_cell("## **Model Training**"))
cells.append(nbf.new_code_cell("""
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(n_estimators=100)
}

results = []

for model_name, model in models.items():
    print(f"Training {model_name}...")
    start_time = time.time()
    
    # Train on training data
    model.fit(X_train, y_train)
    
    training_time = time.time() - start_time
    
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results.append({
        'Model': model_name,
        'MSE': mse,
        'MAE': mae,
        'R2': r2,
        'Training Time': training_time
    })
"""))

# 7. Evaluation
cells.append(nbf.new_markdown_cell("## **Evaluation & Metrics**"))
cells.append(nbf.new_code_cell("""
results_df = pd.DataFrame(results)

# Plot the metrics for all models
results_df.plot(kind='bar', x='Model', y=['MSE', 'MAE', 'R2'], figsize=(12, 6))
plt.title('Performance of Models on Test Data')
plt.show()
"""))

# 8. Prediction
cells.append(nbf.new_markdown_cell("## **Prediction After Training**"))
cells.append(nbf.new_code_cell("""
best_model = RandomForestRegressor(n_estimators=100)
best_model.fit(X_train, y_train)

# Predict on the test set
y_test_pred = best_model.predict(X_test)

plt.figure(figsize=(10, 6))
plt.scatter(range(len(y_test)), y_test, color='blue', label='Actual')
plt.scatter(range(len(y_test_pred)), y_test_pred, color='red', label='Predicted')
plt.title('Actual vs. Predicted Audience Ratings')
plt.xlabel('Sample Index')
plt.ylabel('Audience Rating')
plt.legend()
plt.show()
"""))

# 9. Save & Reload Model
cells.append(nbf.new_markdown_cell("## **Save & Reload Best Model**"))
cells.append(nbf.new_code_cell("""
joblib.dump(best_model, 'best_audience_rating_model.pkl')

# Reload the saved model
loaded_model = joblib.load('best_audience_rating_model.pkl')

sample_data = X_test.iloc[0].values.reshape(1, -1)  # Single sample for prediction
predicted_rating = loaded_model.predict(sample_data)
print(f"Predicted Audience Rating: {predicted_rating[0]:.2f}")
"""))

# 10. Conclusion
cells.append(nbf.new_markdown_cell("## **Conclusion**"))
cells.append(nbf.new_markdown_cell("""
- We handled missing data using two imputation strategies.  
- We trained multiple models and selected the best-performing one.  
- We saved and reloaded the best model for future predictions.  
- We visualized predictions and performance metrics to understand the quality of the model.  
"""))

# Create the notebook
nb['cells'] = cells

# Save the notebook as a .ipynb file
notebook_path = 'audience_rating_prediction.ipynb'
with open(notebook_path, 'w', encoding='utf-8') as f:
    nbf.write(nb, f)

notebook_path


AttributeError: module 'nbformat.v4' has no attribute 'write'

In [11]:
pip install nbformat nbclient


