# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
# Inference cell: load the champion model, rebuild the engineered features,
# predict revenue and write the predictions next to the actual revenue.

# Single place to change when the project is moved to another machine.
PROJECT_DIR = "C:/Users/Nandan Hegde/OneDrive/Documents/GitHub/MSU_CMSE_830_Final_Semester_project/Movie_analysis_dashboard"

# Columns fed to the pipeline; defined once so the cleaning steps and the
# feature selection below cannot drift apart.
features = ['log_budget', 'avg_rating', 'rating_count', 'ROI']
categorical_features = ['genres_x']

# Load the champion model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model_path = f"{PROJECT_DIR}/Model_outputs/champion_model.pkl"
pipeline = joblib.load(model_path)
print(f"Champion model loaded from: {model_path}")

# Load the cleaned dataset for inference
data_path = f"{PROJECT_DIR}/Interim_Data/Final_Cleaned_Data.pkl"
data = pd.read_pickle(data_path)

# Feature Engineering: Add profit margin and ROI (same as training)
# NOTE(review): ROI is derived from 'revenue', and the predictions below are
# described as log-transformed revenue. Together with log_budget this lets the
# model reconstruct revenue almost exactly -- confirm this leakage is intended.
data['profit_margin'] = (data['revenue'] - data['budget']) / data['revenue']
data['ROI'] = (data['revenue'] - data['budget']) / data['budget']
data['log_budget'] = np.log1p(data['budget'])

# Coerce the numeric features; anything non-numeric becomes NaN
for col in features:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Divisions by zero above produce +/-inf: turn them into NaN, then drop every
# row that is missing one of the numeric features. Rebinding `data` (instead
# of inplace=True) leaves it in the same final state.
data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna(subset=features)

# Select features for prediction. The explicit .copy() makes X_inference an
# independent frame, so the column assignments below no longer raise
# SettingWithCopyWarning.
X_inference = data[features + categorical_features].copy()

# Missing categorical values are replaced by a placeholder category
X_inference[categorical_features] = X_inference[categorical_features].fillna("Unknown")

# Make predictions (X_inference holds only the model inputs at this point)
log_predictions = pipeline.predict(X_inference)

# Convert predictions back to the original scale (exponentiate log-transformed revenue)
predicted_revenue = np.expm1(log_predictions)

# Add predicted revenue to inference DataFrame
X_inference['predicted_revenue'] = predicted_revenue

# Add the actual revenue column for comparison (aligned on the shared index)
X_inference['actual_revenue'] = data['revenue']

# Save the inference results to a CSV file
output_path = f"{PROJECT_DIR}/Predictions/inference_results.csv"
X_inference.to_csv(output_path, index=False)
print(f"Inference results saved to: {output_path}")

# Preview the first few rows of the inference results
print(X_inference.head())

Champion model loaded from: C:/Users/Nandan Hegde/OneDrive/Documents/GitHub/MSU_CMSE_830_Final_Semester_project/Movie_analysis_dashboard/Model_outputs/champion_model.pkl


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_inference[categorical_features] = X_inference[categorical_features].fillna("Unknown")


Inference results saved to: C:/Users/Nandan Hegde/OneDrive/Documents/GitHub/MSU_CMSE_830_Final_Semester_project/Movie_analysis_dashboard/Predictions/inference_results.csv
   log_budget  avg_rating  rating_count        ROI  \
0   17.216708    3.920930         215.0  11.451801   
1   17.417237    3.431818         110.0   1.956026   
2   17.417237    3.259615          52.0   1.956026   
3   17.417237    2.357143           7.0   1.956026   
4   17.417237    3.071429          49.0   1.956026   

                                      genres_x  predicted_revenue  \
0  Adventure|Animation|Children|Comedy|Fantasy       3.667452e+08   
1                   Adventure|Children|Fantasy       1.083722e+08   
2                               Comedy|Romance       1.083722e+08   
3                         Comedy|Drama|Romance       1.083722e+08   
4                                       Comedy       1.083722e+08   

   actual_revenue  
0    3.735540e+08  
1    1.083722e+08  
2    1.083722e+08  
3    1.08

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_inference['predicted_revenue'] = predicted_revenue
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_inference['actual_revenue'] = data['revenue']
