In [None]:
import pandas as pd 
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

import ast

In [None]:
# Load the raw movie table from 'movies.csv' and print a summary of it.
movie_df = pd.read_csv(filepath_or_buffer='movies.csv')

movie_df.info()

In [None]:
def _first_genre(raw_genres):
    """Return the text before the first '-' for strings; pass other values through."""
    if not isinstance(raw_genres, str):
        return raw_genres
    return raw_genres.split('-')[0]


# Keep only the leading genre of every '-'-separated genre string.
movie_df['genres'] = movie_df['genres'].apply(_first_genre)

# Persist the simplified table (without the index column).
movie_df.to_csv(path_or_buf='modified_file.csv', index=False)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


def _as_genre_list(value):
    """Normalise one 'genres' cell to a list of genre names.

    Lists are returned unchanged so that re-running this cell does not wipe
    the column (previously any non-string value, including an already-split
    list, was replaced by an empty list). Strings are split on ', '; anything
    else becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return value.split(', ')
    return []


# Missing genres become '' and therefore the one-element list [''];
# '' is kept as its own class so the integer codes stay what they were.
movie_df['genres'] = movie_df['genres'].fillna('').apply(_as_genre_list)

# Fit the encoder on every genre name that occurs anywhere in the column.
all_genres = [genre for sublist in movie_df['genres'] for genre in sublist]
encoder = LabelEncoder()
encoder.fit(all_genres)

# Lookup table equivalent to encoder.transform: class name -> position in classes_.
# A single dict lookup per row replaces one transform() call per row.
genre_codes = {genre: code for code, genre in enumerate(encoder.classes_)}

# Encode the first (primary) genre of each row. The previous code passed the
# whole list wrapped in another list to transform(), which only worked for
# one-element lists and raised for rows with several genres.
movie_df['Genres_Encoded'] = movie_df['genres'].apply(
    lambda genres: genre_codes[genres[0]] if genres else None
)

In [None]:
# Preview the first five rows, including the new Genres_Encoded column.
movie_df.head(n=5)

In [None]:
# Distribution of the target variable. The x-axis previously read
# "Movie Ratings" although the plotted column is 'revenue'.
plt.hist(movie_df['revenue'], bins=10)
plt.xlabel("Revenue")
plt.ylabel("Number of movies")
plt.title("Distribution of movie revenue")
plt.show()

In [None]:
# Histogram grid for the frame: 50 bins per plot on a 20x15 figure.
movie_df.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Restrict to float64/int64 columns and compute their pairwise correlation.
numeric_df = movie_df.select_dtypes(include=['float64', 'int64'])
corr = numeric_df.corr(method='pearson')
print(corr)

In [None]:
# Annotated heatmap of the correlation matrix on an 8x8 figure.
plt.figure(figsize=(8, 8))
sns.heatmap(data=corr, annot=True)
plt.show()

In [None]:
# Count missing values per column.
movie_df.isna().sum()

In [None]:
import sklearn
from sklearn.model_selection import train_test_split

# Show every column available in the source table.
print(list(movie_df.columns))

# Candidate feature columns; 'revenue' is the prediction target.
X = movie_df[[
    'title',
    'genres',
    'popularity',
    'production_companies',
    'release_date',
    'budget',
    'vote_average',
    'runtime',
    'vote_count',
    'recommendations',
    'Genres_Encoded',
]]
y = movie_df['revenue']

# Confirm which columns ended up in the feature frame.
print(X.columns.tolist())

In [None]:
# Shuffled 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [None]:
# Text-like columns among the selected features.
categorical_columns = ['title', 'genres', 'production_companies', 'recommendations']

# Columns removed before modelling.
columns_to_drop = ['title', 'genres', 'production_companies', 'release_date', 'recommendations']


def _model_features(frame):
    """Return `frame` without the dropped columns and without object-dtype columns.

    errors='ignore' lets this cell be re-run after the columns are already
    gone; previously a second run raised KeyError on X.drop(...).
    """
    return (
        frame
        .drop(columns=columns_to_drop, errors='ignore')
        .select_dtypes(exclude=['object'])
    )


# The earlier fillna('unknown') loop wrote into columns that were discarded on
# the very next line, so it has been removed. All three frames now go through
# the same rule, so X (used for cross-validation) cannot end up with a
# different feature set than X_train / X_test.
X_train = _model_features(X_train)
X_test = _model_features(X_test)
X = _model_features(X)

In [None]:
# Preview the first five rows of the feature frame.
X.head(n=5)


In [None]:
# Needs the xgboost package (uncomment to install): !pip install xgboost
from xgboost import XGBRegressor

# Baseline regressor created with no explicit hyperparameters.
xgb_model = XGBRegressor()

In [None]:
# Train the baseline model on the training split.
xgb_model.fit(X=X_train, y=y_train)

In [None]:
# Baseline predictions for the held-out test split.
y_xgb_pred_test = xgb_model.predict(X=X_test)

In [None]:
# Actual target values next to the baseline predictions.
xgb_pred_test_df = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': y_xgb_pred_test,
    }
)

xgb_pred_test_df

In [None]:
# Line plot of the first 50 test rows: actual vs. predicted.
fig = plt.figure(figsize=(8, 8))

# Replace the index with 0..n-1 so the x-axis is the row position.
xgb_pred_test_df = xgb_pred_test_df.reset_index(drop=True)

plt.plot(xgb_pred_test_df.iloc[:50])
plt.legend(['Actual value', 'Predicted value'])
plt.show()

In [None]:
from sklearn.metrics import r2_score

# R^2 of the baseline predictions on the test split.
score = r2_score(y_test, y_xgb_pred_test)

# Scale to percent first and round afterwards: round(score, 2) * 100 could
# print float artefacts such as 56.99999999999999.
print("R^2 - {}%".format(round(score * 100, 2)))

In [None]:
# Score the baseline model across many folds with repeated k-fold cross-validation.
from sklearn.model_selection import RepeatedKFold, cross_val_score

# 10 folds, repeated 3 times, with a fixed seed.
cv = RepeatedKFold(n_repeats=3, n_splits=10, random_state=1)

scores = cross_val_score(
    xgb_model,
    X,
    y,
    cv=cv,
    scoring='r2',
    error_score='raise',
    n_jobs=-1,
    verbose=1,
)

# Mean R^2 over every fold of every repeat.
print(scores.mean())

In [None]:
# List the hyperparameters of the baseline model that are available for tuning.
xgb_model.get_params(deep=True)

In [None]:
# Second regressor with hand-picked hyperparameters.
# NOTE(review): scale_pos_weight is documented for class imbalance; confirm
# that setting it to 0 is intended for this regression objective.
xgb_model_2 = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=15000,
    learning_rate=0.001,
    max_depth=8,
    subsample=0.8,
    gamma=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=0,
    n_jobs=15,
    verbosity=1,
)

xgb_model_2.fit(X_train, y_train)

# Predictions of the second model on the test split.
y_xgb_2_pred_test = xgb_model_2.predict(X_test)

In [None]:
# Put the actual target values next to the second model's predictions.
xgb_2_pred_test_df = pd.DataFrame(
    data={
        'Actual': y_test,
        'Predicted': y_xgb_2_pred_test,
    }
)

xgb_2_pred_test_df

In [None]:
# Line plot of the first 50 test rows for the second model: actual vs. predicted.
fig = plt.figure(figsize=(8, 8))

# Replace the index with 0..n-1 so the x-axis is the row position.
xgb_2_pred_test_df = xgb_2_pred_test_df.reset_index(drop=True)

plt.plot(xgb_2_pred_test_df[:50])
plt.legend(['Actual value', 'Predicted value'])
# Render explicitly, as the baseline plot cell does; without this the cell's
# output is the Legend object's repr.
plt.show()
In [None]:
# Determine accuracy using R^2 on the test split.
r2_xgb_model_2_test = xgb_model_2.score(X_test, y_test)

# The stray second argument to format() was silently ignored; the intended
# rounding to two decimals is now actually applied.
print("R^2 Test: {}".format(round(r2_xgb_model_2_test, 2)))

In [None]:
import pandas as pd

# Predict the revenue of one hypothetical movie.
# The keys below are the feature columns used in this example, one value each.
movie_data = {
    'popularity': [6500.123],       # popularity score
    'budget': [120000000.0],        # budget in dollars
    'vote_average': [7.85],         # average user rating
    'runtime': [140.0],             # runtime in minutes
    'vote_count': [2500.0],         # number of votes
    'Genres_Encoded': [3],          # integer genre code
}

# Single-row frame holding the example movie.
movie_temp_df = pd.DataFrame(data=movie_data)

# NOTE(review): this uses the baseline xgb_model, not xgb_model_2 — confirm
# that is the intended model.
predicted_revenue = xgb_model.predict(movie_temp_df)

# Report the prediction as a dollar amount.
print(f"Predicted revenue for the movie: ${predicted_revenue[0]:,.2f}")
