In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

In [None]:
df = pd.read_csv("IMDb Movies India.csv", encoding='latin-1')
df

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali
...,...,...,...,...,...,...,...,...,...,...
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,


In [None]:
def clean_year(year):
    if pd.notna(year):
        return int(''.join(filter(str.isdigit, str(year))))
    return np.nan

df['Year'] = df['Year'].apply(clean_year)

def clean_duration(duration):
    if pd.notna(duration):
        return int(''.join(filter(str.isdigit, str(duration).split()[0])))
    return np.nan

df['Duration'] = df['Duration'].apply(clean_duration)

In [None]:
num_cols = ['Year', 'Duration', 'Votes']
cat_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']

In [None]:
def clean_votes(votes):
    if pd.notna(votes):
        return int(''.join(filter(str.isdigit, str(votes))))
    return np.nan

df['Votes'] = df['Votes'].apply(clean_votes)

num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

In [None]:
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])


In [None]:
df['Actors'] = df[['Actor 1', 'Actor 2', 'Actor 3']].apply(lambda x: ' '.join(x.dropna()), axis=1)
df = df.drop(columns=['Actor 1', 'Actor 2', 'Actor 3'])

In [None]:
df = pd.get_dummies(df, columns=['Genre', 'Director', 'Actors'])

In [None]:
df['Rating'] = cat_imputer.fit_transform(df[['Rating']])

In [None]:
X = df.drop(columns=['Rating']).apply(pd.to_numeric, errors='coerce').fillna(0)
y = df['Rating'].astype(float)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
import warnings
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

# Random Forest

In [None]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [None]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2_score = r2_score(y_test, y_pred)

print(f'Mean Absolute Error (MAE): {mae:.2f}')
print(f'Mean Squared Error (MSE): {mse:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
print(f'R-squared (R2): {r2_score:.2f}')

In [None]:
new_data = pd.DataFrame({
    'Year': [2022],
    'Duration': [120],
    'Votes': [1000],
    'Genre_Drama': [1],
    'Genre_Action': [0],
    'Director_SomeDirector': [1],
    'Actors_Actor1': [1],
    # Include all other necessary dummy variables here
})

# Ensure all columns match the training data
for col in X.columns:
    if col not in new_data.columns:
        new_data[col] = 0

new_data = new_data.reindex(columns=X_train.columns, fill_value=0)

# Predict the rating for the new data
new_prediction = model.predict(new_data)
print(f'Predicted Rating: {new_prediction[0]}')

Predicted Rating: 7.064000000000004


# Gradient boosting regressor
accuracy was low for random forest so we use gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
model2 = GradientBoostingRegressor(n_estimators=100, random_state=42)
model2.fit(X_train, y_train)

# Make predictions
y_pred2 = model2.predict(X_test)

In [None]:
mae2 = mean_absolute_error(y_test, y_pred2)
mse2 = mean_squared_error(y_test, y_pred2)
rmse2 = np.sqrt(mse2)
r2_2 = r2_score(y_test, y_pred2)

print(f'Mean Absolute Error (MAE): {mae2:.2f}')
print(f'Mean Squared Error (MSE): {mse2:.2f}')
print(f'Root Mean Squared Error (RMSE): {rmse2:.2f}')
print(f'R^2 Score: {r2_2:.2f}')

Mean Absolute Error (MAE): 0.50
Mean Squared Error (MSE): 0.70
Root Mean Squared Error (RMSE): 0.84
R^2 Score: 0.30


In [None]:
new_data2 = pd.DataFrame({
    'Year': [2022],
    'Duration': [120],
    'Votes': [1000],
    'Genre_Drama': [1],
    'Genre_Action': [0],
    'Director_SomeDirector': [1],
    'Actors_Actor1': [1],
    # Include all other necessary dummy variables here
})

# Ensure all columns match the training data
for col in X.columns:
    if col not in new_data2.columns:
        new_data2[col] = 0

new_data2 = new_data2.reindex(columns=X_train.columns, fill_value=0)

# Predict the rating for the new data
new_prediction2 = model2.predict(new_data2)
print(f'Predicted Rating: {new_prediction2[0]}')

Predicted Rating: 6.035704248977211


this also have low accuracy

# XGBoost

In [None]:
import xgboost as xgb

In [None]:
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

In [None]:
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f'XGBoost Mean Absolute Error (MAE): {mae_xgb:.2f}')
print(f'XGBoost Mean Squared Error (MSE): {mse_xgb:.2f}')
print(f'XGBoost Root Mean Squared Error (RMSE): {rmse_xgb:.2f}')
print(f'XGBoost R^2 Score: {r2_xgb:.2f}')


XGBoost Mean Absolute Error (MAE): 0.44
XGBoost Mean Squared Error (MSE): 0.62
XGBoost Root Mean Squared Error (RMSE): 0.79
XGBoost R^2 Score: 0.38


In [None]:
new_data4 = pd.DataFrame({
    'Year': [2022],
    'Duration': [120],
    'Votes': [1000],
    'Genre_Drama': [1],
    'Genre_Action': [0],
    'Director_SomeDirector': [1],
    'Actors_Actor1': [1],
    # Include all other necessary dummy variables here
})

# Ensure all columns match the training data
for col in X.columns:
    if col not in new_data4.columns:
        new_data4[col] = 0

new_data4 = new_data4.reindex(columns=X_train.columns, fill_value=0)

# Predict the rating for the new data
new_prediction4 = xgb_model.predict(new_data4)
print(f'Predicted Rating: {new_prediction4[0]}')

Predicted Rating: 6.584532260894775
