<a href="https://colab.research.google.com/github/KarakaCharmi/INTERNORBIT/blob/main/PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install and import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
#Loading the Dataset
df = pd.read_csv("/content/IMDb Movies India.csv",encoding='ISO-8859-1')
print(df.head())
print(df.info())



In [44]:
df.describe()


Unnamed: 0,Year,Duration,Rating,Votes,Num_Actors,Director_Freq,Duration_x_Num_Actors,Votes_x_DirectorFreq,Actor1_Freq,Actor2_Freq,Actor3_Freq,Total_Actor_Freq
count,7812.0,7812.0,7812.0,7812.0,7812.0,7812.0,7812.0,7812.0,7812.0,7812.0,7812.0,7812.0
mean,1993.797363,132.699053,5.839196,4.654525,2.932156,0.001087,390.982335,0.005262,28.850614,12.392601,9.508449,50.751664
std,20.000142,22.935046,1.381314,2.203758,0.401004,0.001238,85.779098,0.007087,39.574168,15.470767,12.478677,54.43654
min,1917.0,21.0,1.1,1.791759,0.0,0.000128,0.0,0.000229,0.0,0.0,0.0,0.0
25%,1980.0,123.0,4.9,2.890372,3.0,0.000128,366.0,0.000779,2.0,1.0,1.0,9.0
50%,1997.0,134.0,6.0,4.060443,3.0,0.00064,402.0,0.002538,9.0,5.0,4.0,29.0
75%,2011.0,143.0,6.8,6.042633,3.0,0.001536,429.0,0.006933,39.0,18.0,13.0,77.0
max,2021.0,321.0,10.0,13.290278,3.0,0.005888,963.0,0.065503,140.0,79.0,74.0,279.0


In [None]:
#Count of missing values column wise
print(df.isnull().sum())

In [None]:
#Data Cleaning
# Drop rows with missing values in key columns
df.dropna(subset=['Genre', 'Director', 'Rating'], inplace=True)

# Clean 'Year' column
df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})').astype(float)

# Clean 'Duration' column
df['Duration'] = df['Duration'].astype(str).str.replace('min', '', regex=False).astype(float)
median_duration = df['Duration'].median()
df['Duration'] = df['Duration'].fillna(median_duration)
print(df['Duration'].isnull().sum())

# Simplify Genre
df['Main_Genre'] = df['Genre'].astype(str).apply(lambda x: x.split(',')[0].strip())
# Combine Actors into one column
df['All_Actors'] = df[['Actor 1', 'Actor 2', 'Actor 3']].fillna('').agg(','.join, axis=1)

# Count number of actors
df['Num_Actors'] = df['All_Actors'].apply(lambda x: len([a for a in x.split(',') if a.strip() != '']))

df['Votes'] = df['Votes'].astype(str).str.replace(',', '', regex=False)
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

# Step 2: Fill missing values with median
median_votes = df['Votes'].median()
df['Votes'] = df['Votes'].fillna(median_votes)
df['Votes'] = np.log1p(df['Votes'])





In [None]:
df.info()
print(df.isnull().sum())

In [None]:
# One-hot encode the main genre
genre_dummies = pd.get_dummies(df['Main_Genre'], prefix='Genre')
df = pd.concat([df, genre_dummies], axis=1)
print(df.head(10))

In [None]:
# Get all columns that start with 'Genre_'
genre_columns = [col for col in df.columns if col.startswith('Genre_')]

# Print the genre columns names
print("Genre columns:", genre_columns)

# Print first 10 rows of only these genre columns
print(df[genre_columns].head(10))


In [None]:
#Feature Engineering
# Calculate actor frequency for each actor column
actor1_freq = df['Actor 1'].value_counts()
actor2_freq = df['Actor 2'].value_counts()
actor3_freq = df['Actor 3'].value_counts()

# Map these frequencies back to the DataFrame, filling missing actors with 0 frequency
df['Actor1_Freq'] = df['Actor 1'].map(actor1_freq).fillna(0)
df['Actor2_Freq'] = df['Actor 2'].map(actor2_freq).fillna(0)
df['Actor3_Freq'] = df['Actor 3'].map(actor3_freq).fillna(0)

# Optionally, you can create a combined actor frequency by summing them
df['Total_Actor_Freq'] = df['Actor1_Freq'] + df['Actor2_Freq'] + df['Actor3_Freq']


director_freq = df['Director'].value_counts(normalize=True)
df['Director_Freq'] = df['Director'].map(director_freq)
print(df[['Genre','Director_Freq']])
df['Duration_x_Num_Actors'] = df['Duration'] * df['Num_Actors']
df['Votes_x_DirectorFreq'] = df['Votes'] * df['Director_Freq']

In [None]:
#Prepare Data for Modeling
features = ['Year', 'Duration', 'Num_Actors', 'Director_Freq','Votes','Duration_x_Num_Actors','Votes_x_DirectorFreq','Actor1_Freq','Actor2_Freq','Actor3_Freq'] + [col for col in df.columns if col.startswith('Genre_')]
X = df[features]
y = df['Rating']


In [None]:
#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
#Training the model

model = LinearRegression()  # Create the model (robot)
model.fit(X_train, y_train)  # Train model on training data


In [40]:
#Linear Regression
y_pred = model.predict(X_test)  # Make predictions on test features

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

r2 = r2_score(y_test, y_pred)  # Calculate how well predictions match true ratings

print(f"RMSE: {rmse:.3f}")
print(f"R^2 Score: {r2:.3f}")


RMSE: 1.236
R^2 Score: 0.205


In [41]:
#Random Forest
rf_model = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=42
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = mse_rf ** 0.5
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Random Forest RMSE: {rmse_rf:.3f}')
print(f'Random Forest R^2: {r2_rf:.3f}')


Random Forest RMSE: 1.085
Random Forest R^2: 0.388


In [42]:
from sklearn.neighbors import KNeighborsRegressor

knn_model = KNeighborsRegressor(n_neighbors=5)

knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn=mse_knn**0.5
r2_knn = r2_score(y_test, y_pred_knn)

print(f'KNN RMSE: {rmse_knn:.3f}')
print(f'KNN R^2: {r2_knn:.3f}')


KNN RMSE: 1.324
KNN R^2: 0.088


In [43]:
def evaluate_model(y_true, y_pred, name="Model"):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{name} RMSE: {rmse:.3f}")
    print(f"{name} R² Score: {r2:.3f}")
    return rmse, r2

evaluate_model(y_test, y_pred, "Linear Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")


Linear Regression RMSE: 1.236
Linear Regression R² Score: 0.205
Random Forest RMSE: 1.085
Random Forest R² Score: 0.388


(np.float64(1.0850179438171634), 0.38752061911158064)