<a href="https://colab.research.google.com/github/KarakaCharmi/INTERNORBIT/blob/main/PREDICTION.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
# Step 1: Install and import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [24]:
#Loading the Dataset
df = pd.read_csv("/content/IMDb Movies India.csv",encoding='ISO-8859-1')
print(df.head())
print(df.info())



                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  (2019)  109 min            Drama   
2                         #Homecoming  (2021)   90 min   Drama, Musical   
3                             #Yaaram  (2019)  110 min  Comedy, Romance   
4                   ...And Once Again  (2010)  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

           Actor 3  
0  Rajendra Bhatia  
1    Arvind Jangid  
2       Roy Angana  
3  Siddhant Kapoor  
4    

In [27]:
df.describe()


Unnamed: 0,Rating
count,7919.0
mean,5.841621
std,1.381777
min,1.1
25%,4.9
50%,6.0
75%,6.8
max,10.0


In [28]:
#Count of missing values column wise
print(df.isnull().sum())

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64


In [30]:
#Data Cleaning
# Drop rows with missing values in key columns
df.dropna(subset=['Genre', 'Director', 'Rating'], inplace=True)

# Clean 'Year' column
df['Year'] = df['Year'].astype(str).str.extract(r'(\d{4})').astype(float)

# Clean 'Duration' column
df['Duration'] = df['Duration'].astype(str).str.replace('min', '', regex=False).astype(float)
median_duration = df['Duration'].median()
df['Duration'] = df['Duration'].fillna(median_duration)
print(df['Duration'].isnull().sum())

# Simplify Genre
df['Main_Genre'] = df['Genre'].astype(str).apply(lambda x: x.split(',')[0].strip())
# Combine Actors into one column
df['All_Actors'] = df[['Actor 1', 'Actor 2', 'Actor 3']].fillna('').agg(','.join, axis=1)

# Count number of actors
df['Num_Actors'] = df['All_Actors'].apply(lambda x: len([a for a in x.split(',') if a.strip() != '']))

df['Votes'] = df['Votes'].astype(str).str.replace(',', '', regex=False)
df['Votes'] = pd.to_numeric(df['Votes'], errors='coerce')

# Step 2: Fill missing values with median
median_votes = df['Votes'].median()
df['Votes'] = df['Votes'].fillna(median_votes)
df['Votes'] = np.log1p(df['Votes'])





0


In [31]:
df.info()
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Index: 7812 entries, 1 to 15508
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        7812 non-null   object 
 1   Year        7812 non-null   float64
 2   Duration    7812 non-null   float64
 3   Genre       7812 non-null   object 
 4   Rating      7812 non-null   float64
 5   Votes       7812 non-null   float64
 6   Director    7812 non-null   object 
 7   Actor 1     7707 non-null   object 
 8   Actor 2     7641 non-null   object 
 9   Actor 3     7558 non-null   object 
 10  Main_Genre  7812 non-null   object 
 11  All_Actors  7812 non-null   object 
 12  Num_Actors  7812 non-null   int64  
dtypes: float64(4), int64(1), object(8)
memory usage: 854.4+ KB
Name            0
Year            0
Duration        0
Genre           0
Rating          0
Votes           0
Director        0
Actor 1       105
Actor 2       171
Actor 3       254
Main_Genre      0
All_Actors      0
Nu

In [32]:
# One-hot encode the main genre
genre_dummies = pd.get_dummies(df['Main_Genre'], prefix='Genre')
df = pd.concat([df, genre_dummies], axis=1)
print(df.head(10))

                                  Name    Year  Duration  \
1   #Gadhvi (He thought he was Gandhi)  2019.0     109.0   
3                              #Yaaram  2019.0     110.0   
5                 ...Aur Pyaar Ho Gaya  1997.0     147.0   
6                            ...Yahaan  2005.0     142.0   
8                   ?: A Question Mark  2012.0      82.0   
9                             @Andheri  2014.0     116.0   
10           1:1.6 An Ode to Lost Love  2004.0      96.0   
11                1:13:7 Ek Tera Saath  2016.0     120.0   
12                            100 Days  1991.0     161.0   
13                           100% Love  2012.0     166.0   

                        Genre  Rating     Votes           Director  \
1                       Drama     7.0  2.197225      Gaurav Bakshi   
3             Comedy, Romance     4.4  3.583519         Ovais Khan   
5      Comedy, Drama, Musical     4.7  6.719013       Rahul Rawail   
6         Drama, Romance, War     7.4  6.991177     Shoojit

In [33]:
# Get all columns that start with 'Genre_'
genre_columns = [col for col in df.columns if col.startswith('Genre_')]

# Print the genre columns names
print("Genre columns:", genre_columns)

# Print first 10 rows of only these genre columns
print(df[genre_columns].head(10))


Genre columns: ['Genre_Action', 'Genre_Adventure', 'Genre_Animation', 'Genre_Biography', 'Genre_Comedy', 'Genre_Crime', 'Genre_Documentary', 'Genre_Drama', 'Genre_Family', 'Genre_Fantasy', 'Genre_History', 'Genre_Horror', 'Genre_Music', 'Genre_Musical', 'Genre_Mystery', 'Genre_Romance', 'Genre_Sci-Fi', 'Genre_Sport', 'Genre_Thriller', 'Genre_War']
    Genre_Action  Genre_Adventure  Genre_Animation  Genre_Biography  \
1          False            False            False            False   
3          False            False            False            False   
5          False            False            False            False   
6          False            False            False            False   
8          False            False            False            False   
9           True            False            False            False   
10         False            False            False            False   
11         False            False            False            False   
12         

In [35]:
#Feature Engineering
# Calculate actor frequency for each actor column
actor1_freq = df['Actor 1'].value_counts()
actor2_freq = df['Actor 2'].value_counts()
actor3_freq = df['Actor 3'].value_counts()

# Map these frequencies back to the DataFrame, filling missing actors with 0 frequency
df['Actor1_Freq'] = df['Actor 1'].map(actor1_freq).fillna(0)
df['Actor2_Freq'] = df['Actor 2'].map(actor2_freq).fillna(0)
df['Actor3_Freq'] = df['Actor 3'].map(actor3_freq).fillna(0)

# Optionally, you can create a combined actor frequency by summing them
df['Total_Actor_Freq'] = df['Actor1_Freq'] + df['Actor2_Freq'] + df['Actor3_Freq']


director_freq = df['Director'].value_counts(normalize=True)
df['Director_Freq'] = df['Director'].map(director_freq)
print(df[['Genre','Director_Freq']])
df['Duration_x_Num_Actors'] = df['Duration'] * df['Num_Actors']
df['Votes_x_DirectorFreq'] = df['Votes'] * df['Director_Freq']

                           Genre  Director_Freq
1                          Drama       0.000128
3                Comedy, Romance       0.000128
5         Comedy, Drama, Musical       0.002176
6            Drama, Romance, War       0.000896
8      Horror, Mystery, Thriller       0.000128
...                          ...            ...
15501       Action, Crime, Drama       0.000896
15503       Action, Crime, Drama       0.001792
15504                     Action       0.000512
15505              Action, Drama       0.001024
15508              Action, Drama       0.002304

[7812 rows x 2 columns]


In [15]:
#Prepare Data for Modeling
features = ['Year', 'Duration', 'Num_Actors', 'Director_Freq','Votes','Duration_x_Num_Actors','Votes_x_DirectorFreq','Actor1_Freq','Actor2_Freq','Actor3_Freq'] + [col for col in df.columns if col.startswith('Genre_')]
X = df[features]
y = df['Rating']


In [36]:
#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
#Training the model

model = LinearRegression()  # Create the model (robot)
model.fit(X_train, y_train)  # Train model on training data


In [18]:
#Linear Regression
y_pred = model.predict(X_test)  # Make predictions on test features

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

r2 = r2_score(y_test, y_pred)  # Calculate how well predictions match true ratings

print(f"RMSE: {rmse:.3f}")
print(f"R^2 Score: {r2:.3f}")


RMSE: 1.236
R^2 Score: 0.205


In [19]:
#Random Forest
rf_model = RandomForestRegressor(
    n_estimators=100, max_depth=10, random_state=42
)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = mse_rf ** 0.5
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Random Forest RMSE: {rmse_rf:.3f}')
print(f'Random Forest R^2: {r2_rf:.3f}')


Random Forest RMSE: 1.085
Random Forest R^2: 0.388


In [20]:
from sklearn.neighbors import KNeighborsRegressor

knn_model = KNeighborsRegressor(n_neighbors=5)

knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

mse_knn = mean_squared_error(y_test, y_pred_knn)
rmse_knn=mse_knn**0.5
r2_knn = r2_score(y_test, y_pred_knn)

print(f'KNN RMSE: {rmse_knn:.3f}')
print(f'KNN R^2: {r2_knn:.3f}')


KNN RMSE: 1.324
KNN R^2: 0.088


In [38]:
def evaluate_model(y_true, y_pred, name="Model"):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    print(f"{name} RMSE: {rmse:.3f}")
    print(f"{name} R² Score: {r2:.3f}")
    return rmse, r2

evaluate_model(y_test, y_pred, "Linear Regression")
evaluate_model(y_test, y_pred_rf, "Random Forest")


Linear Regression RMSE: 1.236
Linear Regression R² Score: 0.205
Random Forest RMSE: 1.085
Random Forest R² Score: 0.388


(np.float64(1.0850179438171634), 0.38752061911158064)