In [2]:
import pandas as pd

# Load the dataset
file_path = r"C:\Users\User\Downloads\Anaconda\IMDb Movies India.csv"
df = pd.read_csv(file_path, encoding='latin1')

# Drop rows where 'Name' is missing (crucial column).
# The explicit .copy() makes df an independent frame, so the column
# assignments below never write into a slice of the raw frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['Name'].notna()].copy()

# Clean and convert 'Year': pull the 4-digit year out of strings like '(2020)'.
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
df['Year'] = pd.to_numeric(
    df['Year'].str.extract(r'(\d{4})', expand=False),
    errors='coerce',
)

# Clean and convert 'Duration' to numeric minutes: keep the first run of digits.
# Rows without a duration stay NaN, so the column is float rather than int.
df['Duration'] = pd.to_numeric(
    df['Duration'].str.extract(r'(\d+)', expand=False),
    errors='coerce',
)

# Clean 'Votes': remove thousands separators and convert to numeric.
# Anything that still is not a plain number is coerced to NaN.
df['Votes'] = pd.to_numeric(
    df['Votes'].str.replace(',', '', regex=False),
    errors='coerce',
)

# Drop duplicates (if any); reassign instead of mutating in place
df = df.drop_duplicates()

# Optionally fill missing values for categorical fields with "Unknown"
for col in ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']:
    df[col] = df[col].fillna("Unknown")

# Show summary of cleaned data
print("Cleaned Dataset Overview:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the summary.
df.info()
print(df.head())


Cleaned Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
Index: 15503 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15503 non-null  object 
 1   Year      14976 non-null  float64
 2   Duration  7239 non-null   float64
 3   Genre     15503 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7919 non-null   float64
 6   Director  15503 non-null  object 
 7   Actor 1   15503 non-null  object 
 8   Actor 2   15503 non-null  object 
 9   Actor 3   15503 non-null  object 
dtypes: float64(4), object(6)
memory usage: 1.3+ MB
None
                                 Name    Year  Duration            Genre  \
0                                         NaN       NaN            Drama   
1  #Gadhvi (He thought he was Gandhi)  2019.0     109.0            Drama   
2                         #Homecoming  2021.0      90.0   Drama, Musical   
3                             #Yaaram  20

In [7]:
# Diagnostics for the 'Duration' column of df_model.
# df_model is not created in this cell; it must already exist in the kernel.
duration_col = df_model['Duration']
missing_total = duration_col.isna().sum()
present_mask_counts = duration_col.notna().value_counts()  # True = has a value, False = missing
first_present_values = duration_col.dropna().head()

print("Missing in Duration (entire data):", missing_total, "out of", len(df_model))
print("Duration - non-null count:\n", present_mask_counts)
print("Sample Duration values:\n", first_present_values)


Missing in Duration (entire data): 7919 out of 7919
Duration - non-null count:
 Duration
False    7919
Name: count, dtype: int64
Sample Duration values:
 Series([], Name: Duration, dtype: float64)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Read the raw CSV (latin1-encoded) from its local path
file_path = r"C:\Users\User\Downloads\Anaconda\IMDb Movies India.csv"
df = pd.read_csv(file_path, encoding='latin1')

# Keep only the rows that have a 'Rating' (the prediction target).
# .copy() gives an independent frame for the column edits that follow.
df_model = df[df['Rating'].notna()].copy()

# Function to limit categories to top N frequent
def limit_categories(series, top_n=50):
    """Collapse infrequent values of ``series`` into the single label 'Other'.

    Parameters
    ----------
    series : pandas.Series
        Categorical values; may contain missing entries.
    top_n : int, default 50
        Number of most frequent distinct values to keep unchanged.

    Returns
    -------
    pandas.Series
        Same index as ``series``. Values among the ``top_n`` most frequent
        are unchanged, missing values stay missing, and every other value
        is replaced by 'Other'.
    """
    top = series.value_counts().nlargest(top_n).index
    # value_counts() skips missing values, so they are never part of `top`.
    # Keep them missing instead of relabelling them 'Other', so "rare" and
    # "missing" are not merged and a later imputation step can still act on them.
    keep = series.isin(top) | series.isna()
    return series.where(keep, other='Other')

# Column roles for the model — NOTE: Duration is not among the features
categorical_cols = ['Genre', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']
numerical_cols = ['Votes']

# Cap the director/actor columns (every categorical column except 'Genre')
# at their 50 most frequent values
for col in categorical_cols[1:]:
    df_model.loc[:, col] = limit_categories(df_model[col], top_n=50)

# Movies without a genre get an explicit placeholder
df_model['Genre'] = df_model['Genre'].fillna("Unknown")

# 'Votes': strip thousands separators, then parse; unparseable entries become NaN
votes_text = df_model['Votes'].astype(str).str.replace(',', '', regex=False)
df_model['Votes'] = pd.to_numeric(votes_text, errors='coerce')

# Feature matrix and target
X = df_model[categorical_cols + numerical_cols]
y = df_model['Rating']

# Categorical branch: fill gaps with a constant label, then one-hot encode.
# handle_unknown='ignore' lets prediction proceed on categories unseen during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: fill gaps with the median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numeric_transformer, numerical_cols),
    ]
)

# Preprocessing + random forest chained into a single estimator
forest = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

# Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training split
pipeline.fit(X_train, y_train)

# Score the held-out split
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("✅ Final Model Performance (without Duration):")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")


✅ Final Model Performance (without Duration):
RMSE: 1.26
R² Score: 0.15
