In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load dataset
url = R"C:\Users\madhe\Downloads\IMDb Movies India.csv.zip"
df = pd.read_csv(url, encoding='ISO-8859-1')  # Use appropriate encoding if needed

# Check the column names in the dataset
print("Columns in the dataset:", df.columns)

# Rename columns to standardize (optional) - Convert column names to lowercase and strip spaces
df.columns = df.columns.str.strip().str.lower()

# Check if 'actors' or similar column exists and adjust accordingly
if 'actors' in df.columns:
    actor_col = 'actors'
elif 'cast' in df.columns:
    actor_col = 'cast'
else:
    print("The dataset does not contain an 'actors' or 'cast' column.")
    actor_col = None

# Check for missing values
print("Missing values in each column:")
print(df.isnull().sum())

# Drop rows with missing ratings (this is the target variable)
df = df.dropna(subset=['rating'])  # Ensure 'rating' exists; adjust if needed

# Fill missing values in categorical columns (e.g., genre, director, actors/cast)
df['genre'] = df['genre'].fillna('Unknown')
df['director'] = df['director'].fillna('Unknown')
if actor_col:
    df[actor_col] = df[actor_col].fillna('Unknown')

# Encode categorical variables (genre, director, actors/cast) using LabelEncoder
label_encoder = LabelEncoder()

df['genre_encoded'] = label_encoder.fit_transform(df['genre'])
df['director_encoded'] = label_encoder.fit_transform(df['director'])
if actor_col:
    df['actors_encoded'] = label_encoder.fit_transform(df[actor_col])

# Select relevant features for the model
features = ['genre_encoded', 'director_encoded']
if actor_col:
    features.append('actors_encoded')
    
X = df[features]
y = df['rating']  # The target variable is 'rating'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a regression model (Linear Regression)
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

# Optional: Show actual vs predicted ratings
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(comparison_df.head())


Columns in the dataset: Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')
The dataset does not contain an 'actors' or 'cast' column.
Missing values in each column:
name           0
year         528
duration    8269
genre       1877
rating      7590
votes       7589
director     525
actor 1     1617
actor 2     2384
actor 3     3144
dtype: int64
Mean Squared Error: 1.8157420445984143
Root Mean Squared Error: 1.347494728968694
       Actual  Predicted
9456      3.3   6.096065
14816     5.3   5.594187
3213      5.7   5.603116
3778      7.2   5.921722
5775      3.5   6.028224
