In [1]:
import numpy as np
import pandas as pd
import warnings            
warnings.filterwarnings("ignore")

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Load the dataset
df = pd.read_csv("imdb_movie_dataset.csv")

In [4]:
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [5]:
df.shape

(1000, 12)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


In [7]:
df.isnull().sum()

Rank                    0
Title                   0
Genre                   0
Description             0
Director                0
Actors                  0
Year                    0
Runtime (Minutes)       0
Rating                  0
Votes                   0
Revenue (Millions)    128
Metascore              64
dtype: int64

In [8]:
# Define the imputer to fill missing values with the mean
imputer = SimpleImputer(strategy='mean')

In [9]:
# Select only the columns that need imputation
columns_to_impute = ['Revenue (Millions)', 'Metascore']

In [10]:
# Apply the imputer to those columns
df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])

In [11]:
df.isnull().sum()

Rank                  0
Title                 0
Genre                 0
Description           0
Director              0
Actors                0
Year                  0
Runtime (Minutes)     0
Rating                0
Votes                 0
Revenue (Millions)    0
Metascore             0
dtype: int64

In [12]:
df.duplicated().sum()

0

In [None]:
# Correlation Heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(df[['Runtime (Minutes)', 'Rating', 'Votes', 'Metascore', 'Year', 'Revenue (Millions)']].corr(), 
            annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap of Numeric Features")
plt.tight_layout()
plt.show()

In [None]:
# Distribution of Revenue
plt.figure(figsize=(8, 5))
sns.histplot(df['Revenue (Millions)'], bins=5, kde=True, color='green')
plt.title("Distribution of Revenue (Millions)")
plt.xlabel("Revenue (Millions)")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [16]:
# Feature Engineering: Extract main genre
df['MainGenre'] = df['Genre'].apply(lambda x: x.split(',')[0])

In [17]:
# Feature Engineering: Top 10 directors
top_directors = df['Director'].value_counts().nlargest(10).index
df['TopDirector'] = df['Director'].apply(lambda x: x if x in top_directors else 'Other')

In [18]:
# Feature Engineering: Top 10 actors (first actor in list)
df['LeadActor'] = df['Actors'].apply(lambda x: x.split(',')[0])
top_actors = df['LeadActor'].value_counts().nlargest(10).index
df['TopActor'] = df['LeadActor'].apply(lambda x: x if x in top_actors else 'Other')

In [19]:
# Select features
features = ['Runtime (Minutes)', 'Rating', 'Votes', 'Metascore', 'Year',
            'MainGenre', 'TopDirector', 'TopActor']

In [20]:
X = df[features]
y = df['Revenue (Millions)']

In [21]:
# Categorical & numerical features
categorical = ['MainGenre', 'TopDirector', 'TopActor']
numerical = ['Runtime (Minutes)', 'Rating', 'Votes', 'Metascore', 'Year']

In [22]:
# Preprocessing and modeling pipeline
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
], remainder='passthrough')

In [23]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear Regression

In [24]:
model_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

In [25]:
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

In [26]:
print("MAE:", mean_absolute_error(y_test, y_pred_lr))

MAE: 51.74939111799693


In [27]:
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_lr)))

RMSE: 69.16814483931276


In [28]:
print("R2 Score:", r2_score(y_test, y_pred_lr))

R2 Score: 0.48849012912743306


## Decision Tree Regressor

In [1]:
model_dt = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(random_state=42))
])

NameError: name 'Pipeline' is not defined

In [None]:
model_dt.fit(X_train, y_train)