In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the TMDB movies CSV into a DataFrame and display it.
raw_csv_path = 'data/tmdb_movies_data.csv'
df = pd.read_csv(raw_csv_path)
df

In [None]:
# Lift the column display limit so wide frames are shown in full,
# then preview the first rows.
pd.options.display.max_columns = None
df.head()

In [None]:
# Print each column's dtype and non-null count for the freshly loaded frame.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()


In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Drop the imdb_id, homepage, tagline, overview and production_companies columns.
unused_columns = ['imdb_id', 'homepage', 'tagline', 'overview', 'production_companies']
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Drop the budget and revenue columns in a single call; the budget_adj and
# revenue_adj columns stay and are used for the profit feature later on.
df.drop(columns=['budget', 'revenue'], inplace=True)

In [None]:
# Discard every row that still has a missing value in any remaining column.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Preview the first rows after the rows with nulls were dropped.
df.head()

In [None]:
# Turn the pipe-delimited values in cast, genres and keywords into
# space-separated text. regex=False makes '|' a literal character on every
# pandas version; read as a regular expression, '|' is the alternation operator.
for pipe_column in ['cast', 'genres', 'keywords']:
    df[pipe_column] = df[pipe_column].str.replace('|', ' ', regex=False)


In [None]:
# Preview the first rows after the '|' separators were replaced with spaces.
df.head()

In [None]:
# Count rows that are exact copies of an earlier row (all columns equal).
duplicate_mask = df.duplicated(keep='first')
dup = duplicate_mask.sum()
print('Number of duplicates: ', dup)


In [None]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Feature Engineering
# profit: revenue_adj minus budget_adj
df['profit'] = df['revenue_adj'] - df['budget_adj']

# release_month: month number taken from the parsed release_date
release_dates = pd.to_datetime(df['release_date'])
df['release_month'] = release_dates.dt.month

# TODO: create new column for release quarter (not implemented yet)

# Preview last so the frame is shown with the new columns; an expression in
# the middle of a cell produces no output.
df.head()



In [None]:
# Summary statistics of the numeric columns, now including profit and release_month.
df.describe()

In [None]:
# Replace any remaining NaN values: 0 for numeric columns, 'None' for string columns.
numerical_features  = ['runtime', 'budget_adj', 'revenue_adj', 'vote_count', 'vote_average']
categorical_features = ['cast', 'director', 'genres', 'keywords', 'release_date', 'release_month']

# Assign the filled columns back instead of calling fillna(inplace=True) on
# df[col]: an in-place call on a selected column is chained assignment, which
# pandas warns about and which leaves df unchanged under Copy-on-Write.
# NOTE(review): release_month is numeric (it comes from .dt.month); filling it
# with the string 'None' would mix types if it ever contained gaps.
df[numerical_features] = df[numerical_features].fillna(0)
df[categorical_features] = df[categorical_features].fillna('None')


In [None]:
# Show the number of missing values left in each column after filling.
df.isna().sum()


In [None]:
# Normalise the movie titles to lowercase, then display the frame.
lowercase_titles = df['original_title'].str.lower()
df['original_title'] = lowercase_titles
df

In [None]:
import regex as re

# Collapse every run of non-word characters (anything other than letters,
# digits and underscore) into a single space. The vectorised .str.replace
# passes missing titles through instead of raising, and avoids a Python-level
# function call per row.
df['original_title'] = df['original_title'].str.replace(r'\W+', ' ', regex=True)
df

In [None]:
# Write the cleaned frame to data/df_v2.csv without the index column.
df.to_csv(path_or_buf='data/df_v2.csv', index=False)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Histogram (with KDE overlay) of each numerical feature on a 3x2 grid.
fig, axes = plt.subplots(3, 2, figsize=(15, 10))
fig.suptitle('Distribution of Numerical Features')

# (column, panel title) pairs in row-major grid order.
hist_panels = [
    ('runtime', 'Runtime'),
    ('vote_average', 'Vote Average'),
    ('vote_count', 'Vote Count'),
    ('release_year', 'Release Year'),
    ('budget_adj', 'Budget'),
    ('revenue_adj', 'Revenue'),
]

for panel_ax, (column, panel_title) in zip(axes.flat, hist_panels):
    sns.histplot(ax=panel_ax, x=column, data=df, kde=True)
    panel_ax.set_title(panel_title)

plt.show()

In [None]:
# Correlation heatmap of the numeric features, without id and release_year.
# Only numeric columns are passed to corr(): the frame still holds string
# columns (titles, cast, ...), on which recent pandas versions raise instead
# of silently skipping them.
corr_matrix = (
    df.select_dtypes(include='number')
      .drop(columns=['id', 'release_year'], errors='ignore')
      .corr()
)

plt.figure(figsize=(10, 10))
# Fix the colour scale to the full correlation range so 0 sits at the midpoint.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation of Numerical Features')
plt.show()

In [None]:
# Pairwise scatter plots (histograms on the diagonal) of selected numerical features.
pairplot_columns = ['popularity', 'budget_adj', 'revenue_adj', 'runtime', 'vote_count']
sns.pairplot(df[pairplot_columns])
# y > 1 places the title just above the grid.
plt.suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()


In [None]:
# Box plot of each numerical feature on a 4x2 grid to spot outliers.
fig, axes = plt.subplots(4, 2, figsize=(15, 10))
fig.suptitle('Distribution of Numerical Features')

# (column, panel title) pairs in row-major grid order.
box_panels = [
    ('runtime', 'Runtime'),
    ('vote_average', 'Vote Average'),
    ('vote_count', 'Vote Count'),
    ('release_year', 'Release Year'),
    ('budget_adj', 'Budget'),
    ('revenue_adj', 'Revenue'),
    ('popularity', 'Popularity'),
    ('profit', 'Profit'),
]

for panel_ax, (column, panel_title) in zip(axes.flat, box_panels):
    sns.boxplot(ax=panel_ax, x=column, data=df)
    panel_ax.set_title(panel_title)

plt.show()


In [None]:
# Number of movies released per month, with month names on the x-axis.
month_numbers = list(range(1, 13))
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

plt.figure(figsize=(10, 10))
# A fixed order keeps the bars in calendar order and keeps a slot for a month
# even if no movie was released in it, so the name labels line up.
month_ax = sns.countplot(x='release_month', data=df, order=month_numbers)
month_ax.set_xticks(range(len(month_names)))
month_ax.set_xticklabels(month_names)
month_ax.set_xlabel('Release Month')
month_ax.set_ylabel('Number of Movies')
plt.title('Number of Movies Released per Month')
plt.show()

In [None]:
# Preview the first rows of the frame before the filtering steps below.
df.head()


In [None]:
# List the column labels currently in the frame.
df.columns

In [None]:
# Print dtypes and non-null counts before any rows are filtered out.
df.info() 

In [None]:
# (disabled) drop popularity, vote_count, release_month, budget_adj, revenue_adj
# df.drop(['popularity', 'vote_count', 'release_month', 'budget_adj', 'revenue_adj'], axis=1, inplace=True)

In [None]:
# Summary statistics of the numeric columns, for reference when choosing the filters below.
df.describe()

In [None]:
# Remove rows whose runtime is below 91 or above 300.
# NOTE(review): this cell was previously described as dropping "runtime = 0";
# the code's lower bound is 91 — confirm the intended cut-off.
runtime_too_short = df['runtime'] < 91
runtime_too_long = df['runtime'] > 300
df.drop(index=df.loc[runtime_too_short | runtime_too_long].index, inplace=True)

In [None]:
# Check the remaining row count after the runtime filter.
df.info()

In [None]:
# Remove rows whose vote_count is below 5.4.
# NOTE(review): 5.4 is also the vote_average threshold applied later in this
# notebook — confirm vote_count is the intended column here.
low_vote_count = df['vote_count'] < 5.4
df.drop(index=df.loc[low_vote_count].index, inplace=True)

In [None]:
# Check the remaining row count after the vote_count filter.
df.info()

In [None]:
# Remove movies with a release_year earlier than 1960.
released_before_1960 = df['release_year'] < 1960
df.drop(index=df.loc[released_before_1960].index, inplace=True)

In [None]:
# Drop the vote_count, release_month and id columns in one call.
df.drop(columns=['vote_count', 'release_month', 'id'], inplace=True)

In [None]:
# Summary statistics of the remaining numeric columns.
df.describe()

In [None]:
# Remove rows whose budget_adj is below 100,000.
budget_too_low = df['budget_adj'] < 100_000
df.drop(index=df.loc[budget_too_low].index, inplace=True)

In [None]:
# Check the remaining row count after the budget_adj filter.
df.info()

In [None]:
# Summary statistics of the numeric columns after the budget_adj filter.
df.describe()

In [None]:
# Remove rows whose vote_average is below 5.4.
low_vote_average = df['vote_average'] < 5.4
df.drop(index=df.loc[low_vote_average].index, inplace=True)

In [None]:
# Box plot of each remaining numerical feature after filtering, on a 4x2 grid.
fig, axes = plt.subplots(4, 2, figsize=(15, 20))
fig.suptitle('Distribution of Numerical Features')

# (column, panel title) pairs in row-major grid order. None marks the slot
# that held vote_count; that column was dropped earlier, so the panel is
# hidden rather than left as an empty axes frame.
filtered_box_panels = [
    ('runtime', 'Runtime'),
    ('vote_average', 'Vote Average'),
    None,
    ('release_year', 'Release Year'),
    ('budget_adj', 'Budget'),
    ('revenue_adj', 'Revenue'),
    ('popularity', 'Popularity'),
    ('profit', 'Profit'),
]

for panel_ax, panel in zip(axes.flat, filtered_box_panels):
    if panel is None:
        panel_ax.set_visible(False)
        continue
    column, panel_title = panel
    sns.boxplot(ax=panel_ax, x=column, data=df)
    panel_ax.set_title(panel_title)

plt.show()


In [None]:
# Print dtypes and non-null counts of the filtered frame.
df.info()

In [None]:
# Drop popularity, release_date, budget_adj, revenue_adj and profit,
# then preview the columns that remain.
final_drop_columns = ['popularity', 'release_date', 'budget_adj', 'revenue_adj', 'profit']
df.drop(columns=final_drop_columns, inplace=True)
df.head()

In [None]:
# Write the filtered frame to data/df_v3.csv without the index column.
df.to_csv(path_or_buf='data/df_v3.csv', index=False)
