In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from statistics import mean, median, mode, stdev

In [2]:
# Load the raw movie table. Besides pandas' default NA markers, the literal
# text "[]" is also read as a missing value.
Movie = pd.read_csv(
    'movie_collection_data.csv',
    na_values='[]',
    na_filter=True,
)


# Dropping Invalid Data

In [9]:
## Drop INVALID DATA
# Columns that must be present for a row to be kept.
required_columns = ['ReleaseDate', 'SpokenLanguage', 'ProductionCompany',
                    'ProductionCountry', 'GenreList']

# Built as one chain from the raw `Movie` frame, so re-running the cell
# always yields the same result.
# NOTE(review): the 281 budget cut-off is taken as-is; its rationale is not
# stated anywhere in this notebook.
Movie_CleanedData = (
    Movie
    .loc[lambda frame: ~(frame['Budget'] < 281)]     # discard budgets below 281
    .loc[lambda frame: ~(frame['Revenue'] == 0)]     # discard zero-revenue rows
    .dropna(subset=required_columns)
    # keep=False removes *every* row whose Movie_name occurs more than once
    .drop_duplicates(subset="Movie_name", keep=False)
    # renumber rows 0..n-1 after filtering
    .reset_index(drop=True)
)
Movie_CleanedData.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4111 entries, 0 to 4110
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Movie_name         4111 non-null   object 
 1   Adult              4111 non-null   bool   
 2   GenreList          4111 non-null   object 
 3   Budget             4111 non-null   int64  
 4   Popularity         4111 non-null   float64
 5   VoteAvg            4111 non-null   float64
 6   VoteCount          4111 non-null   int64  
 7   Overview           4111 non-null   object 
 8   Tagline            3845 non-null   object 
 9   Revenue            4111 non-null   int64  
 10  ReleaseDate        4111 non-null   object 
 11  Runtime            4111 non-null   float64
 12  OriginalLanguage   4111 non-null   object 
 13  ProductionCompany  4111 non-null   object 
 14  ProductionCountry  4111 non-null   object 
 15  SpokenLanguage     4111 non-null   object 
dtypes: bool(1), float64(3), int64(3), object(9)

## Calculate and Insert Profit Variable

In [None]:

# Profit is what a movie earned minus what it cost: Revenue - Budget.
# (The previous positional version computed Budget - Revenue, i.e. the
# negated profit.) Columns are addressed by name so the calculation does not
# depend on column order.
Profit = (Movie_CleanedData['Revenue'] - Movie_CleanedData['Budget']).tolist()

# Make the cell safe to re-run: drop a Profit column left by an earlier
# execution instead of inserting a second column with the same name.
if "Profit" in Movie_CleanedData.columns:
    Movie_CleanedData = Movie_CleanedData.drop(columns="Profit")

# Position 10 places Profit directly after Revenue (column 9).
Movie_CleanedData.insert(10, "Profit", Profit)

## Create new column for release year and month

In [None]:

# Parse ReleaseDate once, then derive both calendar fields from the same index.
release_dates = pd.DatetimeIndex(Movie_CleanedData['ReleaseDate'])
Movie_CleanedData['Year'] = release_dates.year
Movie_CleanedData['Month'] = release_dates.month

In [None]:
# Summary statistics of the cleaned frame, now including the Profit, Year and
# Month columns added in the preceding cells.
Movie_CleanedData.describe()

## Determine and replace VoteCount outliers


In [None]:

# Flag VoteCount outliers with the 1.5 * IQR rule and replace them by medians.
# Quartiles are kept as exact floats: truncating them to int shifts the
# fences and changes which rows are flagged.
vote_counts = Movie_CleanedData['VoteCount']
Q1 = vote_counts.quantile(0.25)
Q3 = vote_counts.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Replacement values. The VoteCount median is cast to int so the column keeps
# its integer dtype; the VoteAvg median stays a float (truncating e.g. 6.3 to
# 6 would bias the ratings). Named `median_votecount` rather than `median` so
# the `median` function imported from `statistics` is not shadowed.
median_votecount = int(vote_counts.median())
median_avg = Movie_CleanedData['VoteAvg'].median()

# Boolean mask computed once from the original values; unlike a row-by-row
# `.loc[i, ...]` loop it does not rely on the index being exactly 0..n-1.
is_outlier = (vote_counts > upper_fence) | (vote_counts < lower_fence)

# NOTE(review): VoteAvg is overwritten for rows whose *VoteCount* is an
# outlier, as in the original logic — confirm this is intended.
Movie_CleanedData.loc[is_outlier, 'VoteCount'] = median_votecount
Movie_CleanedData.loc[is_outlier, 'VoteAvg'] = median_avg


## EXPLORATORY ANALYSIS

In [None]:
# Summary statistics again, this time after the VoteCount outlier replacement,
# for comparison with the earlier describe() output.
Movie_CleanedData.describe()

In [None]:
# Columns treated as numeric variables in the exploratory analysis.
numeric_columns = [
    'Budget', 'Popularity', 'VoteAvg', 'VoteCount', 'Revenue',
    'Profit', 'Runtime', 'Year', 'Month',
]
NumericData = pd.DataFrame(Movie_CleanedData[numeric_columns])
NumericData.head()

In [None]:
# One row of plots per numeric variable: box plot, histogram, violin plot.
# The row count is taken from the frame so the grid stays correct when columns
# are added to or removed from NumericData; squeeze=False keeps `axes`
# two-dimensional even when there is only one variable.
n_vars = NumericData.shape[1]
f, axes = plt.subplots(n_vars, 3, figsize=(36, 40), squeeze=False)

for count, var in enumerate(NumericData):
    sb.boxplot(data = NumericData[var], orient = "h", ax = axes[count,0])
    sb.histplot(data = NumericData[var], ax = axes[count,1])
    sb.violinplot(data = NumericData[var], orient = "h", ax = axes[count,2])
    # Label every panel so each row can be identified on its own.
    for ax in axes[count]:
        ax.set_title(var)

In [None]:

# Pairwise correlations of the numeric variables, computed once and reused for
# both the printed table and the heatmap.
corr_matrix = NumericData.corr()
print(corr_matrix)

# Heatmap of the same matrix; the colour scale is fixed to [-1, 1].
f = plt.figure(figsize=(12, 12))
sb.heatmap(
    corr_matrix,
    vmin = -1,
    vmax = 1,
    linewidths = 1,
    annot = True,
    fmt = ".2f",
    annot_kws = {"size": 18},
    cmap = "RdBu",
)

In [None]:
# Columns treated as categorical variables in the exploratory analysis.
categorical_columns = [
    'Adult', 'GenreList', 'OriginalLanguage',
    'ProductionCompany', 'ProductionCountry', 'SpokenLanguage',
]
CategoricalData = pd.DataFrame(Movie_CleanedData[categorical_columns])

In [None]:

# Build the categorical subset and store every column with the pandas
# `category` dtype.
#
# Fixes relative to the previous version of this cell:
#   * `pd.CategoricalData` does not exist; the frame is built with
#     `pd.DataFrame`.
#   * The casts referenced MSSubClass / Neighborhood / BldgType / OverallQual,
#     which are not columns of this frame; the columns actually selected
#     below are cast instead.
#   * `sb.pairplot` was removed: it only draws numeric variables, so it has
#     nothing meaningful to show for a frame of categorical columns.
CategoricalData = pd.DataFrame(
    Movie_CleanedData[['Adult','GenreList','OriginalLanguage','ProductionCompany','ProductionCountry','SpokenLanguage']]
)
CategoricalData = CategoricalData.astype('category')
CategoricalData.head()