## Introduction 

#### The aim of this project is to identify which factors contribute to a movie's total gross revenue. We will use Python to load, clean, and visualize the dataset.

In [None]:


#import the packages we will use in this project

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)
pd.options.mode.chained_assignment = None
#read in the data
df = pd.read_csv('../input/movies/movies.csv')

In [None]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

In [None]:
# Keep only rows that have a value in every column.
df = df.dropna(axis=0, how='any')

## Data Cleaning 



    
   

In [None]:
# Report, per column, the percentage of rows whose value is missing.
# Rows containing missing values were dropped in an earlier cell (df.dropna()),
# so this serves as a sanity check on that step.
missing_share = df.isnull().mean()

for column_name, share in missing_share.items():
    print(f'{column_name} - {round(share * 100)}%')

In [None]:
# Print the dtype pandas assigned to each column.
column_types = df.dtypes
print(column_types)

In [None]:
# Problem: `year` column is inaccurate
# Solution: extract the correct year from `released` column

# Split `released` on spaces into at most four pieces, labelled
# month / day / year / country.
# NOTE(review): assumes every value looks like "<month> <day>, <year> (<country>)";
# rows in another format would be mislabelled. Verify against the data.
released_df = (
    df['released']
    .str.split(' ', n=3, expand=True)
    .rename(columns={0: 'month', 1: 'day', 2: 'year', 3: 'country'})
)

# Add new columns to df
df['released_year'] = released_df['year']
df['released_month'] = released_df['month']
# Strip any trailing comma left on the day token (e.g. "13," -> "13");
# this is a no-op when there is no comma.
df['released_day'] = released_df['day'].str.rstrip(',')

# 'released_year' will replace 'year'. errors='ignore' keeps the cell re-runnable:
# a second execution no longer raises KeyError because 'year' is already gone.
df = df.drop(columns=['year'], errors='ignore')

df.head()

In [None]:
# Store the two money columns as pandas' nullable 64-bit integers.
df = df.astype({'budget': 'Int64', 'gross': 'Int64'})



In [None]:
# Show the movies ordered from highest to lowest gross (df itself is not modified).
df.sort_values('gross', ascending=False)

In [None]:
# With min_rows unset (None), a truncated frame display follows display.max_rows.
pd.options.display.min_rows = None

In [None]:
# Re-display the movies by descending gross, now using the updated display option.
df.sort_values('gross', ascending=False)

In [None]:
# Distinct production companies, listed in descending (reverse alphabetical) order.
unique_companies = df['company'].drop_duplicates()
unique_companies.sort_values(ascending=False)

In [None]:
# Drop fully duplicated rows.
# drop_duplicates() returns a new frame, so the result must be assigned back;
# calling it without assignment leaves df unchanged.
df = df.drop_duplicates()
df.head()

## Visualisation 




In [None]:
# Budget vs gross scatter plot.
# budget/gross were cast to the nullable Int64 dtype in an earlier cell;
# convert them to plain floats before plotting.
df['gross'] = df['gross'].astype('float')
df['budget'] = df['budget'].astype('float')

plt.scatter(x=df['budget'], y=df['gross'])
plt.title('Budget vs Gross Earning')
# x carries the budget and y the gross, so the labels must follow that order.
plt.xlabel('Budget for Film')
plt.ylabel('Gross Earning')

plt.show()

In [None]:
# Preview the first five rows of the current frame.
df.head(n=5)

In [None]:
# Scatter plus fitted regression line: how closely does gross revenue track budget?
point_style = dict(color='red')
fit_style = dict(color='blue')

reg_ax = sns.regplot(x='budget', y='gross', data=df,
                     scatter_kws=point_style, line_kws=fit_style)
reg_ax.set_title('Budget vs Gross Revenue')
reg_ax.set_xlabel('Budget')
reg_ax.set_ylabel('Gross Revenue')
plt.show()

In [None]:
# Confirm high correlation b/w budget and gross (the original analysis noted 0.74).
# df still contains string columns here; on pandas >= 2.0, corr() no longer drops
# them silently and raises instead, so select the numeric columns explicitly.
df.select_dtypes(include='number').corr(method='pearson')

In [None]:
# Heatmap of the Pearson correlations between the numeric columns.
# df still contains string columns here; on pandas >= 2.0, corr() raises on them,
# so the matrix is computed from the numeric columns only.
numeric_columns = df.select_dtypes(include='number')
correlation_matrix = numeric_columns.corr(method='pearson')

sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Matrix for Numeric Movie Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

In [None]:
# Change all column values to numbers in order to run a (bigger) correlation matrix.
# Work on a copy: plain assignment (df_numerized = df) would only alias df, and the
# loop below would then overwrite the original string columns with integer codes.
df_numerized = df.copy()

for col_name in df_numerized.columns:
    if df_numerized[col_name].dtype == 'object':
        # Each distinct value is replaced by its integer category code.
        df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes

df_numerized.head()

In [None]:
# Heatmap of Pearson correlations across every column of the numerized frame.
correlation_matrix = df_numerized.corr(method='pearson')

heatmap_ax = sns.heatmap(correlation_matrix, annot=True)
heatmap_ax.set_title('Correlation Matrix for Numeric Movie Features')
heatmap_ax.set_xlabel('Movie Features')
heatmap_ax.set_ylabel('Movie Features')
plt.show()

In [None]:
# Flatten the correlation matrix into a Series indexed by (column, column) pairs.
correlation_mat = df_numerized.corr(method='pearson')
corr_pairs = correlation_mat.unstack()
corr_pairs.head(5)


In [None]:
# Order every pairwise correlation from the most negative to the most positive.
correlation_pairs = correlation_mat.unstack()
sorted_pairs = correlation_pairs.sort_values(ascending=True)
sorted_pairs.head(5)

In [None]:
# Show only strong positive correlations (> 0.5) between two *different* columns.
# Every column correlates perfectly with itself, so those (col, col) entries are
# excluded; otherwise they clutter the list of genuinely strong relationships.
first_col = sorted_pairs.index.get_level_values(0)
second_col = sorted_pairs.index.get_level_values(1)
is_self_pair = first_col == second_col

high_corr = sorted_pairs[(sorted_pairs > 0.5) & ~is_self_pair]
high_corr.head()

## Conclusion

#### Number of ratings (votes) and budget have the highest correlation to gross earnings