In [None]:
# Import libraries
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
# Default size for every figure in this notebook, as (width, height)
matplotlib.rcParams['figure.figsize'] = (12,8)

# Read in the data; the path is relative, so movies.csv must be reachable
# from the directory the notebook is run in
df = pd.read_csv('movies.csv')


In [None]:
# Preview the first rows of the loaded data
df.head()

In [None]:
# Look for missing data

def report_missing(frame):
    """Print the percentage of missing values in each column of ``frame``.

    One line is printed per column, formatted as ``<column> - <pct>%`` with
    the percentage rounded to one decimal place.
    """
    for col in frame.columns:
        pct_missing = np.mean(frame[col].isnull()) * 100
        print('{} - {:.1f}%'.format(col, pct_missing))


report_missing(df)

# Drop every row that contains at least one missing value
df = df.dropna()

# Check again for missing data: every column should now report 0.0%
report_missing(df)

In [None]:
# Data types for columns

df.dtypes

In [None]:
# Cast budget, votes and gross from float to 64-bit integers in one call

df = df.astype({'budget': 'int64', 'votes': 'int64', 'gross': 'int64'})

df

In [None]:
# Order the rows from highest to lowest gross

df = df.sort_values('gross', ascending=False)

In [None]:
# Limit DataFrame display to 30 rows at a time

pd.set_option('display.max_rows', 30)

In [None]:
# Drop any duplicate rows

# drop_duplicates() returns a new frame rather than modifying df, so the
# result must be assigned back; otherwise the duplicates stay in df.
df = df.drop_duplicates()
df

In [None]:
# Scatter plot of budget against gross (gross on the x axis, budget on the y axis)

plt.scatter(x=df['gross'], y=df['budget'])
plt.title('Budget vs Gross Earnings')
plt.xlabel('Gross Earnings')
plt.ylabel('Budget for Film')

# show must be called: the bare name `plt.show` only echoes the function repr
plt.show()

In [None]:
# Look at the first rows again to compare the raw values with the plot
# NOTE(review): df was sorted by gross descending in an earlier cell, so these are presumably the top-grossing rows - confirm

df.head()

In [None]:
# Seaborn regression plot of budget (y) against gross (x):
# red scatter points with a blue fitted line

sns.regplot(
    data=df,
    x='gross',
    y='budget',
    scatter_kws={"color": "red"},
    line_kws={"color": "blue"},
)


In [None]:
# Start looking at the correlation

In [None]:
# We see high correlation between gross and budget
# Votes and gross earnings also show some correlation

# Correlations are computed over the numeric columns only
correlation_matrix = df.corr(numeric_only=True)

# heatmap() returns the Axes it drew on; label the plot through it
heatmap_ax = sns.heatmap(correlation_matrix, annot=True)
heatmap_ax.set_title('Correlation Matrix for Numeric Features')
heatmap_ax.set_xlabel('Movie Features')
heatmap_ax.set_ylabel('Movie Features')
plt.show()

In [None]:
# Build an all-numeric version of the data for correlation analysis.
# Work on a copy: plain assignment would only alias df, and the encoding
# below would then overwrite the original text columns of df as well.
df_numerized = df.copy()

# Replace every object-dtype column with its integer category codes
for col_name in df_numerized.columns:
    if df_numerized[col_name].dtype == 'object':
        df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes

df_numerized

In [None]:
# Correlation matrix of the encoded frame; unstack() then reshapes it into a
# Series with one entry per (column, column) pair
correlation_mat = df_numerized.corr()
corr_pairs = correlation_mat.unstack()
corr_pairs

In [None]:
# Order the pairs by correlation value (sort_values defaults to ascending)
sorted_pairs = corr_pairs.sort_values()
sorted_pairs

In [240]:
# Keep only the pairs whose correlation is above 0.5. Each column paired with
# itself (correlation 1.0) passes the filter too.
strong_mask = sorted_pairs > 0.5
high_corr = sorted_pairs[strong_mask]
high_corr

votes     gross       0.614751
gross     votes       0.614751
          budget      0.740247
budget    gross       0.740247
name      name        1.000000
director  director    1.000000
gross     gross       1.000000
budget    budget      1.000000
country   country     1.000000
star      star        1.000000
writer    writer      1.000000
votes     votes       1.000000
score     score       1.000000
released  released    1.000000
year      year        1.000000
genre     genre       1.000000
rating    rating      1.000000
company   company     1.000000
runtime   runtime     1.000000
dtype: float64

In [None]:
# We see that votes and budget are confirmed to have the highest correlation to gross earnings