# Movie Industry Data Analysis

### Data Importing

In [3]:
#Import libraries
import os

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.pyplot import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8) #Adjust the configuration of the plots we will create

In [None]:
# Read the data
# The CSV location is read from the MOVIES_CSV environment variable when it is
# set, so the notebook can run on another machine without editing this cell.
# The fallback is the path this notebook was originally written against.
MOVIES_CSV = os.environ.get(
    'MOVIES_CSV',
    '/Users/hoanglinh/Project/Important/PortfolioPreject/Project 1/movies.csv',
)
df = pd.read_csv(MOVIES_CSV)
df.head()

### Data Cleaning

In [None]:
# Missing data: share of null values per column.
# isnull().mean() is a fraction in [0, 1]; scale it by 100 so the number
# printed in front of the '%' sign really is a percentage.
for col in df.columns:
    pct_missing = df[col].isnull().mean() * 100
    print('{} - {:.2f}%'.format(col, pct_missing))

In [None]:
# Replace every missing value in the frame with 0
df = df.fillna(0)

In [None]:
# Inspect each column's dtype to see which columns need casting
df.dtypes

In [None]:
# Check missing data again after the fill.
# isnull().mean() is a fraction in [0, 1]; scale it by 100 so the number
# printed in front of the '%' sign really is a percentage.
for col in df.columns:
    pct_missing = df[col].isnull().mean() * 100
    print('{} - {:.2f}%'.format(col, pct_missing))

In [None]:
# Cast the amount/count columns to 64-bit integers in one pass
int_columns = ['budget', 'gross', 'votes', 'runtime']
df = df.astype({col: 'int64' for col in int_columns})

In [None]:
# Inspect the frame after the integer casts
df

In [None]:
# Column dtypes and non-null counts before the release date is parsed
df.info()

In [None]:
# Correct release year

# Pull the "<Month> <day>, <year>" part out of `released` and parse it.
# Text that does not match the pattern, or does not parse with the format,
# becomes NaT instead of raising.
df['date'] = df['released'].str.extract(r'(\w+ \d+, \d+)', expand=False)
df['date'] = pd.to_datetime(df['date'], format='%B %d, %Y', errors='coerce')

# Where no full date could be parsed, fall back to the `year` column before it
# is dropped. Otherwise those rows carry a missing year, which the later
# fill-with-0 step would turn into year 0.
df['yearcorrected'] = df['date'].dt.year.fillna(df['year'])

df = df.drop(columns=['released', 'year'])
df

In [None]:
# Re-check column dtypes and non-null counts now that `date` and
# `yearcorrected` have been added
df.info()

In [None]:
# Missing data after the date parsing step.
# isnull().mean() is a fraction in [0, 1]; scale it by 100 so the number
# printed in front of the '%' sign really is a percentage.
for col in df.columns:
    pct_missing = df[col].isnull().mean() * 100
    print('{} - {:.2f}%'.format(col, pct_missing))
    

In [None]:
# Replace every remaining missing value with 0, in every column.
# NOTE(review): this is a blanket fill, so any value still missing in `date`
# or `yearcorrected` is replaced too — a missing year becomes 0. Confirm that
# is intended before reading anything into year-based correlations.
df.fillna(value=0, inplace=True)

In [None]:
# Store the corrected release year as a 64-bit integer
df = df.astype({'yearcorrected': 'int64'})

In [None]:
# Inspect the frame after cleaning
df

In [None]:
# View the films ordered from highest to lowest gross (df itself is not modified)
df.sort_values('gross', ascending=False)

In [None]:
# Remove the row limit on displayed frames/series. This is a global pandas
# option: it stays in effect for every later cell in this kernel session.
pd.set_option('display.max_rows', None)

In [None]:
# List each distinct company once (first occurrence kept), shown as text in
# descending alphabetical order. `df` itself is left untouched.
company2 = df.drop_duplicates(subset='company')['company']
company_names = company2.astype(str)
company_names.sort_values(ascending=False)


### Data Correlation

In [None]:
# Preview only the first rows: with display.max_rows set to None earlier in
# the notebook, a bare `df` would render every row of the frame.
df.head()

#### Budget and Gross

In [None]:
# Scatter plot of gross earnings against budget

plt.scatter(x=df['budget'], y=df['gross'])

plt.title('Budget and Gross Earnings')

# Axis labels follow the plotted data: budget is on x, gross is on y
plt.xlabel('Film Budget')
plt.ylabel('Gross Earnings')
plt.show()



In [None]:
# Budget vs. gross with a fitted regression line (red points, black line)
point_style = {'color': 'red'}
fit_style = {'color': 'black'}
sns.regplot(data=df, x='budget', y='gross', scatter_kws=point_style, line_kws=fit_style)



In [None]:
# Pairwise correlation between the numeric variables
numeric_cols = ['budget', 'gross', 'yearcorrected', 'runtime', 'votes', 'score']
df2 = df.loc[:, numeric_cols].copy()
df2.corr()

In [None]:
# Correlation including rating, genre and country.
# Each of the three text columns is converted to `category` dtype and its
# integer category codes are stored in a companion column.
code_columns = {'rating': 'rated', 'genre': 'genre2', 'country': 'country2'}
for source_col, code_col in code_columns.items():
    df[source_col] = df[source_col].astype('category')
    df[code_col] = df[source_col].cat.codes

corr_columns = [
    'budget', 'gross', 'yearcorrected', 'runtime', 'votes', 'score',
    'rated', 'country2', 'genre2',
]
df3 = df[corr_columns].copy()
df3.corr()




The results show that a film's gross earnings correlate most strongly with its budget: the higher the budget, the higher the earnings tend to be.
Runtime, votes, country, and genre also show some correlation with gross earnings, but none is as strong as budget.
Other variables, such as year, score, and rating, do not correlate strongly with a film's gross earnings.

In [None]:
# Heatmap of the correlation matrix, with each cell annotated by its value
correlation_matrix = df3.corr()

ax = sns.heatmap(correlation_matrix, annot=True)
ax.set_title('Correlation Matrix')
ax.set_xlabel('Movie Features')
ax.set_ylabel('Movie Features')

plt.show()

In [None]:
# Look at the other (non-numeric) variables by encoding them as integer codes.

# Work on a copy: `df_numerized = df` only creates a second name for the same
# frame, so the encoding below would overwrite the text columns of `df` too.
df_numerized = df.copy()

# Encode every non-numeric column. Selecting by "not numeric" also picks up
# columns that already have `category` dtype, which a `dtype == 'object'`
# test skips and leaves as text.
non_numeric_cols = df_numerized.select_dtypes(exclude=['number', 'bool']).columns
for col_name in non_numeric_cols:
    df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes

# Preview only: display.max_rows is set to None earlier in the notebook, so
# displaying the whole frame would render every row.
df_numerized.head()


In [None]:
# Correlate the numeric columns only. Any column that is still non-numeric
# (e.g. one with `category` dtype holding text) cannot be correlated, and
# DataFrame.corr() raises on it in pandas >= 2.0 instead of skipping it.
df_numerized.select_dtypes(include='number').corr()

In [None]:
# Heatmap of the correlation matrix over all encoded features.
# Restrict to numeric columns: DataFrame.corr() raises on columns that still
# hold text (e.g. `category` dtype) in pandas >= 2.0 instead of skipping them.
correlation_mtx = df_numerized.select_dtypes(include='number').corr()

sns.heatmap(correlation_mtx, annot=True)

plt.title('Full Data Correlation Matrix')

plt.xlabel('Movie Features')
plt.ylabel('Movie Features')

plt.show()

In [None]:
# Restrict to numeric columns: DataFrame.corr() raises on columns that still
# hold text (e.g. `category` dtype) in pandas >= 2.0 instead of skipping them.
correlation_mtx = df_numerized.select_dtypes(include='number').corr()

# Flatten the square matrix into a Series indexed by (feature, feature) pairs
corr_pairs = correlation_mtx.unstack()


In [None]:
# Order the correlation pairs from lowest to highest value
sorted_pairs = corr_pairs.sort_values()



In [None]:
# Keep the strongly positive pairs (> 0.5), but leave out each feature's
# correlation with itself: those diagonal entries are trivial and would
# otherwise crowd the listing.
first_feature = sorted_pairs.index.get_level_values(0)
second_feature = sorted_pairs.index.get_level_values(1)
high_corr = sorted_pairs[(sorted_pairs > 0.5) & (first_feature != second_feature)]

high_corr