In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.linear_model import LinearRegression 

In [None]:
# Load the raw budget/revenue CSV and display its (rows, columns) shape.
movie_df = pd.read_csv('cost_revenue_dirty.csv')
movie_df.shape

In [None]:
# Preview the first rows of the raw data.
movie_df.head()

In [None]:
# Check how pandas typed each column before any cleaning is applied.
movie_df.dtypes

In [None]:
# Strip the currency formatting ('$' signs and ',' thousands separators) from
# the three money columns and convert them to float64 so they can be used in
# numeric comparisons, arithmetic and plots.
money_columns = ['USD_Production_Budget', 'USD_Worldwide_Gross', 'USD_Domestic_Gross']

for column in money_columns:
    movie_df[column] = (
        movie_df[column]
        .astype(str)
        # regex=False forces a literal match: '$' is a regex end-of-string
        # anchor, and the default of `regex` differs between pandas versions.
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(np.float64)
    )

In [None]:
# Parse Release_Date into datetimes so it can be compared against date strings
# and used as a time axis in the later cells.
# NOTE(review): the date format is inferred by pandas — confirm it matches the file.
movie_df['Release_Date'] = pd.to_datetime(movie_df['Release_Date'])

In [None]:
# Summary statistics of the cleaned columns (useful for spotting zeros and outliers).
movie_df.describe()

In [None]:
# Shape of the subset of films whose domestic gross is exactly zero.
# (The original author recorded 512 such films for their copy of the data.)
movie_df.loc[movie_df['USD_Domestic_Gross'].eq(0)].shape

In [None]:
# The three highest-budget films that grossed nothing domestically.
(
    movie_df
    .loc[movie_df['USD_Domestic_Gross'].eq(0)]
    .sort_values(by='USD_Production_Budget', ascending=False)
    .head(3)
)

In [None]:
# The three highest-budget films that grossed nothing worldwide.
(
    movie_df
    .loc[movie_df['USD_Worldwide_Gross'].eq(0)]
    .sort_values(by='USD_Production_Budget', ascending=False)
    .head(3)
)

In [None]:
# Films with some worldwide revenue but zero domestic revenue.
movie_df.loc[
    movie_df['USD_Worldwide_Gross'].gt(0)
    & movie_df['USD_Domestic_Gross'].eq(0)
]

In [None]:
# Alternative to boolean-mask indexing: express the filter as a query string.
# Selects films with zero domestic gross and non-zero worldwide gross.
movie_df.query('USD_Domestic_Gross == 0 and USD_Worldwide_Gross != 0')

In [None]:
# Shape of the subset released on or after the 2018-05-01 cutoff
# (these rows are excluded from df_clean in the next cell).
movie_df.loc[movie_df['Release_Date'] >= '2018-05-01'].shape

In [None]:
# Keep only films released before the 2018-05-01 cutoff. The explicit .copy()
# makes df_clean an independent frame, so the columns assigned to it in later
# cells do not trigger SettingWithCopyWarning on a slice of movie_df.
df_clean = movie_df[movie_df['Release_Date'] < '2018-05-01'].copy()

In [None]:
# Preview the filtered frame.
df_clean.head()

In [None]:
# Add Total_Gross as the sum of the worldwide and domestic gross columns.
# assign() returns a new frame instead of writing a column into what may be a
# filtered slice, which avoids SettingWithCopyWarning and keeps the cell
# safe to re-run.
# NOTE(review): if USD_Worldwide_Gross already includes the domestic figure
# (common for box-office data), this sum double-counts domestic revenue and
# the loss analysis below is too optimistic — confirm against the data source.
df_clean = df_clean.assign(
    Total_Gross=df_clean['USD_Worldwide_Gross'] + df_clean['USD_Domestic_Gross']
)

In [None]:
# Films whose total gross did not cover the production budget.
df_clean.query('Total_Gross < USD_Production_Budget')

In [None]:
# Share of films (as a percentage, two decimals) that grossed less than they cost.
movies_losses = df_clean.query('Total_Gross < USD_Production_Budget')
loss_share_pct = round(len(movies_losses) / len(df_clean) * 100, 2)
print(f'{loss_share_pct}%')

### Seaborn Visualization

In [None]:
# Scatter plot of production budget (x) against worldwide gross (y); point
# colour and size both encode worldwide gross.
plt.figure(figsize=(8, 4), dpi=200)

# Style this single chart instead of applying the style to all charts.
with sb.axes_style('darkgrid'):
    ax = sb.scatterplot(
        data=df_clean,
        x='USD_Production_Budget',
        y='USD_Worldwide_Gross',
        hue='USD_Worldwide_Gross',
        size='USD_Worldwide_Gross',
    )

# Fix the axis ranges and label the axes (label typo 'Revene' corrected).
ax.set(
    ylim=(0, 3e9),
    xlim=(0, 4.5e8),
    ylabel='Revenue in $ billions',
    xlabel='Budget in $ 100 millions',
)
plt.show()

In [None]:
# Peek at a single row as a reminder of the available column names.
df_clean.head(1)

In [None]:
# Production budget plotted against release date; point colour and size both
# encode worldwide gross.
plt.figure(dpi=200, figsize=(8, 4))

with sb.axes_style('darkgrid'):
    ax = sb.scatterplot(
        x='Release_Date',
        y='USD_Production_Budget',
        hue='USD_Worldwide_Gross',
        size='USD_Worldwide_Gross',
        data=df_clean,
    )
    # Cap the y-axis and label both axes.
    ax.set(ylim=(0, 4.5e8), ylabel='Budget in $100 millions', xlabel='Year')

plt.show()

In [None]:
# Peek at a single row before adding the Decade column.
df_clean.head(1)

In [None]:
# Add a Decade column: floor the release year to the start of its decade
# (e.g. 1994 -> 1990). assign() returns a new frame instead of writing into
# what may be a filtered slice, avoiding SettingWithCopyWarning; the .dt
# accessor reads the year straight from the datetime column.
df_clean = df_clean.assign(Decade=df_clean['Release_Date'].dt.year // 10 * 10)

In [None]:
# Confirm the new Decade column looks right.
df_clean.head(5)

In [None]:
# Split the films at 1970: earlier decades are "old", 1970s onward are "new".
old_films = df_clean.loc[df_clean['Decade'].lt(1970)]
new_films = df_clean.loc[df_clean['Decade'].ge(1970)]

In [None]:
# How many films fall in the pre-1970 group (rows, columns).
old_films.shape

In [None]:
# Pre-1970 films ordered from the most to the least expensive.
old_films.sort_values(
    by='USD_Production_Budget',
    ascending=False,
)

### Linear Regression

In [None]:
# Budget vs. worldwide gross for the pre-1970 films, with a fitted regression line.
plt.figure(dpi=200, figsize=(8, 4))

with sb.axes_style('whitegrid'):
    sb.regplot(
        x='USD_Production_Budget',
        y='USD_Worldwide_Gross',
        data=old_films,
        line_kws=dict(color='black'),
        scatter_kws=dict(alpha=0.4),  # translucent points so overlaps stay visible
    )

In [None]:
# Budget vs. worldwide gross for films from the 1970s onward, with a fitted
# regression line, using the same axis limits as the first scatter plot.
plt.figure(dpi=200, figsize=(8, 4))

with sb.axes_style('darkgrid'):
    ax = sb.regplot(
        x='USD_Production_Budget',
        y='USD_Worldwide_Gross',
        data=new_films,
        color='#2f4b7c',
        line_kws=dict(color='#ff7c43'),
        scatter_kws=dict(alpha=0.3),  # translucent points so overlaps stay visible
    )
    ax.set(
        ylim=(0, 3e9),
        xlim=(0, 4.5e8),
        ylabel='Revenue in $ billions',
        xlabel='Budget in $100 millions',
    )

In [None]:
# Create an (unfitted) ordinary least-squares model with default settings.
linreg = LinearRegression()

In [None]:
# Feature matrix (independent variable): a one-column frame with the budget.
X = new_films[['USD_Production_Budget']]

# Target (the "answer key"): a one-column frame with the worldwide gross.
y = new_films[['USD_Worldwide_Gross']]

In [None]:
# Fit the regression of worldwide gross (y) on production budget (X).
linreg.fit(X, y)


In [None]:
# Intercept of the fitted line: the predicted revenue at a budget of zero.
linreg.intercept_

In [None]:
linreg.coef_  # slope of the fitted line (change in revenue per unit of budget)
# Per the original author's note, one run gave a slope of about 3.12: each extra
# $1 of budget was associated with roughly $3.12 more worldwide revenue.
# Re-check the figure after re-running the notebook.

In [None]:
linreg.score(X, y)
# R^2 (explained variance), computed on the same data the model was fitted on.
# Per the original author's note, the model explained about 56% of the variance
# in movie revenue; re-check the figure after re-running the notebook.

### Making a Prediction

In [None]:
# Estimate revenue for a hypothetical $350M budget from the fitted line
# (intercept + slope * budget), rounded to the nearest million.
supposed_budget = 350e6
intercept = linreg.intercept_[0]
slope = linreg.coef_[0, 0]
revenue_estimate = round(intercept + slope * supposed_budget, -6)
revenue_estimate