# Linear Regression and Data Visualization with Seaborn

In [82]:
import pandas as pd

In [83]:
df = pd.read_csv('data/cost_revenue_dirty.csv')

## Data Exploration and Cleaning

In [84]:
print(f'The data have {df.shape[0]} rows and {df.shape[1]} columns')
print(f'The data have {df.isna().sum().sum()} NaN values')
print(f'The data have {df.duplicated().sum()} duplicated rows')
print(f'Type of columns:\n{df.dtypes}')

The data have 5391 rows and 6 columns
The data have 0 NaN values
The data have 0 duplicated rows
Type of columns:
Rank                      int64
Release_Date             object
Movie_Title              object
USD_Production_Budget    object
USD_Worldwide_Gross      object
USD_Domestic_Gross       object
dtype: object


In [85]:
# Removing $ and , from the columns
df['USD_Production_Budget'] = df['USD_Production_Budget'].str.replace('$', '').str.replace(',', '')
df['USD_Worldwide_Gross'] = df['USD_Worldwide_Gross'].str.replace('$', '').str.replace(',', '')
df['USD_Domestic_Gross'] = df['USD_Domestic_Gross'].str.replace('$', '').str.replace(',', '')

In [86]:
df.Release_Date = pd.to_datetime(df.Release_Date)

## Investigate the Films that had Zero Revenue

In [87]:
# the average production budget of the films in the data set
avg_production_budget = df['USD_Production_Budget'].astype('float').mean()
avg_production_budget

31113737.57837136

In [88]:
# the average worldwide gross revenue of films
avg_worldwide_gross = df['USD_Worldwide_Gross'].astype('float').mean()
avg_worldwide_gross

88855421.96271564

In [89]:
min_domestic_gross = df['USD_Domestic_Gross'].astype('float').min()
min_worldwide_gross = df['USD_Worldwide_Gross'].astype('float').min()
print(min_domestic_gross)
print(min_worldwide_gross)

0.0
0.0


In [90]:
# Are the bottom 25% of films actually profitable or do they lose money?
bottom_25_percent = df['USD_Worldwide_Gross'].astype('float').quantile(0.25)
bottom_25_percent

3865206.0

In [91]:
# the highest production budget and highest worldwide gross revenue of any film
max_production_budget = df['USD_Production_Budget'].astype('float').max()
max_worldwide_gross = df['USD_Worldwide_Gross'].astype('float').max()
print(max_production_budget)
print(max_worldwide_gross)

425000000.0
2783918982.0


In [92]:
# How much revenue did the lowest budget films make?
min_production_budget = df['USD_Production_Budget'].astype('float').min()
min_budget_films = df[df['USD_Production_Budget'].astype('float') == min_production_budget]
min_budget_revenue = min_budget_films['USD_Worldwide_Gross'].astype('float')
min_budget_revenue

2427    181041.0
Name: USD_Worldwide_Gross, dtype: float64

In [93]:
# How much revenue did the highest budget films make?
max_budget_films = df[df['USD_Production_Budget'].astype(
    'float') == max_production_budget]
max_budget_revenue = max_budget_films['USD_Worldwide_Gross'].astype('float')
max_budget_revenue

3529    2.783919e+09
Name: USD_Worldwide_Gross, dtype: float64

In [94]:
# How many films grossed $0 domestically?
zero_domestic_gross = df[df['USD_Domestic_Gross'].astype('float') == 0]
zero_domestic_gross.sort_values('USD_Production_Budget', ascending=False)

Unnamed: 0,Rank,Release_Date,Movie_Title,USD_Production_Budget,USD_Worldwide_Gross,USD_Domestic_Gross
4526,3500,2013-12-31,Re-Kill,9500000,0,0
4743,4955,2014-12-08,Jesse,950000,0,0
3817,4954,2010-12-31,Trance,950000,0,0
4689,4953,2014-10-01,Banshee Chapter,950000,78122,0
4843,4956,2015-03-03,Ask Me Anything,950000,0,0
...,...,...,...,...,...,...
4163,5306,2012-05-18,Indie Game: The Movie,100000,0,0
4536,5308,2013-12-31,Echo Dr.,100000,0,0
4783,5307,2014-12-31,"Dude, Where's My Dog",100000,0,0
5028,5312,2015-10-11,The Night Visitor,100000,0,0


In [95]:
# How many films grossed $0 worldwide? What are the highest budget films that had no revenue internationally (i.e., the biggest flops)?
zero_worldwide_gross = df[df['USD_Worldwide_Gross'].astype('float') == 0]
zero_worldwide_gross.sort_values('USD_Production_Budget', ascending=False)

Unnamed: 0,Rank,Release_Date,Movie_Title,USD_Production_Budget,USD_Worldwide_Gross,USD_Domestic_Gross
4526,3500,2013-12-31,Re-Kill,9500000,0,0
4843,4956,2015-03-03,Ask Me Anything,950000,0,0
3817,4954,2010-12-31,Trance,950000,0,0
4743,4955,2014-12-08,Jesse,950000,0,0
4736,4958,2014-11-21,Food Chains,913000,0,0
...,...,...,...,...,...,...
4163,5306,2012-05-18,Indie Game: The Movie,100000,0,0
4536,5308,2013-12-31,Echo Dr.,100000,0,0
4783,5307,2014-12-31,"Dude, Where's My Dog",100000,0,0
5116,5310,2015-12-31,Lunchtime Heroes,100000,0,0
