# Data Cleaning
### Loading the files and removing blank data

First we import the required libraries and adjust out display settings to more easily view the datasets.

In [113]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 50)
pd.set_option('display.float_format', lambda x: '%.0f' % x)

Read each dataset as a Pandas dataframe.

In [144]:
df_bom = pd.read_csv('data/bom.movie_gross.csv')

df_iname = pd.read_csv('data/imdb.name.basics.csv')
df_iakas = pd.read_csv('data/imdb.title.akas.csv')
df_ititlebas = pd.read_csv('data/imdb.title.basics.csv')
df_ititlecrew = pd.read_csv('data/imdb.title.crew.csv')
df_ititlepri = pd.read_csv('data/imdb.title.principals.csv')
df_ititlerate = pd.read_csv('data/imdb.title.ratings.csv')

df_imovtxt = pd.read_csv('data/rt.movie_info.tsv', delimiter='\t') # utf-8 is default
df_ireviewtxt = pd.read_csv('data/rt.reviews.tsv', sep='\t',  encoding='ISO-8859-1', low_memory=False)

df_rtmov = pd.read_csv('data/tmdb.movies.csv') # utf-8 is default
df_rt_budget = pd.read_csv('data/tn.movie_budgets.csv') # utf-8 is default


# df_copy = df.copy() # make a copy of the data in the event we clobber



In [148]:
df_bom.sort_values('studio', ascending=True)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
155,Sea Rex 3D: Journey to a Prehistoric World,3D,6100000,9900000,2010
1457,Red Obsession,A23,13200,,2013
670,Revenge of the Electric Car,A23,151000,,2011
1693,Obvious Child,A24,3100000,,2014
1717,Tusk,A24,1800000,,2014
...,...,...,...,...,...
210,Outside the Law (Hors-la-loi),,96900,3300000,2010
555,Fireflies in the Garden,,70600,3300000,2011
933,Keith Lemon: The Film,,,4000000,2012
1862,Plot for Peace,,7100,,2014


In [146]:
df_bom.loc['year'] == 2018

KeyError: 'year'

In [137]:
filt

title             True
studio            True
domestic_gross    True
foreign_gross     True
year              True
Name: year, dtype: bool

In [48]:
df_studio_gross = df_bom.copy(deep="True")

In [116]:
df_studio_gross.drop(['title', 'year'], axis=1, inplace=True)


KeyError: "['title' 'year'] not found in axis"

In [117]:
df_studio_gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 4 columns):
studio            3382 non-null object
domestic_gross    3359 non-null float64
foreign_gross     3387 non-null float64
total_gross       3359 non-null float64
dtypes: float64(3), object(1)
memory usage: 106.0+ KB


### drop nulls and convert domestic_gross to integer

In [51]:
df_studio_gross['foreign_gross'].isnull().sum()


1350

In [67]:
df_studio_gross['foreign_gross'].fillna(value=0, inplace=True)

In [68]:
df_studio_gross['foreign_gross'].isnull().sum()

0

In [69]:
df_studio_gross.head()

Unnamed: 0,studio,domestic_gross,foreign_gross
0,BV,415000000.0,652000000
1,BV,334200000.0,691300000
2,WB,296000000.0,664300000
3,WB,292600000.0,535700000
4,P/DW,238700000.0,513900000


In [39]:
df_studio_gross.foreign_gross

0       652000000
1       691300000
2       664300000
3       535700000
4       513900000
          ...    
3382          NaN
3383          NaN
3384          NaN
3385          NaN
3386          NaN
Name: foreign_gross, Length: 3387, dtype: object

In [85]:
df_studio_gross['domestic_gross'].isnull().sum()

28

In [86]:
df_studio_gross['domestic_gross'].dropna(inplace=True)

In [87]:
df_studio_gross['domestic_gross'].isnull().sum()

0

In [83]:
df_studio_gross.foreign_gross = df_studio_gross.foreign_gross.astype(float)

In [None]:
#df_studio_gross.foreign_gross.astype('int64')

In [79]:
df_studio_gross.foreign_gross.value_counts()

0            1350
1200000        23
1100000        14
4200000        12
1900000        12
             ... 
406000000       1
124800000       1
695000          1
209100000       1
177600000       1
Name: foreign_gross, Length: 1205, dtype: int64

In [88]:
df_studio_gross['total_gross'] = df_studio_gross['domestic_gross'] + df_studio_gross['foreign_gross']


In [118]:
df_studio_gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 4 columns):
studio            3382 non-null object
domestic_gross    3359 non-null float64
foreign_gross     3387 non-null float64
total_gross       3359 non-null float64
dtypes: float64(3), object(1)
memory usage: 106.0+ KB


In [90]:
!pwd

/Users/kennedy/Documents/GitHub/fiprojects/mod_1_movie


In [119]:
df_studio_gross.head()

Unnamed: 0,studio,domestic_gross,foreign_gross,total_gross
0,BV,415000000,652000000,1067000000
1,BV,334200000,691300000,1025500000
2,WB,296000000,664300000,960300000
3,WB,292600000,535700000,828300000
4,P/DW,238700000,513900000,752600000


### Save a copy of cleaned data for graphing in other notebooks

In [121]:
df_studio_gross.to_csv('data/studio_gross.csv', index=False)

In [None]:
df_working = df_rt_budget.copy(deep="True") # make a hard copy of the dataset to work with

In [None]:
df_working.head()

#### Clean the characters from the currency columns and convert to integer for calculations and graphing

In [None]:

df_working[df_working.columns[3:]] = df_working[df_working.columns[3:]].apply(lambda x: x.str.replace('$','')).apply(lambda x: x.str.replace(',','')).astype(np.int64)

In [None]:
df_working.head()

In [None]:
df_working.info()

In [None]:
df_working['foreign_gross'] = df_working['worldwide_gross'] - df_working['domestic_gross']

In [None]:
df_working.head()

In [None]:
df_working['PnL'] = df_working['worldwide_gross'] - df_working['production_budget']

In [None]:
df_working.head()

In [None]:
if df_working['PnL'] > 0:
    df_working['profit'] = 'True'
    elif:
        df_working['profit'] = 'False'

        

#### Save the dataset to a file that we will access in separate notebook

In [None]:
df_working.to_csv('data/tn.movie_budgets_working.csv')

In [None]:
df_iname = pd.read_csv('data/imdb.name.basics.csv')
df_iakas = pd.read_csv('data/imdb.title.akas.csv')
df_ititlebas = pd.read_csv('data/imdb.title.basics.csv')
df_ititlecrew = pd.read_csv('data/imdb.title.crew.csv')
df_ititlepri = pd.read_csv('data/imdb.title.principals.csv')
df_ititlerate = pd.read_csv('data/imdb.title.ratings.csv')

In [None]:
df_ititlerate.head()

In [None]:
df_ititlebas.head()

In [None]:
df_working.head()

#### Merge basics with ratings on column name tconst

In [None]:
df_merged = pd.merge(df_ititlerate, df_ititlebas, on='tconst')

In [None]:
df_iname.head()

In [None]:
df_ititlepri.head()

In [None]:
df_imovtxt
df_ireviewtxt

In [None]:
df_imovtxt.columns

In [None]:
df_ireviewtxt.columns


In [None]:
df_rtmov.head()

In [None]:
df_rtmov.describe()