# Data Cleaning
### Loading the files and removing blank data

First we import the required libraries and adjust our display settings to make the datasets easier to view.

In [1]:
import pandas as pd
import numpy as np

# Notebook-wide pandas display preferences: cap the number of columns/rows
# rendered and print floats without decimal places.
for _option, _value in (
    ('display.max_columns', 30),
    ('display.max_rows', 50),
    ('display.float_format', '{:.0f}'.format),
):
    pd.set_option(_option, _value)

Read each dataset as a Pandas dataframe.

In [2]:
# --- bom.* file -------------------------------------------------------------
df_bom = pd.read_csv('data/bom.movie_gross.csv')

# --- imdb.* files -----------------------------------------------------------
df_iname = pd.read_csv('data/imdb.name.basics.csv')
df_iakas = pd.read_csv('data/imdb.title.akas.csv')
df_ititlebas = pd.read_csv('data/imdb.title.basics.csv')
df_ititlecrew = pd.read_csv('data/imdb.title.crew.csv')
df_ititlepri = pd.read_csv('data/imdb.title.principals.csv')
df_ititlerate = pd.read_csv('data/imdb.title.ratings.csv')

# --- rt.* files (tab-separated) ---------------------------------------------
# The movie-info file is read with the default encoding (utf-8); the reviews
# file is read as ISO-8859-1.
df_imovtxt = pd.read_table('data/rt.movie_info.tsv')
df_ireviewtxt = pd.read_table('data/rt.reviews.tsv', encoding='ISO-8859-1', low_memory=False)

# --- tmdb.* / tn.* files (default utf-8 encoding) ---------------------------
# Naming caveat: despite the "rt" in their names, `df_rtmov` holds
# tmdb.movies.csv and `df_rt_budget` holds tn.movie_budgets.csv.
df_rtmov = pd.read_csv('data/tmdb.movies.csv')
df_rt_budget = pd.read_csv('data/tn.movie_budgets.csv')



In [3]:
# Eyeball the studio column in alphabetical order (ascending is the default);
# this only displays a sorted view and leaves df_bom untouched.
df_bom.sort_values(by='studio')

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
155,Sea Rex 3D: Journey to a Prehistoric World,3D,6100000,9900000,2010
1457,Red Obsession,A23,13200,,2013
670,Revenge of the Electric Car,A23,151000,,2011
1693,Obvious Child,A24,3100000,,2014
1717,Tusk,A24,1800000,,2014
...,...,...,...,...,...
210,Outside the Law (Hors-la-loi),,96900,3300000,2010
555,Fireflies in the Garden,,70600,3300000,2011
933,Keith Lemon: The Film,,,4000000,2012
1862,Plot for Peace,,7100,,2014


In [None]:
# Preview the rows for 2018. A boolean mask is required here:
# `df_bom.loc['year']` would look for a *row* labelled 'year' and raise a
# KeyError on the integer index. This is a display-only peek; it does not
# filter the frame built below.
df_bom.loc[df_bom['year'] == 2018].head()

# Studio-level working frame: everything except the per-title columns.
# `drop` returns a new DataFrame, so df_bom itself is left untouched and this
# statement can be re-run safely.
df_studio_gross = df_bom.drop(columns=['title', 'year'])


df_studio_gross.info()

### Handle nulls, convert foreign_gross to float, and compute total_gross

In [None]:
# Clean the gross columns in one non-mutating chain:
#   1. drop rows that have no domestic_gross (done on the frame itself --
#      calling dropna(inplace=True) on a selected column only affects a
#      temporary Series and leaves the rows in place);
#   2. treat a missing foreign_gross as 0 and convert the column to float;
#   3. add total_gross = domestic_gross + foreign_gross.
# NOTE(review): astype(float) will raise if any foreign_gross value contains
# thousands separators -- confirm against the raw file.
df_studio_gross = (
    df_studio_gross
    .dropna(subset=['domestic_gross'])
    .assign(foreign_gross=lambda d: d['foreign_gross'].fillna(0).astype(float))
    .assign(total_gross=lambda d: d['domestic_gross'] + d['foreign_gross'])
)

# Sanity check: neither gross column may contain nulls after cleaning.
assert not df_studio_gross[['domestic_gross', 'foreign_gross']].isnull().any().any()

# info() prints dtypes and non-null counts; head() is the cell's displayed value.
df_studio_gross.info()
df_studio_gross.head()

### Save a copy of cleaned data for graphing in other notebooks

In [None]:
# Persist the cleaned studio grosses (without the index column).
df_studio_gross.to_csv('data/studio_gross.csv', index=False)

# Independent working copy of the budgets data so the raw frame stays intact.
# `deep` expects a bool and defaults to True; the previous string "True" only
# worked because any non-empty string is truthy.
df_working = df_rt_budget.copy()

df_working.head()

#### Strip the `$` and `,` characters from the currency columns and convert them to integers for calculations and graphing

In [None]:

# Every column from position 3 onward of the raw budgets frame is cleaned:
# strip '$' and ',' and convert to int64.
# NOTE(review): assumes all of those columns are currency strings -- confirm.
# The values are read from the raw frame (df_rt_budget) rather than from
# df_working so this cell can be re-run after the derived columns below have
# been added. `regex=False` makes '$' a literal character regardless of the
# pandas version's default for `regex`.
currency_cols = df_rt_budget.columns[3:]
df_working[currency_cols] = (
    df_rt_budget[currency_cols]
    .apply(lambda col: col.str.replace('$', '', regex=False)
                          .str.replace(',', '', regex=False))
    .astype(np.int64)
)

df_working.info()

# Derived columns.
df_working['foreign_gross'] = df_working['worldwide_gross'] - df_working['domestic_gross']
df_working['PnL'] = df_working['worldwide_gross'] - df_working['production_budget']

# Row-wise profit flag. A plain `if` cannot be applied to a whole Series (its
# truth value is ambiguous), so the comparison is vectorised instead.
df_working['profit'] = df_working['PnL'] > 0

df_working.head()

#### Save the dataset to a file that we will access in a separate notebook

In [None]:
# Persist the working budgets frame for the downstream notebook.
# NOTE(review): unlike the studio_gross export, this call does not pass
# index=False, so the index is written as an extra leading column. Left as-is
# because the consuming notebook may rely on it -- confirm before changing.
df_working.to_csv('data/tn.movie_budgets_working.csv')

# The imdb.* frames (df_iname, df_iakas, df_ititlebas, df_ititlecrew,
# df_ititlepri, df_ititlerate) are already loaded near the top of the notebook
# and are not modified in between, so they are not read from disk again here.

df_ititlerate.head()

df_ititlebas.head()

df_working.head()

#### Merge basics with ratings on column name tconst

In [None]:
# Join ratings (left) with title basics (right) on the shared `tconst` key,
# using the default join type.
df_merged = df_ititlerate.merge(df_ititlebas, on='tconst')

In [None]:
# First five rows of the imdb.name.basics data.
df_iname.head(n=5)

In [None]:
# Quick looks at the remaining frames. Each preview is limited to the first
# rows instead of rendering a whole DataFrame.
df_ititlepri.head()

df_imovtxt.head()
df_ireviewtxt.head()

df_imovtxt.columns

df_ireviewtxt.columns

df_rtmov.head()

# Summary statistics for the numeric columns of the tmdb.movies data.
df_rtmov.describe()