# Data Cleaning
### Loading the files and removing blank data

First we import the required libraries and adjust out display settings to more easily view the datasets.

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 30)
pd.set_option('display.max_rows', 50)

Read each dataset as a Pandas dataframe.

In [2]:
df_bom = pd.read_csv('data/bom.movie_gross.csv')

df_iname = pd.read_csv('data/imdb.name.basics.csv')
df_iakas = pd.read_csv('data/imdb.title.akas.csv')
df_ititlebas = pd.read_csv('data/imdb.title.basics.csv')
df_ititlecrew = pd.read_csv('data/imdb.title.crew.csv')
df_ititlepri = pd.read_csv('data/imdb.title.principals.csv')
df_ititlerate = pd.read_csv('data/imdb.title.ratings.csv')

df_imovtxt = pd.read_csv('data/rt.movie_info.tsv', delimiter='\t') # utf-8 is default
df_ireviewtxt = pd.read_csv('data/rt.reviews.tsv', sep='\t',  encoding='ISO-8859-1', low_memory=False)

df_rtmov = pd.read_csv('data/tmdb.movies.csv') # utf-8 is default
df_rt_budget = pd.read_csv('data/tn.movie_budgets.csv') # utf-8 is default


# df_copy = df.copy() # make a copy of the data in the event we clobber



In [3]:
df_studio_gross = df_bom.copy(deep="True")

In [4]:
df_studio_gross.drop(['title', 'year'], axis=1, inplace=True)


In [5]:
df_studio_gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 3 columns):
studio            3382 non-null object
domestic_gross    3359 non-null float64
foreign_gross     2037 non-null object
dtypes: float64(1), object(2)
memory usage: 79.5+ KB


### drop nulls and convert domestic_gross to integer

In [33]:
df_studio_gross['foreign_gross'].isnull().sum()


1350

In [34]:
df_studio_gross['foreign_gross'].dropna(inplace=True)

In [35]:
df_studio_gross['foreign_gross'].isnull().sum()

0

In [36]:
df_studio_gross.foreign_gross.astype('int32')

ValueError: invalid literal for int() with base 10: '1131.6'

In [39]:
df_studio_gross.foreign_gross

0       652000000
1       691300000
2       664300000
3       535700000
4       513900000
          ...    
3382          NaN
3383          NaN
3384          NaN
3385          NaN
3386          NaN
Name: foreign_gross, Length: 3387, dtype: object

In [27]:
df_studio_gross['domestic_gross'].isnull().sum()

28

In [28]:
df_studio_gross['domestic_gross'].dropna(inplace=True)

In [29]:
df_studio_gross['domestic_gross'].isnull().sum()

0

In [38]:
#df_studio_gross[df_studio_gross.columns[2]] = df_studio_gross[df_studio_gross.columns[2]].apply(lambda x: x.replace('$','')).apply(lambda x: x.replace(',','')).astype(np.int32)
def clean_currency(x):   
    """ Clean the currency markers
    """
    return(x.replace('$', '').replace(',', ''))
df_studio_gross['foreign_gross'] = df_studio_gross['foreign_gross'].apply(clean_currency)


    



In [30]:
df_studio_gross.domestic_gross.astype('int64')

0       415000000
1       334200000
2       296000000
3       292600000
4       238700000
          ...    
3382         6200
3383         4800
3384         2500
3385         2400
3386         1700
Name: domestic_gross, Length: 3359, dtype: int64

In [None]:
#df_studio_gross.foreign_gross.astype('int64')

In [31]:
type(df_studio_gross.foreign_gross).value_counts()

SyntaxError: invalid syntax (<ipython-input-31-5843d70a1ade>, line 1)

In [None]:
#df_studio_gross['total_gross'] = df_studio_gross['domestic_gross'] + df_studio_gross['foreign_gross']


In [None]:
df_studio_gross.info()

In [None]:
!pwd

In [None]:
df_rt_budget.columns

In [None]:
df_rt_budget.info()

In [None]:
df_rt_budget.isna().sum()

In [None]:
df_rt_budget.head()

In [None]:
df_working = df_rt_budget.copy(deep="True") # make a hard copy of the dataset to work with

In [None]:
df_working.head()

#### Clean the characters from the currency columns and convert to integer for calculations and graphing

In [None]:

df_working[df_working.columns[3:]] = df_working[df_working.columns[3:]].apply(lambda x: x.str.replace('$','')).apply(lambda x: x.str.replace(',','')).astype(np.int64)

In [None]:
df_working.head()

In [None]:
df_working.info()

In [None]:
df_working['foreign_gross'] = df_working['worldwide_gross'] - df_working['domestic_gross']

In [None]:
df_working.head()

In [None]:
df_working['PnL'] = df_working['worldwide_gross'] - df_working['production_budget']

In [None]:
df_working.head()

In [None]:
if df_working['PnL'] > 0:
    df_working['profit'] = 'True'
    elif:
        df_working['profit'] = 'False'

        

#### Save the dataset to a file that we will access in separate notebook

In [None]:
df_working.to_csv('data/tn.movie_budgets_working.csv')

In [None]:
df_iname = pd.read_csv('data/imdb.name.basics.csv')
df_iakas = pd.read_csv('data/imdb.title.akas.csv')
df_ititlebas = pd.read_csv('data/imdb.title.basics.csv')
df_ititlecrew = pd.read_csv('data/imdb.title.crew.csv')
df_ititlepri = pd.read_csv('data/imdb.title.principals.csv')
df_ititlerate = pd.read_csv('data/imdb.title.ratings.csv')

In [None]:
df_ititlerate.head()

In [None]:
df_ititlebas.head()

In [None]:
df_working.head()

#### Merge basics with ratings on column name tconst

In [None]:
df_merged = pd.merge(df_ititlerate, df_ititlebas, on='tconst')

In [None]:
df_iname.head()

In [None]:
df_ititlepri.head()

In [None]:
df_imovtxt
df_ireviewtxt

In [None]:
df_imovtxt.columns

In [None]:
df_ireviewtxt.columns


In [None]:
df_rtmov.head()

In [None]:
df_rtmov.describe()