# Data Cleaning

In [1]:
# Libraries (standard library first, then third-party)
import warnings

import numpy as np
import pandas as pd

# Notebook-wide settings: never truncate wide frames when displaying them,
# and silence every warning for the rest of the session.
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

I'm going to do a general cleaning of all the tables, which will go like so:
+ Drop `last_update` column.
+ Check if the DTypes are correct.

Some tables need specific cleaning, which will be explained where it happens.

After general cleaning, I'm going to:
1. Transform the data in `old_HDD` so that it contains the `film_id` and `actor_id`, as the relationship between the two tables will be many-to-many.
1. 'Transfer' the `category_id` column to `film` and substitute the id with the category name itself; there's no need for a separate table holding just the categories.

The `Language` table, as stated before in the `0-data_exploration` notebook, is going to be dropped, since there is no way to know which language each film is in without web scraping or access to another database.

## Inventory

In [2]:
# load the raw inventory table
df = pd.read_csv(filepath_or_buffer='../src/inventory.csv')

In [3]:
# last_update is not needed in the cleaned table
df = df.drop(columns='last_update')

In [4]:
# checking data types: prints each column's dtype and non-null count;
# memory_usage='deep' makes the reported memory figure inspect the actual values
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   inventory_id  1000 non-null   int64
 1   film_id       1000 non-null   int64
 2   store_id      1000 non-null   int64
dtypes: int64(3)
memory usage: 23.6 KB


Since we're only going to have one store, we're going to set `store_id` to 1 for every row. We keep the column because we don't know if Deli will open another store in the future.

In [5]:
# every inventory item now belongs to the single remaining store
df['store_id'] = 1

In [6]:
# persist the cleaned table, keyed by inventory_id, for later notebooks
df = df.set_index('inventory_id')
df.to_csv('../data/inventory.csv')

## Actor

In [7]:
# load the raw actor table
df = pd.read_csv(filepath_or_buffer='../src/actor.csv')

In [8]:
# last_update is not needed in the cleaned table
df = df.drop(columns='last_update')

In [9]:
# checking data types: prints each column's dtype and non-null count;
# memory_usage='deep' makes the reported memory figure inspect the actual strings
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   actor_id    200 non-null    int64 
 1   first_name  200 non-null    object
 2   last_name   200 non-null    object
dtypes: int64(1), object(2)
memory usage: 26.2 KB


In [10]:
# persist the cleaned table, keyed by actor_id, for later use in this notebook
df = df.set_index('actor_id')
df.to_csv('../data/actor.csv')

## Film

In [11]:
# load the raw film table
df = pd.read_csv(filepath_or_buffer='../src/film.csv')

In [12]:
# drop last_update, rental_rate, the duplicated language_id column and the
# all-null original_language_id (rental_duration is kept and renamed in the next cell)
unwanted_columns = ['last_update', 'rental_rate', 'language_id', 'original_language_id']
df.drop(columns=unwanted_columns, inplace=True)

In [13]:
# rental_duration is measured in days; make the column name say so
df = df.rename(columns={'rental_duration': 'rental_days'})

In [14]:
# checking data types: prints each column's dtype and non-null count;
# memory_usage='deep' makes the reported memory figure inspect the actual strings
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   film_id           1000 non-null   int64  
 1   title             1000 non-null   object 
 2   description       1000 non-null   object 
 3   release_year      1000 non-null   int64  
 4   rental_days       1000 non-null   int64  
 5   length            1000 non-null   int64  
 6   replacement_cost  1000 non-null   float64
 7   rating            1000 non-null   object 
 8   special_features  1000 non-null   object 
dtypes: float64(1), int64(4), object(4)
memory usage: 397.6 KB


In [15]:
# persist the cleaned film table, keyed by film_id; it is re-read further down
df = df.set_index('film_id')
df.to_csv('../data/film.csv')

## Linking Actor and FilmID

In [16]:
# load the recovered old_HDD table that links actors to films
df = pd.read_csv(filepath_or_buffer='../src/old_HDD.csv')

In [17]:
# inspect the columns, dtypes and non-null counts of the old_HDD table
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   first_name    1000 non-null   object
 1   last_name     1000 non-null   object
 2   title         1000 non-null   object
 3   release_year  1000 non-null   int64 
 4   category_id   1000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 39.2+ KB


In [18]:
# release_year is not needed for linking actors and films
df = df.drop(columns='release_year')

First, let's add the actor's id.

In [19]:
# re-read the cleaned actor table saved earlier in this notebook
actor = pd.read_csv(filepath_or_buffer='../data/actor.csv')

In [20]:
# Collapse first/last name into a single `name` column on both frames, so the
# actor table and the old_HDD rows can be matched on one key.
for frame in (actor, df):
    frame['name'] = frame['first_name'] + ' ' + frame['last_name']
    frame.drop(columns=['first_name', 'last_name'], inplace=True)

In [21]:
# preview the actor table: it should now hold actor_id and the combined name
actor.head()

Unnamed: 0,actor_id,name
0,1,PENELOPE GUINESS
1,2,NICK WAHLBERG
2,3,ED CHASE
3,4,JENNIFER DAVIS
4,5,JOHNNY LOLLOBRIGIDA


In [22]:
# Index by actor_id so converting the name column to a dict yields
# {actor_id: full name}.
actor = actor.set_index('actor_id')
id_and_actor = actor['name'].to_dict()

In [23]:
# Attach each row's actor_id by looking the actor's full name up in the actor table.
# `id_and_actor` maps actor_id -> name, so invert it once into name -> actor_id
# instead of scanning every (id, name) pair again for every row.
# setdefault keeps the first id seen for a repeated name, which is the same id the
# previous first-match scan returned.
actor_id_by_name = {}
for actor_id, actor_name in id_and_actor.items():
    actor_id_by_name.setdefault(actor_name, actor_id)

df['actor_id'] = df['name'].map(actor_id_by_name)

# Fail loudly, naming the offending values, rather than carrying NaN ids forward.
unmatched_names = df.loc[df['actor_id'].isna(), 'name'].unique()
if len(unmatched_names) > 0:
    raise KeyError(f'No actor_id found for: {list(unmatched_names)}')

In [24]:
# the name has been replaced by actor_id, so the text column can go
df = df.drop(columns='name')

Now, let's do the same with films.

In [25]:
# re-read the cleaned film table saved earlier in this notebook
film = pd.read_csv(filepath_or_buffer='../data/film.csv')

In [26]:
# only the id and the title are needed to resolve titles to film ids
lookup_columns = ['film_id', 'title']
film = film.loc[:, lookup_columns]

In [27]:
# Resolve every title in `df` to its film_id, mirroring the actor lookup.
# Rebinding (instead of set_index(..., inplace=True)) avoids mutating a frame that
# was produced by selecting columns from another frame.
film = film.set_index('film_id')
id_and_film = film['title'].to_dict()  # {film_id: title}

# Invert once into title -> film_id so each row is a single dict lookup rather than
# a scan over all films. setdefault keeps the first id for a repeated title, which
# is the same id the previous first-match scan returned.
film_id_by_title = {}
for film_id, film_title in id_and_film.items():
    film_id_by_title.setdefault(film_title, film_id)

df['film_id'] = df['title'].map(film_id_by_title)

# Fail loudly, naming the offending values, rather than carrying NaN ids forward.
unmatched_titles = df.loc[df['film_id'].isna(), 'title'].unique()
if len(unmatched_titles) > 0:
    raise KeyError(f'No film_id found for: {list(unmatched_titles)}')

# the title has been replaced by film_id, so the text column can go
df = df.drop(columns='title')

In [28]:
# check the df: only category_id, actor_id and film_id should remain
df.head()

Unnamed: 0,category_id,actor_id,film_id
0,6,1,1
1,2,1,23
2,13,1,25
3,10,1,106
4,14,1,140


Before saving this dataframe, I'm going to create a new column in films with the category.

## Creating category in film

In [29]:
# re-read the full cleaned film table (the previous `film` only kept the title)
film = pd.read_csv(filepath_or_buffer='../data/film.csv')

In [30]:
# load the raw category table and discard its last_update column in one go
category = pd.read_csv('../src/category.csv').drop(columns='last_update')

In [31]:
# preview the category lookup table (category_id and its name)
category.head()

Unnamed: 0,category_id,name
0,1,Action
1,2,Animation
2,3,Children
3,4,Classics
4,5,Comedy


In [32]:
# create a dictionary relating film_id and category_id
# film_id can repeat in `df` (one row per actor/film pair), so keep only the first
# row for each film. The chain builds new objects at every step instead of calling
# inplace methods on a column slice of `df`, which is what can raise pandas'
# SettingWithCopyWarning.
id_and_id = (
    df[['film_id', 'category_id']]
    .drop_duplicates('film_id')
    .sort_values('film_id')
    .set_index('film_id')['category_id']
    .to_dict()
)

In [33]:
# preview film before the category column is added
film.head()

Unnamed: 0,film_id,title,description,release_year,rental_days,length,replacement_cost,rating,special_features
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,86,20.99,PG,"Deleted Scenes,Behind the Scenes"
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,3,48,12.99,G,"Trailers,Deleted Scenes"
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,7,50,18.99,NC-17,"Trailers,Deleted Scenes"
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,5,117,26.99,G,"Commentaries,Behind the Scenes"
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,6,130,22.99,G,Deleted Scenes


In [34]:
# Turn the category table into a {category_id: category name} lookup.
category = category.set_index('category_id')
id_and_category = category['name'].to_dict()

# film_id -> category_id -> category name, chained in one expression.
# Films whose id is missing from `id_and_id` end up as NaN.
film['category'] = film['film_id'].map(id_and_id).map(id_and_category)

In [35]:
# preview film again: the new category column should hold category names
film.head()

Unnamed: 0,film_id,title,description,release_year,rental_days,length,replacement_cost,rating,special_features,category
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,6,86,20.99,PG,"Deleted Scenes,Behind the Scenes",Documentary
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,3,48,12.99,G,"Trailers,Deleted Scenes",Horror
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,7,50,18.99,NC-17,"Trailers,Deleted Scenes",Documentary
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,5,117,26.99,G,"Commentaries,Behind the Scenes",
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,6,130,22.99,G,Deleted Scenes,


In [36]:
# category_id has been copied over to film, so the link table no longer needs it
df = df.drop(columns='category_id')

In [37]:
# Check everything is ok: dtypes plus non-null counts, which show how many films
# were left without a category
film.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   film_id           1000 non-null   int64  
 1   title             1000 non-null   object 
 2   description       1000 non-null   object 
 3   release_year      1000 non-null   int64  
 4   rental_days       1000 non-null   int64  
 5   length            1000 non-null   int64  
 6   replacement_cost  1000 non-null   float64
 7   rating            1000 non-null   object 
 8   special_features  1000 non-null   object 
 9   category          614 non-null    object 
dtypes: float64(1), int64(4), object(5)
memory usage: 78.3+ KB


Not all movies were categorized correctly, so some null values were created. Dropping those rows wouldn't make sense, as the category is not that important for inventory purposes, so we're going to replace every `NaN` with `unknown`.

In [38]:
# Only `category` is meant to receive the 'unknown' placeholder. Filling that
# column alone means a missing value appearing in any other column stays visible
# instead of being silently replaced by the string 'unknown'.
film['category'] = film['category'].fillna('unknown')

In [39]:
# confirm the link table now holds only actor_id and film_id, and check for nulls
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   actor_id  1000 non-null   int64
 1   film_id   1000 non-null   int64
dtypes: int64(2)
memory usage: 15.8 KB


In [40]:
# Neither frame has nulls or odd values left, so both can be written out.
# film is keyed by film_id and overwrites the earlier cleaned file.
film = film.set_index('film_id')
film.to_csv('../data/film.csv')

# The actor/film link table is written with actor_id as its index column.
df = df.set_index('actor_id')
df.to_csv('../data/actor_film.csv')