# PROCESO DE ETL

## Extracción de datos

In [1]:
#Importamos las librerias necesarias

import pandas as pd
import numpy as np

In [2]:
# Load each platform's CSV into its own DataFrame for later processing.
# read_csv uses a comma separator by default, so it is not passed explicitly.

amazon, disney, hulu, netflix = (
    pd.read_csv(csv_path)
    for csv_path in (
        './MLOpsReviews/amazon_prime_titles.csv',
        './MLOpsReviews/disney_plus_titles.csv',
        './MLOpsReviews/hulu_titles.csv',
        './MLOpsReviews/netflix_titles.csv',
    )
)

### Visualización de los datasets

In [4]:
# Preview the first row of the Amazon frame to inspect its columns.
amazon.head(n=1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,The Grand Seduction,Don McKellar,"Brendan Gleeson, Taylor Kitsch, Gordon Pinsent",Canada,"March 30, 2021",2014,,113 min,"Comedy, Drama",A small fishing village must procure a local d...


In [5]:
# Preview the first row of the Disney frame to inspect its columns.
disney.head(n=1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Duck the Halls: A Mickey Mouse Christmas Special,"Alonso Ramirez Ramos, Dave Wasson","Chris Diamantopoulos, Tony Anselmo, Tress MacN...",,"November 26, 2021",2016,TV-G,23 min,"Animation, Family",Join Mickey and the gang as they duck the halls!


In [6]:
# Preview the first row of the Hulu frame to inspect its columns.
hulu.head(n=1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Ricky Velez: Here's Everything,,,,"October 24, 2021",2021,TV-MA,,"Comedy, Stand Up",​Comedian Ricky Velez bares it all with his ho...


In [7]:
# Preview the first row of the Netflix frame to inspect its columns.
netflix.head(n=1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."


## Transformaciones

### Generamos la nueva columna 'id'

In [3]:
# Build the new id: a one-letter platform prefix followed by the original
# show_id (for example 'a' + 's1' -> 'as1').

for prefix, titles in (('a', amazon), ('d', disney), ('h', hulu), ('n', netflix)):
    titles['id'] = prefix + titles['show_id']
    # A notebook cell only renders its last expression, so bare expressions
    # for the first three frames would verify nothing. Check every frame
    # explicitly instead: each id must exist and carry its platform prefix.
    assert titles['id'].str.startswith(prefix, na=False).all(), (
        f"missing or unexpected 'id' values for prefix {prefix!r}"
    )

# Show one of the results as a visual sample.
netflix['id']

0          ns1
1          ns2
2          ns3
3          ns4
4          ns5
         ...  
8802    ns8803
8803    ns8804
8804    ns8805
8805    ns8806
8806    ns8807
Name: id, Length: 8807, dtype: object

In [4]:
# Replace null values in the 'rating' column with 'G' (general for all audiences).
#
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the df['rating'] selection. That chained in-place
# form emits a FutureWarning on pandas 2.2 and, with Copy-on-Write enabled,
# modifies a temporary object and leaves the DataFrame unchanged.

for titles in (amazon, disney, hulu, netflix):
    titles['rating'] = titles['rating'].fillna('G')

In [5]:
# Confirm that the null ratings were filled in every frame.
# A notebook cell only renders its last expression, so the frames are checked
# with an assertion rather than with bare expressions that would be discarded.

for titles in (amazon, disney, hulu, netflix):
    assert titles['rating'].notna().all(), "'rating' still contains null values"

# Show one of the results as a visual sample.
netflix['rating']

0       PG-13
1       TV-MA
2       TV-MA
3       TV-MA
4       TV-MA
        ...  
8802        R
8803    TV-Y7
8804        R
8805       PG
8806    TV-14
Name: rating, Length: 8807, dtype: object

### Estandarización del formato de fecha a AAAA-MM-DD

In [6]:
# Parse 'date_added' into datetime64 values, which pandas renders as YYYY-MM-DD.
#
# Surrounding whitespace is stripped before parsing: pandas >= 2.0 infers one
# format from the first value and applies it strictly to the rest, so a padded
# value such as ' March 30, 2021' would raise instead of being parsed.
# NOTE(review): assumes 'date_added' holds text in every file — confirm.

for titles in (amazon, disney, hulu, netflix):
    titles['date_added'] = pd.to_datetime(titles['date_added'].str.strip())

In [7]:
# Confirm that 'date_added' was converted to a datetime dtype in every frame.
# A notebook cell only renders its last expression, so the frames are checked
# with an assertion rather than with bare expressions that would be discarded.

for titles in (amazon, disney, hulu, netflix):
    assert pd.api.types.is_datetime64_any_dtype(titles['date_added']), (
        "'date_added' was not converted to datetime"
    )

# Show one of the results as a visual sample.
netflix['date_added']

0      2021-09-25
1      2021-09-24
2      2021-09-24
3      2021-09-24
4      2021-09-24
          ...    
8802   2019-11-20
8803   2019-07-01
8804   2019-11-01
8805   2020-01-11
8806   2019-03-02
Name: date_added, Length: 8807, dtype: datetime64[ns]

In [8]:
# Convert the text fields to lower case.

def _lowercase_text_columns(frame):
    """Return a new frame in which every object-dtype column is lower-cased.

    Columns of any other dtype are passed through unchanged.
    """
    return frame.apply(
        lambda column: column.str.lower() if column.dtype == "object" else column
    )


amazon = _lowercase_text_columns(amazon)
disney = _lowercase_text_columns(disney)
hulu = _lowercase_text_columns(hulu)
netflix = _lowercase_text_columns(netflix)

### División del campo duration en 'duration_int' y 'duration_type'

In [9]:
def split_duration(titles):
    """Add 'duration_int' and 'duration_type' columns derived from 'duration'.

    'duration' is expected to look like '<number> <unit>' (e.g. '90 min',
    '2 seasons'). The text is split on the first run of whitespace only, so a
    multi-word unit stays together in 'duration_type' instead of producing a
    third column and breaking the two-column assignment.

    Parameters
    ----------
    titles : pandas.DataFrame
        Frame with a text 'duration' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with two extra columns:
        'duration_int'  - nullable integer (Int64); <NA> where 'duration' is
                          missing or does not start with a number.
        'duration_type' - the unit text; missing where there is none.
    """
    parts = (
        titles['duration']
        .str.strip()
        .str.split(n=1, expand=True)
        # Guarantee exactly two columns even when no value contains a unit.
        .reindex(columns=[0, 1])
    )
    # Int64 (capital I) is pandas' nullable integer dtype: it keeps the values
    # integral while still allowing missing durations.
    titles['duration_int'] = pd.to_numeric(parts[0], errors='coerce').astype('Int64')
    titles['duration_type'] = parts[1]
    return titles


# The former str.replace(r'', '') step replaced nothing with nothing and has
# been dropped.
for titles in (amazon, disney, hulu, netflix):
    split_duration(titles)

In [10]:
# Preview the first Netflix row to inspect the frame after the transformations.
netflix.head(n=1)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,id,duration_int,duration_type
0,s1,movie,dick johnson is dead,kirsten johnson,,united states,2021-09-25,2020,pg-13,90 min,documentaries,"as her father nears the end of his life, filmm...",ns1,90,min


### Reorganizamos los datasets para mejorar su lectura 

In [11]:
# Target column order: the generated 'id' first, then the source columns,
# then the two columns derived from 'duration'.
columns_order = [
    'id', 'show_id', 'type', 'title', 'director', 'cast', 'country',
    'date_added', 'release_year', 'rating', 'duration', 'listed_in',
    'description', 'duration_int', 'duration_type',
]

# Store the reordered frames. Selecting the columns without assigning the
# result only displays a reordered view and leaves every dataset in its
# previous order. .copy() gives each name an independent frame, so later
# column assignments do not trigger SettingWithCopyWarning.
amazon = amazon[columns_order].copy()
disney = disney[columns_order].copy()
hulu = hulu[columns_order].copy()
netflix = netflix[columns_order].copy()

# Show one of the reordered frames as a visual sample.
netflix

Unnamed: 0,id,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,duration_int,duration_type
0,ns1,s1,movie,dick johnson is dead,kirsten johnson,,united states,2021-09-25,2020,pg-13,90 min,documentaries,"as her father nears the end of his life, filmm...",90,min
1,ns2,s2,tv show,blood & water,,"ama qamata, khosi ngema, gail mabalane, thaban...",south africa,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, tv dramas, tv mysteries","after crossing paths at a party, a cape town t...",2,seasons
2,ns3,s3,tv show,ganglands,julien leclercq,"sami bouajila, tracy gotoas, samuel jouy, nabi...",,2021-09-24,2021,tv-ma,1 season,"crime tv shows, international tv shows, tv act...",to protect his family from a powerful drug lor...,1,season
3,ns4,s4,tv show,jailbirds new orleans,,,,2021-09-24,2021,tv-ma,1 season,"docuseries, reality tv","feuds, flirtations and toilet talk go down amo...",1,season
4,ns5,s5,tv show,kota factory,,"mayur more, jitendra kumar, ranjan raj, alam k...",india,2021-09-24,2021,tv-ma,2 seasons,"international tv shows, romantic tv shows, tv ...",in a city of coaching centers known to train i...,2,seasons
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,ns8803,s8803,movie,zodiac,david fincher,"mark ruffalo, jake gyllenhaal, robert downey j...",united states,2019-11-20,2007,r,158 min,"cult movies, dramas, thrillers","a political cartoonist, a crime reporter and a...",158,min
8803,ns8804,s8804,tv show,zombie dumb,,,,2019-07-01,2018,tv-y7,2 seasons,"kids' tv, korean tv shows, tv comedies","while living alone in a spooky town, a young g...",2,seasons
8804,ns8805,s8805,movie,zombieland,ruben fleischer,"jesse eisenberg, woody harrelson, emma stone, ...",united states,2019-11-01,2009,r,88 min,"comedies, horror movies",looking to survive in a world taken over by zo...,88,min
8805,ns8806,s8806,movie,zoom,peter hewitt,"tim allen, courteney cox, chevy chase, kate ma...",united states,2020-01-11,2006,pg,88 min,"children & family movies, comedies","dragged from civilian life, a former superhero...",88,min
