In [1]:
# !pip install fuzzywuzzy

In [2]:
import pandas as pd

from collections import Counter
from fuzzywuzzy import fuzz
import pickle



# Cargamos los datos

In [3]:
# cargamos el csv donde tenemos todos los títulos que tenemos en Netflix
df_titulos = pd.read_csv('data/netflix_titles.csv',index_col=0)
df_titulos.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
# cargamos el csv donde tenemos todas las producciones de Netflix
df_originals = pd.read_csv('data/netflix_originals.csv',index_col=0)
df_originals.head() 

Unnamed: 0,Title,Genre,Premiere,Runtime,IMDB Score,Language
0,Enter the Anime,Documentary,"August 5, 2019",58,2.5,English/Japanese
1,Dark Forces,Thriller,"August 21, 2020",81,2.6,Spanish
2,The App,Science fiction/Drama,"December 26, 2019",79,2.6,Italian
3,The Open House,Horror thriller,"January 19, 2018",94,3.2,English
4,Kaali Khuhi,Mystery,"October 30, 2020",90,3.4,Hindi


In [5]:
# miramos la forma del dataframe
df_titulos.shape


(8807, 12)

In [6]:
# miramos la forma del dataframe
df_originals.shape

(513, 6)

## Juntamos los dos dataframes

In [7]:
# juntamos los dos dataframes
df = df_titulos.merge(df_originals, left_on='title', right_on='Title', how='inner')
df.sample(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,Title,Genre,Premiere,Runtime,IMDB Score,Language
312,s3967,Movie,The Highwaymen,John Lee Hancock,"Kevin Costner, Woody Harrelson, Kathy Bates, J...",United States,"March 29, 2019",2019,R,132 min,Dramas,Two steely former Texas Rangers are tasked wit...,The Highwaymen,Crime drama,"March 29, 2019",131,6.9,English
414,s5060,Movie,A Futile and Stupid Gesture,David Wain,"Will Forte, Domhnall Gleeson, Martin Mull, Joe...",United States,"January 26, 2018",2018,TV-MA,,Comedies,"In a brief life full of triumph and failure, ""...",A Futile and Stupid Gesture,Biographical/Comedy,"January 26, 2018",101,6.8,English
25,s1106,Movie,Concrete Cowboy,Ricky Staub,"Idris Elba, Caleb McLaughlin, Jharrel Jerome, ...","United Kingdom, United States","April 2, 2021",2021,R,112 min,"Dramas, Independent Movies",Sent to live with his estranged father for the...,Concrete Cowboy,Drama,"April 2, 2021",111,6.3,English
102,s1786,Movie,Holidate,John Whitesell,"Emma Roberts, Luke Bracey, Kristin Chenoweth, ...",United States,"October 28, 2020",2020,TV-MA,,"Comedies, Romantic Movies","Fed up with being single on holidays, two stra...",Holidate,Romantic comedy/Holiday,"October 28, 2020",104,6.1,English
382,s4818,Movie,"To Each, Her Own",Myriam Aziza,"Sarah Stern, Jean-Christophe Folly, Julia Piat...",France,"June 24, 2018",2018,TV-MA,,"Comedies, International Movies, LGBTQ Movies",Just as Simone works up the courage to tell he...,"To Each, Her Own",Romantic comedy,"June 24, 2018",95,5.3,French


In [8]:
# Eliminamos aquellas columnas fuera de nuestro interés 
df.drop(['release_year', 'Runtime', 'description','Title', "show_id"], axis = 1, inplace=True)
df.head(2)

Unnamed: 0,type,title,director,cast,country,date_added,rating,duration,listed_in,Genre,Premiere,IMDB Score,Language
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",7.5,English
1,Movie,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82 min,Action & Adventure,Action,"April 24, 2020",6.7,English


Tenemos columnas en mayúsculas y otras en minúsculas, y con espacios. Vamos a intentar homogeneizarlo:

In [9]:
columnas_nuevas = {col: col.lower().replace(' ','_') for col in df.columns}

columnas_nuevas

{'type': 'type',
 'title': 'title',
 'director': 'director',
 'cast': 'cast',
 'country': 'country',
 'date_added': 'date_added',
 'rating': 'rating',
 'duration': 'duration',
 'listed_in': 'listed_in',
 'Genre': 'genre',
 'Premiere': 'premiere',
 'IMDB Score': 'imdb_score',
 'Language': 'language'}

In [10]:
df.rename(columns=columnas_nuevas, inplace=True)
df.head()

Unnamed: 0,type,title,director,cast,country,date_added,rating,duration,listed_in,genre,premiere,imdb_score,language
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",7.5,English
1,Movie,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82 min,Action & Adventure,Action,"April 24, 2020",6.7,English
2,Movie,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",,"June 30, 2021",TV-MA,,"Dramas, International Movies, Thrillers",Thriller,"April 14, 2021",5.7,Polish
3,Movie,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,,"Children & Family Movies, Dramas, Faith & Spir...",Drama,"May 27, 2021",6.7,English
4,Movie,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,118 min,"Horror Movies, International Movies, Thrillers",Horror,"May 26, 2021",5.2,Thai


# Análisis exploratorio

In [11]:
# Número de filas  y columnas del dataframe
df.shape

(513, 13)

In [12]:
# Información básica de cada una de las columnas del df
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 513 entries, 0 to 512
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   type        513 non-null    object 
 1   title       513 non-null    object 
 2   director    491 non-null    object 
 3   cast        422 non-null    object 
 4   country     499 non-null    object 
 5   date_added  513 non-null    object 
 6   rating      513 non-null    object 
 7   duration    276 non-null    object 
 8   listed_in   513 non-null    object 
 9   genre       513 non-null    object 
 10  premiere    513 non-null    object 
 11  imdb_score  513 non-null    float64
 12  language    513 non-null    object 
dtypes: float64(1), object(12)
memory usage: 56.1+ KB


In [13]:
# Contamos el número de filas duplicadas del dataframe
df.duplicated().sum()

0

In [14]:
# contamos el número de valores nulos de cada columna
df.isnull().sum()

type            0
title           0
director       22
cast           91
country        14
date_added      0
rating          0
duration      237
listed_in       0
genre           0
premiere        0
imdb_score      0
language        0
dtype: int64

In [15]:
# Contamos el porcentaje de valores nulos de cada columna
df.isnull().sum()/df.shape[0]

type          0.000000
title         0.000000
director      0.042885
cast          0.177388
country       0.027290
date_added    0.000000
rating        0.000000
duration      0.461988
listed_in     0.000000
genre         0.000000
premiere      0.000000
imdb_score    0.000000
language      0.000000
dtype: float64

In [16]:
# Principales estadísticos de las columnas numéricas
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
imdb_score,513.0,6.210916,0.96885,2.5,5.6,6.3,6.9,9.0


In [17]:
# Principales estadísticos de las columnas categóricas
df.describe(include='object').T

Unnamed: 0,count,unique,top,freq
type,513,1,Movie,513
title,513,513,Dick Johnson Is Dead,1
director,491,462,McG,3
cast,422,421,Shawn Mendes,2
country,499,81,United States,274
date_added,513,354,"October 18, 2019",5
rating,513,10,TV-MA,254
duration,276,86,98 min,20
listed_in,513,103,Documentaries,69
genre,513,106,Documentary,132


Vemos que para la columna de `type` solo tenemos un tipo, ¿merece la pena entonces mantener esta columna? La verdad es que no, así que vamos a eliminarla

In [18]:
df.drop(['type'],axis=1,inplace=True)
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,duration,listed_in,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",7.5,English
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82 min,Action & Adventure,Action,"April 24, 2020",6.7,English
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",,"June 30, 2021",TV-MA,,"Dramas, International Movies, Thrillers",Thriller,"April 14, 2021",5.7,Polish
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,,"Children & Family Movies, Dramas, Faith & Spir...",Drama,"May 27, 2021",6.7,English
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,118 min,"Horror Movies, International Movies, Thrillers",Horror,"May 26, 2021",5.2,Thai


Dentro del análisis exploratorio es importante conocer todas las variables, con las categóricas es importante ver todas las categorías que tenemos para cada una de ellas y cuáles son sus frecuencias. 

In [19]:
# creemos un dataframe solo con las variables categóricas
df_cat = df.select_dtypes(include='object')  # select_dtypes nos permite seleccionar las columnas de un tipo de dato en concreto
df_cat.head()

Unnamed: 0,title,director,cast,country,date_added,rating,duration,listed_in,genre,premiere,language
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",English
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82 min,Action & Adventure,Action,"April 24, 2020",English
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",,"June 30, 2021",TV-MA,,"Dramas, International Movies, Thrillers",Thriller,"April 14, 2021",Polish
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,,"Children & Family Movies, Dramas, Faith & Spir...",Drama,"May 27, 2021",English
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,118 min,"Horror Movies, International Movies, Thrillers",Horror,"May 26, 2021",Thai


Si bien es cierto que hemos sacado todas la categóricas, ¿tiene sentido explorarlas todas? Podríamos pensar que variables como los títulos de cada peli no nos interesa, porque tienen demasiadas categorías. En este caso podremos eliminarlas de nuestro df_cat

In [20]:
# eliminamos la columna title
df_cat.drop(['title'],axis=1,inplace=True)
df_cat.head()

Unnamed: 0,director,cast,country,date_added,rating,duration,listed_in,genre,premiere,language
0,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentaries,Documentary,"October 2, 2020",English
1,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82 min,Action & Adventure,Action,"April 24, 2020",English
2,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",,"June 30, 2021",TV-MA,,"Dramas, International Movies, Thrillers",Thriller,"April 14, 2021",Polish
3,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,,"Children & Family Movies, Dramas, Faith & Spir...",Drama,"May 27, 2021",English
4,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,118 min,"Horror Movies, International Movies, Thrillers",Horror,"May 26, 2021",Thai


In [21]:
# analizamos los valores unicos para cada una de las columnas categóricas
for columna in df_cat.columns:
    print(f"la cantidad de valores únicos para la columna {columna.upper()} son {len(df_cat[columna].unique())} y estos valores son")
    (display(pd.DataFrame(df_cat[columna].value_counts())))
    print("---------------------------------------------")

la cantidad de valores únicos para la columna DIRECTOR son 463 y estos valores son


Unnamed: 0,director
McG,3
Amy Poehler,2
Christopher Guest,2
Noah Baumbach,2
Michael Tiddes,2
...,...
Gina Prince-Bythewood,1
Sue Ding,1
Stéphane de Freitas,1
Stefano Mordini,1


---------------------------------------------
la cantidad de valores únicos para la columna CAST son 422 y estos valores son


Unnamed: 0,cast
Shawn Mendes,2
"Bruce Willis, Kellan Lutz, Gina Carano, D.B. Sweeney, Joshua Mikel, Steve Coulter, Dan Bilzerian, Heather Johansen",1
Loudon Wainwright III,1
"Liam Neeson, James Franco, Tim Blake Nelson, Tom Waits, Zoe Kazan, Brendan Gleeson",1
"Madeline Brewer, Patch Darragh, Melora Walters, Devin Druid, Imani Hakim, Michael Dempsey, Flora Diaz, Samantha Robinson, Jessica Parker Kennedy, Quei Tann",1
...,...
"Marta Etura, Leonardo Sbaraglia, Carlos Librado ""Nene"", Francesc Orella, Imanol Arias, Álvaro Cervantes, Itziar Aizpuru, Benn Northover, Marta Larralde, Alicia Sánchez, Eduardo Rosa, Angel Alkain, Ana Wagener, Paco Tous, Patricia López Arnaiz, Pedro Casablanc",1
"Fulu Mugovhani, Tumi Morake, Bohang Moeko, Yonda Thomas",1
"Nawazuddin Siddiqui, Radhika Apte, Khalid Tyabji, Aditya Srivastava, Padmavati Rao, Shivani Raghuvanshi, Nishant Dahiya, Shweta Tripathi, Gyanendra Tripathi, Shreedhar Dubey, Swanand Kirkire, Riya Shukla, Tigmanshu Dhulia, Ila Arun, Natasha Rastogi",1
Nicolas Anelka,1


---------------------------------------------
la cantidad de valores únicos para la columna COUNTRY son 82 y estos valores son


Unnamed: 0,country
United States,274
India,35
Italy,13
United Kingdom,13
Spain,12
...,...
"Philippines, United States",1
"United Kingdom, Japan, United States",1
"Spain, United Kingdom",1
"United Kingdom, Hungary, Australia",1


---------------------------------------------
la cantidad de valores únicos para la columna DATE_ADDED son 354 y estos valores son


Unnamed: 0,date_added
"October 18, 2019",5
"November 1, 2019",5
"October 30, 2020",4
"April 10, 2020",4
"February 5, 2021",4
...,...
"February 12, 2020",1
"February 14, 2020",1
"March 8, 2020",1
"March 13, 2020",1


---------------------------------------------
la cantidad de valores únicos para la columna RATING son 10 y estos valores son


Unnamed: 0,rating
TV-MA,254
TV-14,94
TV-PG,57
R,47
PG-13,23
TV-G,16
PG,11
TV-Y7,5
TV-Y,5
G,1


---------------------------------------------
la cantidad de valores únicos para la columna DURATION son 87 y estos valores son


Unnamed: 0,duration
98 min,20
90 min,13
107 min,12
113 min,12
104 min,11
...,...
111 min,1
148 min,1
145 min,1
129 min,1


---------------------------------------------
la cantidad de valores únicos para la columna LISTED_IN son 103 y estos valores son


Unnamed: 0,listed_in
Documentaries,69
Dramas,22
"Dramas, International Movies",21
"Documentaries, Music & Musicals",20
Comedies,19
...,...
"Independent Movies, International Movies, Thrillers",1
"Comedies, Dramas, LGBTQ Movies",1
Horror Movies,1
"Children & Family Movies, Comedies, Sci-Fi & Fantasy",1


---------------------------------------------
la cantidad de valores únicos para la columna GENRE son 106 y estos valores son


Unnamed: 0,genre
Documentary,132
Drama,73
Comedy,42
Romantic comedy,35
Thriller,33
...,...
Superhero/Action,1
Dance comedy,1
Animation/Superhero,1
Drama/Horror,1


---------------------------------------------
la cantidad de valores únicos para la columna PREMIERE son 350 y estos valores son


Unnamed: 0,premiere
"October 2, 2020",5
"November 1, 2019",5
"October 18, 2019",5
"April 10, 2020",4
"December 7, 2018",4
...,...
"January 17, 2020",1
"January 1, 2020",1
"December 26, 2019",1
"December 20, 2019",1


---------------------------------------------
la cantidad de valores únicos para la columna LANGUAGE son 37 y estos valores son


Unnamed: 0,language
English,352
Hindi,28
Spanish,26
French,18
Italian,14
Indonesian,9
Portuguese,9
English/Spanish,5
Japanese,5
Korean,5


---------------------------------------------


Al ver todos los valores únicos podemos ver que las columnas de `lister_in` y `genre` nos dan la misma información, así que procedemos a eliminar una de ellas: 

In [22]:
df =  df.drop(['listed_in'],axis=1)
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentary,"October 2, 2020",7.5,English
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82 min,Action,"April 24, 2020",6.7,English
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",,"June 30, 2021",TV-MA,,Thriller,"April 14, 2021",5.7,Polish
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,,Drama,"May 27, 2021",6.7,English
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,118 min,Horror,"May 26, 2021",5.2,Thai


## Limpieza `director`, `cast`, `country`


In [23]:
# Revisamos con ejemplo los filtros
df[df['genre']=='Documentary'].head()

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",PG-13,90 min,Documentary,"October 2, 2020",7.5,English
6,Nail Bomber: Manhunt,Daniel Vernon,,,"May 26, 2021",TV-MA,73 min,Documentary,"May 26, 2021",6.3,English
21,Why Did You Kill Me?,Fredrick Munk,,United States,"April 14, 2021",TV-MA,84 min,Documentary,"April 14, 2021",5.6,English
24,Dolly Parton: A MusiCares Tribute,,Dolly Parton,,"April 7, 2021",TV-PG,55 min,Documentary,"April 7, 2021",6.5,English
33,Seaspiracy,Ali Tabrizi,,United States,"March 24, 2021",TV-14,90 min,Documentary,"March 24, 2021",8.2,English


Estas columnas eran las que tenían valores nulos

In [24]:
df.isnull().sum()[df.isnull().sum() > 0]

director     22
cast         91
country      14
duration    237
dtype: int64

In [25]:
# eliminamos los valores nulos de las columnas "director", "cast", "country" por Unknown
df[["director", "cast", "country"]] = df[["director", "cast", "country"]].fillna("Unknown")

In [26]:
df.head()

Unnamed: 0,title,director,cast,country,date_added,rating,duration,genre,premiere,imdb_score,language
0,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",PG-13,90 min,Documentary,"October 2, 2020",7.5,English
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada","September 1, 2021",R,82 min,Action,"April 24, 2020",6.7,English
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Unknown,"June 30, 2021",TV-MA,,Thriller,"April 14, 2021",5.7,Polish
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,"May 27, 2021",TV-PG,,Drama,"May 27, 2021",6.7,English
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,"May 27, 2021",TV-MA,118 min,Horror,"May 26, 2021",5.2,Thai


Imagineos que queremos cambiar el orden de las columnas, cambiemos el orden usando el método `reindex`. 

In [27]:
# creamos una lista con el nuevo orden que queremos
new_order = ['title', 'director', 'cast', 'country', 'language', 'rating', 'genre',
       'premiere', 'date_added', 'duration', 'imdb_score' ]

# aplicamos el método reindex
df = df.reindex(columns=new_order)
df.head(2)

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,English,PG-13,Documentary,"October 2, 2020","September 25, 2021",90 min,7.5
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada",English,R,Action,"April 24, 2020","September 1, 2021",82 min,6.7


# Limpiamos la columna de `duration` 

In [28]:
# lo primero que hacemos es ver los valores únicos
df['duration'].unique()

array(['90 min', '82 min', nan, '118 min', '73 min', '148 min', '140 min',
       '107 min', '98 min', '128 min', '121 min', '142 min', '143 min',
       '84 min', '132 min', '55 min', '112 min', '113 min', '81 min',
       '120 min', '119 min', '138 min', '127 min', '116 min', '32 min',
       '54 min', '8 min', '71 min', '9 min', '48 min', '117 min',
       '83 min', '42 min', '13 min', '124 min', '150 min', '50 min',
       '123 min', '130 min', '110 min', '47 min', '126 min', '104 min',
       '122 min', '28 min', '20 min', '80 min', '135 min', '17 min',
       '41 min', '133 min', '156 min', '149 min', '5 min', '75 min',
       '16 min', '111 min', '85 min', '145 min', '79 min', '25 min',
       '129 min', '137 min', '209 min', '40 min', '141 min', '52 min',
       '22 min', '38 min', '64 min', '10 min', '46 min', '59 min',
       '30 min', '49 min', '60 min', '26 min', '63 min', '74 min',
       '153 min', '35 min', '144 min', '77 min', '12 min', '78 min',
       '53 min', '131 m

In [29]:
# como todos los valores tienen la misma estructura, lo único que tenemos que hacer es quitarle el min. 

df["duration"] = df["duration"].str.split(" ", expand = True).get(0) #get nos permite quedarnos con la primera parte de la lista
df.head()

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,English,PG-13,Documentary,"October 2, 2020","September 25, 2021",90.0,7.5
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada",English,R,Action,"April 24, 2020","September 1, 2021",82.0,6.7
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Unknown,Polish,TV-MA,Thriller,"April 14, 2021","June 30, 2021",,5.7
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,English,TV-PG,Drama,"May 27, 2021","May 27, 2021",,6.7
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,Thai,TV-MA,Horror,"May 26, 2021","May 27, 2021",118.0,5.2


In [30]:
# Recordemos qel tipo de la columna duration
df.dtypes

title          object
director       object
cast           object
country        object
language       object
rating         object
genre          object
premiere       object
date_added     object
duration       object
imdb_score    float64
dtype: object

In [31]:
# la columna "duration" sigue siendo de tipo object, pero esto no tiene sentido, convirtamosla a numérica
df['duration'] = pd.to_numeric(df['duration'], errors='coerce') # errors = "coerce" nos permite convertir los valores que no se puedan convertir a numéricos en NaN

In [32]:
# confirmamos la conversion de los valores
df.dtypes

title          object
director       object
cast           object
country        object
language       object
rating         object
genre          object
premiere       object
date_added     object
duration      float64
imdb_score    float64
dtype: object

In [33]:
# hacemos un filtro sobre la columna duration del df para que nos muestre los nulos 
df[df['duration'].isnull()].head()

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Unknown,Polish,TV-MA,Thriller,"April 14, 2021","June 30, 2021",,5.7
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,English,TV-PG,Drama,"May 27, 2021","May 27, 2021",,6.7
5,Baggio: The Divine Ponytail,Letizia Lamartire,"Andrea Arcangeli, Valentina Bellè, Andrea Penn...",Italy,Italian,TV-MA,Biopic,"May 26, 2021","May 26, 2021",,6.2
11,The Woman in the Window,Joe Wright,"Amy Adams, Gary Oldman, Anthony Mackie, Fred H...",United States,English,R,Psychological thriller,"May 14, 2021","May 14, 2021",,5.7
12,Oxygen,Alexandre Aja,"Mélanie Laurent, Mathieu Amalric, Malik Zidi","France, United States",French,TV-14,Science fiction thriller,"May 12, 2021","May 12, 2021",,6.5


In [34]:
# calculamos la mediana de la columna duration y la guardamos en una variable 
mediana_pelis = df['duration'].median()


# reemplazamos los valores nulos de la columna duration por la mediana de la columna 
df['duration'] = df['duration'].fillna(mediana_pelis)

In [35]:
# comprobamois los nulos de la columna duration
df.isnull().sum()



title         0
director      0
cast          0
country       0
language      0
rating        0
genre         0
premiere      0
date_added    0
duration      0
imdb_score    0
dtype: int64

# Limpiamos `date_added` y `premiere`

In [36]:
df.head(2)

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,English,PG-13,Documentary,"October 2, 2020","September 25, 2021",90.0,7.5
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada",English,R,Action,"April 24, 2020","September 1, 2021",82.0,6.7


In [37]:
# convertimos al fecha a datetime
df['date_added'] = pd.to_datetime(df['date_added'])
df['premiere'] = pd.to_datetime(df['premiere'])


In [38]:
# confirmamos la transformación
df.dtypes


title                 object
director              object
cast                  object
country               object
language              object
rating                object
genre                 object
premiere      datetime64[ns]
date_added    datetime64[ns]
duration             float64
imdb_score           float64
dtype: object

In [39]:
df.head(2)

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,English,PG-13,Documentary,2020-10-02,2021-09-25,90.0,7.5
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7


## Clean `cast` y `director`


In [40]:
# generamos una copia de nuestro dataframe para poder trabajar con él sin miedo a perder información 
df_act_dire = df.copy()

df_act_dire.head()   

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,English,PG-13,Documentary,2020-10-02,2021-09-25,90.0,7.5
1,Extraction,Steven C. Miller,"Bruce Willis, Kellan Lutz, Gina Carano, D.B. S...","United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7
2,Prime Time,Jakub Piątek,"Bartosz Bielenia, Magdalena Popławska, Andrzej...",Unknown,Polish,TV-MA,Thriller,2021-04-14,2021-06-30,101.0,5.7
3,Blue Miracle,Julio Quintana,"Jimmy Gonzales, Dennis Quaid, Anthony Gonzalez...",United States,English,TV-PG,Drama,2021-05-27,2021-05-27,101.0,6.7
4,Ghost Lab,Paween Purijitpanya,"Thanapob Leeratanakachorn, Paris Intarakomalya...",Thailand,Thai,TV-MA,Horror,2021-05-26,2021-05-27,118.0,5.2


In [41]:
df["cast"].unique().tolist()[1]

'Bruce Willis, Kellan Lutz, Gina Carano, D.B. Sweeney, Joshua Mikel, Steve Coulter, Dan Bilzerian, Heather Johansen'

In [42]:
# lo primero que tenemos que hacer es separar cada ector y actriz

df_act_dire["cast"] = df_act_dire["cast"].str.split(",")
df_act_dire.head(2)

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,Kirsten Johnson,[Unknown],United States,English,PG-13,Documentary,2020-10-02,2021-09-25,90.0,7.5
1,Extraction,Steven C. Miller,"[Bruce Willis, Kellan Lutz, Gina Carano, D....","United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7


In [43]:
# separamos cada director en una fila nueva
df_actores = df_act_dire.explode("cast")
df_actores.head(2)

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,English,PG-13,Documentary,2020-10-02,2021-09-25,90.0,7.5
1,Extraction,Steven C. Miller,Bruce Willis,"United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7


Aplicamos la misma lógica para la columna de directores

In [44]:
df["director"].unique().tolist()[16]

'Robert Pulcini, Shari Springer Berman'

In [45]:
df_act_dire["director"] = df_act_dire["director"].str.split(",")
df_act_dire.head(2)

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,[Kirsten Johnson],[Unknown],United States,English,PG-13,Documentary,2020-10-02,2021-09-25,90.0,7.5
1,Extraction,[Steven C. Miller],"[Bruce Willis, Kellan Lutz, Gina Carano, D....","United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7


In [46]:
# separamos cada actor en una fila nueva
df_directores = df_act_dire.explode("director")
df_directores.head(3)

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,Kirsten Johnson,[Unknown],United States,English,PG-13,Documentary,2020-10-02,2021-09-25,90.0,7.5
1,Extraction,Steven C. Miller,"[Bruce Willis, Kellan Lutz, Gina Carano, D....","United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7
2,Prime Time,Jakub Piątek,"[Bartosz Bielenia, Magdalena Popławska, Andr...",Unknown,Polish,TV-MA,Thriller,2021-04-14,2021-06-30,101.0,5.7


## Limpiamos `genre` 


In [47]:
# vemos los valores únicos
df['genre'].unique().tolist()

['Documentary',
 'Action',
 'Thriller',
 'Drama',
 'Horror',
 'Biopic',
 'Zombie/Heist',
 'Comedy',
 'Crime drama',
 'Psychological thriller',
 'Science fiction thriller',
 'Animated musical comedy',
 'Psychological thriller drama',
 'Superhero-Comedy',
 'Romantic comedy',
 'Christian musical',
 'Hidden-camera prank comedy',
 'Comedy-drama',
 'Romantic teen drama',
 'Romantic drama',
 'Science fiction',
 'Action/Science fiction',
 'Animation / Short',
 'Superhero',
 'Aftershow / Interview',
 'Musical',
 'Animation',
 'Concert Film',
 'Christmas comedy',
 'Stop Motion',
 'Family/Christmas musical',
 'Anthology/Dark comedy',
 'Mystery',
 'Romantic comedy/Holiday',
 'Variety show',
 'Animation/Musical/Adventure',
 'Romantic thriller',
 'Comedy/Fantasy/Family',
 'Horror comedy',
 'Action comedy',
 'Family',
 'Comedy/Horror',
 'Drama/Horror',
 'Animation/Superhero',
 'Dance comedy',
 'Superhero/Action',
 'Romantic teenage drama',
 'Musical comedy',
 'Family/Comedy-drama',
 'Romance',
 'Anim

In [48]:
# para facilitar la limpieza lo primero qye hacemos es poner todos los géneros en minúscula
df['genre'] = df['genre'].str.lower()


In [49]:
# comprobamos que están en minúsculas
print(df['genre'].unique().tolist())

['documentary', 'action', 'thriller', 'drama', 'horror', 'biopic', 'zombie/heist', 'comedy', 'crime drama', 'psychological thriller', 'science fiction thriller', 'animated musical comedy', 'psychological thriller drama', 'superhero-comedy', 'romantic comedy', 'christian musical', 'hidden-camera prank comedy', 'comedy-drama', 'romantic teen drama', 'romantic drama', 'science fiction', 'action/science fiction', 'animation / short', 'superhero', 'aftershow / interview', 'musical', 'animation', 'concert film', 'christmas comedy', 'stop motion', 'family/christmas musical', 'anthology/dark comedy', 'mystery', 'romantic comedy/holiday', 'variety show', 'animation/musical/adventure', 'romantic thriller', 'comedy/fantasy/family', 'horror comedy', 'action comedy', 'family', 'comedy/horror', 'drama/horror', 'animation/superhero', 'dance comedy', 'superhero/action', 'romantic teenage drama', 'musical comedy', 'family/comedy-drama', 'romance', 'anime/fantasy', 'war drama', 'heist film/thriller', 'a

In [50]:
#recordamos que anteriormente vimos que la cantidad de valores únicos para la columna GENRE eran 106 
# a continuacion vemos un df con el % de cada uno de los géneros que tenemos en la columna genre
pd.DataFrame( df['genre'].value_counts()/len(df)*100).head(40)

Unnamed: 0,genre
documentary,25.730994
drama,14.230019
comedy,8.187135
romantic comedy,6.822612
thriller,6.432749
comedy-drama,2.729045
crime drama,1.949318
horror,1.754386
biopic,1.559454
action,1.364522


In [51]:
# usando la librería Counter, calculamos los géneros más comunes.
cuenta_generos = Counter(genres for genres in df['genre'])
print(f"Hay {len(cuenta_generos)} generos diferentes.") 

Hay 106 generos diferentes.


In [52]:
# creamos un diccionario con los 10 géneros más comunes y sus respectivas frecuencias 
comunes = dict(cuenta_generos.most_common(10))
comunes 

{'documentary': 132,
 'drama': 73,
 'comedy': 42,
 'romantic comedy': 35,
 'thriller': 33,
 'comedy-drama': 14,
 'crime drama': 10,
 'horror': 9,
 'biopic': 8,
 'action': 7}

In [53]:
fuzz.ratio('showcocina', "documentary") 

38

In [54]:
def generos_(col, generos_comunes):

    '''
    Esta función nos permite asignar un género a cada película o serie en función de los géneros más comunes. 
    args: 
        col: columna de la que queremos extraer el género
        generos_comunes: diccionario con los géneros más comunes
    return: 
        genero: género asignado a la película o serie

    '''
    
    maximo = 0
    for key in generos_comunes.keys():
        parecido = fuzz.ratio(col, key) # hacemos 
        #print(parecido)
        if parecido > maximo:
            maximo = parecido
            
            genero = key
            
    # nos aseguramos de que los géneros se parezcan en al menos un 50%        
    if maximo > 50:
        return genero
    else:
        return "Other"
            

In [55]:
df.apply(lambda x: generos_(x["genre"], comunes), axis = 1)

0      documentary
1           action
2         thriller
3            drama
4           horror
          ...     
508    documentary
509    documentary
510    documentary
511    documentary
512    documentary
Length: 513, dtype: object

In [56]:
# creamos una nueva columna con los géneros más comunes 
df["genre2"] = df.apply(lambda x: generos_(x["genre"], comunes), axis = 1)

In [57]:
# comprobamos que se haya creado correctamente la nueva columna 
df["genre2"].value_counts()

documentary        133
drama               78
Other               68
romantic comedy     62
comedy              52
thriller            51
comedy-drama        19
crime drama         17
action              14
horror              11
biopic               8
Name: genre2, dtype: int64

Como se puede observar esta, transformacion de la columna genre2 presenta valores muy similares como `comedy-drama` y `crime drama`. Esto podria ser depurado para mejorar el producto en futuras tareas dentro del proyecto. 


In [58]:
# generamos un filtro para ver los cambios 
df[df['genre2']== "Other"]

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score,genre2
7,Army of the Dead,Zack Snyder,"Dave Bautista, Ella Purnell, Omari Hardwick, G...",United States,English,R,zombie/heist,2021-05-21,2021-05-21,148.0,5.9,Other
12,Oxygen,Alexandre Aja,"Mélanie Laurent, Mathieu Amalric, Malik Zidi","France, United States",French,TV-14,science fiction thriller,2021-05-12,2021-05-12,101.0,6.5,Other
29,A Week Away,Roman White,"Kevin Quinn, Bailee Madison, Jahbril Cook, Kat...",United States,English,TV-PG,christian musical,2021-03-26,2021-03-26,98.0,5.7,Other
30,Bad Trip,Unknown,"Eric André, Lil Rel Howery, Tiffany Haddish, M...",United States,English,TV-MA,hidden-camera prank comedy,2021-03-26,2021-03-26,101.0,6.6,Other
47,Space Sweepers,Jo Sung-hee,"Song Joong-ki, Kim Tae-ri, Jin Sun-kyu, Yoo Ha...",South Korea,Korean,TV-MA,science fiction,2021-02-05,2021-02-05,138.0,6.6,Other
...,...,...,...,...,...,...,...,...,...,...,...,...
485,The Siege of Jadotville,Richie Smyth,"Jamie Dornan, Guillaume Canet, Emmanuelle Seig...","Ireland, South Africa",English,TV-MA,war,2016-10-07,2016-10-07,101.0,7.2,Other
488,ARQ,Tony Elliott,"Robbie Amell, Rachael Taylor, Shaun Benson, Gr...","Canada, United States",English,TV-MA,science fiction/thriller,2016-09-16,2016-09-16,101.0,6.4,Other
499,Special Correspondents,Ricky Gervais,"Eric Bana, Ricky Gervais, Vera Farmiga, Kelly ...","Canada, United Kingdom, United States",English,TV-MA,satire,2016-04-29,2016-04-29,101.0,5.8,Other
502,Pee-wee's Big Holiday,John Lee,"Paul Reubens, Joe Manganiello, Jessica Pohly, ...",United States,English,TV-PG,adventure,2016-03-18,2016-03-18,90.0,6.1,Other


# Top 10 directores y actores

In [59]:

df_actores

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,English,PG-13,Documentary,2020-10-02,2021-09-25,90.0,7.5
1,Extraction,Steven C. Miller,Bruce Willis,"United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7
1,Extraction,Steven C. Miller,Kellan Lutz,"United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7
1,Extraction,Steven C. Miller,Gina Carano,"United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7
1,Extraction,Steven C. Miller,D.B. Sweeney,"United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7
...,...,...,...,...,...,...,...,...,...,...,...
508,Tig,"Kristina Goolsby, Ashley York",Tig Notaro,United States,English,TV-14,Documentary,2015-07-17,2015-07-17,101.0,7.4
509,"What Happened, Miss Simone?",Liz Garbus,Unknown,United States,English,TV-14,Documentary,2015-06-26,2015-06-26,101.0,7.6
510,Hot Girls Wanted,"Jill Bauer, Ronna Gradus",Unknown,United States,English,TV-MA,Documentary,2015-05-29,2015-05-29,83.0,6.1
511,The Other One: The Long Strange Trip of Bob Weir,Mike Fleiss,Bob Weir,United States,English,TV-14,Documentary,2015-05-22,2015-05-22,84.0,7.3


In [60]:
top_actores = df_actores['cast'].value_counts() .reset_index()
top_actores.head(10)

Unnamed: 0,index,cast
0,Unknown,91
1,Adam Sandler,7
2,Maya Rudolph,6
3,Andrew Bachelor,5
4,Lakeith Stanfield,4
5,Jacki Weaver,4
6,Ken Marino,4
7,Robbie Amell,4
8,Zachary Quinto,4
9,Nick Swardson,4


In [61]:
lista_top_actores = top_actores['index'][1:11].tolist()
lista_top_actores

['Adam Sandler',
 ' Maya Rudolph',
 ' Andrew Bachelor',
 ' Lakeith Stanfield',
 ' Jacki Weaver',
 ' Ken Marino',
 ' Robbie Amell',
 ' Zachary Quinto',
 ' Nick Swardson',
 ' Rob Schneider']

In [62]:
for actor in lista_top_actores:
    print(actor.strip())

Adam Sandler
Maya Rudolph
Andrew Bachelor
Lakeith Stanfield
Jacki Weaver
Ken Marino
Robbie Amell
Zachary Quinto
Nick Swardson
Rob Schneider


In [63]:
lista_top_actores = [actor.strip() for actor in lista_top_actores]
lista_top_actores

['Adam Sandler',
 'Maya Rudolph',
 'Andrew Bachelor',
 'Lakeith Stanfield',
 'Jacki Weaver',
 'Ken Marino',
 'Robbie Amell',
 'Zachary Quinto',
 'Nick Swardson',
 'Rob Schneider']

In [64]:
df_directores.head(2)

Unnamed: 0,title,director,cast,country,language,rating,genre,premiere,date_added,duration,imdb_score
0,Dick Johnson Is Dead,Kirsten Johnson,[Unknown],United States,English,PG-13,Documentary,2020-10-02,2021-09-25,90.0,7.5
1,Extraction,Steven C. Miller,"[Bruce Willis, Kellan Lutz, Gina Carano, D....","United States, United Kingdom, Canada",English,R,Action,2020-04-24,2021-09-01,82.0,6.7


In [65]:
top_directores = df_directores['director'].value_counts() .reset_index()
top_directores.head(10)

Unnamed: 0,index,director
0,Unknown,22
1,McG,3
2,Blair Simmons,2
3,John Schultz,2
4,Noah Baumbach,2
5,Karan Johar,2
6,Dibakar Banerjee,2
7,Julien Leclercq,2
8,Amy Poehler,2
9,Kyle Newacheck,2


In [66]:
lista_top_directores = top_directores['index'][1:11].tolist()
lista_top_directores

['McG',
 'Blair Simmons',
 'John Schultz',
 'Noah Baumbach',
 ' Karan Johar',
 ' Dibakar Banerjee',
 'Julien Leclercq',
 'Amy Poehler',
 'Kyle Newacheck',
 'Peter Sullivan']

In [67]:

lista_top_directores = [director.strip() for director in lista_top_directores]
lista_top_directores

['McG',
 'Blair Simmons',
 'John Schultz',
 'Noah Baumbach',
 'Karan Johar',
 'Dibakar Banerjee',
 'Julien Leclercq',
 'Amy Poehler',
 'Kyle Newacheck',
 'Peter Sullivan']

# Guardado

In [68]:
with open('data/top_ten_actor.pkl', 'wb') as dire:
    pickle.dump(lista_top_directores, dire)    

In [69]:
with open('data/top_ten_director.pkl', 'wb') as acto:
    pickle.dump(lista_top_actores, acto) 