# Bibliotecas

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [47]:
# Split the catalogue into two frames by content type: "TV Show" and "Movie".
# NOTE(review): df_netflix is only loaded further down (the pd.read_csv cell), so
# this cell must be run after that load — confirm the intended cell order.
netflix_shows = df_netflix.loc[df_netflix["type"].eq("TV Show")]
netflix_movies = df_netflix.loc[df_netflix["type"].eq("Movie")]

# TF-IDF

## Primeiros passos

In [22]:
# A few toy sentences used to see how the TF-IDF algorithm behaves
text = ["this is the first document",
        "this document is the second document",
        "this is the third document",
        "this is the fourth document"
       ]

In [40]:
# Instantiate the vectorizer and build the TF-IDF matrix for the toy sentences.
# With stop_words="english" the printed vocabulary keeps only 'document',
# 'fourth' and 'second'; note that 'first' and 'third' are absent as well
# (presumably they are on scikit-learn's English stop-word list — confirm).
vectorizer = TfidfVectorizer(stop_words="english")
matrix = vectorizer.fit_transform(text)
print("As n palavras extraídas", vectorizer.get_feature_names_out())

As n palavras extraídas ['document' 'fourth' 'second']


In [41]:
# Show which matrix column index each vocabulary word is mapped to
vectorizer.vocabulary_

{'document': 0, 'second': 2, 'fourth': 1}

In [43]:
# TF-IDF weight of every word in every sentence.
# Each ROW corresponds to one sentence of `text`; each COLUMN corresponds to a
# vocabulary word, in the order ('document', 'fourth', 'second').
# 'document', which occurs in every sentence, gets a lower weight than the rarer
# words (e.g. 0.46 vs 0.89 for 'fourth' in the last row).
print(matrix.toarray())

[[1.         0.         0.        ]
 [0.722056   0.         0.69183461]
 [1.         0.         0.        ]
 [0.46263733 0.88654763 0.        ]]


## Construindo uma filtragem baseada em conteúdo 

### Carregando e limpando a base de dados

In [77]:
# Load the Netflix catalogue from CSV (path is relative to the notebook's
# working directory) and preview the first five rows.
df_netflix = pd.read_csv("dataset/netflix_titles.csv")
df_netflix.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [46]:
# Column dtypes and non-null counts before any cleaning
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [48]:
# Handle missing values in the four text columns that are later concatenated
# into "combined_features", so the concatenation can never produce NaN.
# Each column gets its own placeholder (previously "cast" was filled three
# times and "description" / "listed_in" were never filled).
df_netflix = df_netflix.fillna(
    {
        "director": "Unknown",
        "cast": "Unknown",
        "description": "No description available!",
        "listed_in": "Uncategorized",
    }
)

In [51]:
# Re-check the non-null counts after filling missing values
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [50]:
# Remove repeated titles (the first occurrence is kept) and renumber the rows
# 0..n-1. The TF-IDF and cosine-similarity matrices are addressed by row
# position, so index labels must stay aligned with positions once rows are
# dropped; without the reset, labels after a removed row would point at the
# wrong matrix row.
df_netflix = df_netflix.drop_duplicates(subset="title").reset_index(drop=True)

In [52]:
# Merge "description", "cast", "director" and "listed_in" (in that order) into
# a single space-separated text column, then display the resulting frame.
df_netflix["combined_features"] = df_netflix["description"].str.cat(
    [df_netflix["cast"], df_netflix["director"], df_netflix["listed_in"]],
    sep=" ",
)
df_netflix

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,combined_features
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...","As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,Unknown,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,Unknown,Unknown,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,Unknown,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a...","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,Unknown,Unknown,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g...","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...,Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero...","Dragged from civilian life, a former superhero..."


In [58]:
# Text normalisation: lowercase each entry, then delete every character that is
# neither a word character nor whitespace (i.e. strip punctuation).
df_netflix["combined_features"] = df_netflix["combined_features"].map(
    lambda raw_text: re.sub(r"[^\w\s]", "", raw_text.lower())
)

In [60]:
# Preview the normalised text next to each title
df_netflix[["title", "combined_features"]].head()

Unnamed: 0,title,combined_features
0,Dick Johnson Is Dead,as her father nears the end of his life filmma...
1,Blood & Water,after crossing paths at a party a cape town te...
2,Ganglands,to protect his family from a powerful drug lor...
3,Jailbirds New Orleans,feuds flirtations and toilet talk go down amon...
4,Kota Factory,in a city of coaching centers known to train i...


### Gerando a matriz TF-IDF

In [62]:
tfidf = TfidfVectorizer(stop_words = "english")
# stop_words="english" -> common English words that carry little meaning are
# left out of the matrix

In [63]:
matrix = tfidf.fit_transform(df_netflix["combined_features"])
# Turns the text of the "combined_features" column into a sparse TF-IDF matrix.
# NOTE(review): this rebinds `matrix`, which earlier held the toy-example result.

In [68]:
matrix.shape
# Each row of the TF-IDF matrix corresponds to one movie/show;
# each column represents one unique word in the dataset;
# the value in each cell is the TF-IDF weight of that word for that title.

(8807, 53119)

### Calculando a Similaridade

In [71]:
cosine_sim = cosine_similarity(matrix, matrix)
# cosine_sim[i][j] is the similarity between title i and title j (by row position);
# values range from 0 (nothing in common) to 1 (identical feature text).

### Função de recomendação

In [82]:
def get_recommendations(title, cosine_sim=cosine_sim, df=df_netflix):
    """Return the titles of the five entries most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``df["title"]``.
    cosine_sim : array-like of shape (n, n)
        Pairwise similarity matrix; row/column ``i`` must correspond to the
        ``i``-th row (by position) of ``df``.
    df : pandas.DataFrame
        Frame with a "title" column, in the same row order that was used to
        build ``cosine_sim``.

    Returns
    -------
    pandas.Series
        Up to five titles taken from ``df``, most similar first, carrying
        their original index labels.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``df["title"]``.
    """
    # Work on the frame that was passed in (the previous version silently read
    # the global df_netflix and ignored ``df``).
    titles = df["title"]

    # Resolve the title to a row POSITION, not an index label: the similarity
    # matrix is positional, while labels can have gaps (e.g. after rows were
    # dropped). If the title occurs more than once, the first match is used.
    match_positions = (titles == title).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        raise KeyError(title)
    idx = int(match_positions[0])

    # Rank every row by similarity to the requested title, most similar first.
    # sorted() is stable, so ties keep their original row order.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried row explicitly instead of assuming it is ranked first
    # (another row with identical features could tie with it), then keep five.
    top_positions = [pos for pos, _ in ranked if pos != idx][:5]

    # Return the titles of the most similar rows
    return titles.iloc[top_positions]

### Testando o sistema de recomendação

In [78]:
# Lift pandas' row-display limit and list every title, to pick names for testing.
# NOTE(review): set_option changes the limit for the rest of the session, so later
# cells that display a frame will also print every row unless it is reset.
pd.set_option('display.max_rows', None)
df_netflix["title"]

0                                    Dick Johnson Is Dead
1                                           Blood & Water
2                                               Ganglands
3                                   Jailbirds New Orleans
4                                            Kota Factory
5                                           Midnight Mass
6                        My Little Pony: A New Generation
7                                                 Sankofa
8                           The Great British Baking Show
9                                            The Starling
10                    Vendetta: Truth, Lies and The Mafia
11                                       Bangkok Breaking
12                                           Je Suis Karl
13                       Confessions of an Invisible Girl
14                        Crime Stories: India Detectives
15                                      Dear White People
16      Europe's Most Dangerous Man: Otto Skorzeny in ...
17            

In [92]:
# Test 1: recommendations for a crime docuseries
get_recommendations("Elize Matsunaga: Once Upon a Crime")

2085                          I AM A KILLER: RELEASED
2548                                   Trial By Media
1196    Under Suspicion: Uncovering the Wesphael Case
1586                        Room 2806: The Accusation
7340                     Los tiempos de Pablo Escobar
Name: title, dtype: object

In [93]:
# Inspect the queried title (label 490) next to the recommended rows.
# NOTE(review): label 1586 ("Room 2806: The Accusation") from the result above
# is not included in this lookup.
df_netflix.loc[[490, 2085, 2548, 1196, 7340]]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
490,s491,TV Show,Elize Matsunaga: Once Upon a Crime,,,,"July 8, 2021",2021,TV-MA,1 Season,"Crime TV Shows, Docuseries, International TV S...",The crime shocked Brazil: Elize Matsunaga shot...
2085,s2086,TV Show,I AM A KILLER: RELEASED,,,"United Kingdom, United States","August 28, 2020",2020,TV-MA,1 Season,"British TV Shows, Crime TV Shows, Docuseries","In this crime docuseries spinoff, a convict is..."
2548,s2549,TV Show,Trial By Media,,,United States,"May 11, 2020",2020,TV-MA,1 Season,"Crime TV Shows, Docuseries","In this true crime docuseries, some of the mos..."
1196,s1197,TV Show,Under Suspicion: Uncovering the Wesphael Case,Alain Brunard,,Belgium,"March 17, 2021",2021,TV-14,1 Season,"Crime TV Shows, Docuseries, International TV S...",This docuseries follows the high-profile case ...
7340,s7341,TV Show,Los tiempos de Pablo Escobar,Alessandro Angulo,,Colombia,"August 1, 2018",2012,TV-14,1 Season,"Crime TV Shows, Docuseries, International TV S...","Featuring never-before-seen images, personal t..."


In [94]:
# Test 2: recommendations for a title that belongs to a franchise
get_recommendations("Kung Fu Panda")

7241                      Kung Fu Panda: Holiday
574                              Kung Fu Panda 2
5857        Kung Fu Panda: Secrets of the Scroll
3691    DreamWorks Kung Fu Panda Awesome Secrets
178                                The Interview
Name: title, dtype: object

In [95]:
# Inspect the queried title (label 573) together with its five recommendations
df_netflix.loc[[573, 574, 7241, 5857, 3691, 178]]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
573,s574,Movie,Kung Fu Panda,"John Stevenson, Mark Osborne","Jack Black, Dustin Hoffman, Angelina Jolie, Ia...","United States, China","July 1, 2021",2008,PG,94 min,"Children & Family Movies, Comedies",When a powerful villain comes after peace in h...
574,s575,Movie,Kung Fu Panda 2,Jennifer Yuh Nelson,"Jack Black, Angelina Jolie, Dustin Hoffman, Ga...","United States, China","July 1, 2021",2011,PG,93 min,"Children & Family Movies, Comedies","With his fists up and belly full, Po embarks o..."
7241,s7242,Movie,Kung Fu Panda: Holiday,Tim Johnson,"Jack Black, Angelina Jolie, Dustin Hoffman, Ja...",United States,"December 1, 2012",2010,TV-PG,26 min,"Children & Family Movies, Comedies","As preparations for the Winter Feast build, Po..."
5857,s5858,Movie,Kung Fu Panda: Secrets of the Scroll,Rodolphe Guenoden,"Jack Black, Dustin Hoffman, Seth Rogen, David ...",United States,"March 25, 2016",2016,TV-PG,23 min,"Children & Family Movies, Comedies",When a twist of fate brings five unlikely anim...
3691,s3692,TV Show,DreamWorks Kung Fu Panda Awesome Secrets,,"Jack Black, Dustin Hoffman, Dennis Haysbert, P...",United States,"July 1, 2019",2008,TV-PG,1 Season,"Kids' TV, TV Action & Adventure, TV Comedies","In this pair of adventures, Po tells the story..."
178,s179,Movie,The Interview,"Evan Goldberg, Seth Rogen","James Franco, Seth Rogen, Lizzy Caplan, Randal...",United States,"September 1, 2021",2014,R,112 min,"Action & Adventure, Comedies",Seth Rogen and James Franco star in this provo...


In [96]:
# Test 3: recommendations for an action title
get_recommendations("Mortal Kombat")

1353         Beverly Hills Ninja
7857    Resident Evil: Afterlife
1459                Mean Girls 2
6711               Event Horizon
8196              The Art of War
Name: title, dtype: object

In [98]:
# Inspect the five recommended rows (the queried title itself is not included here)
df_netflix.loc[[1353, 7857, 1459, 6711, 8196]]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
1353,s1354,Movie,Beverly Hills Ninja,Dennis Dugan,"Chris Farley, Nicollette Sheridan, Robin Shou,...",United States,"February 1, 2021",1997,PG-13,89 min,"Action & Adventure, Comedies","Raised by ninjas, a big-hearted but bumbling o..."
7857,s7858,Movie,Resident Evil: Afterlife,Paul W.S. Anderson,"Milla Jovovich, Ali Larter, Kim Coates, Shawn ...","Germany, France, United States, Canada, United...","January 1, 2020",2010,R,97 min,"Action & Adventure, Horror Movies, Sci-Fi & Fa...",The Undead Apocalypse continues as super-soldi...
1459,s1460,Movie,Mean Girls 2,Melanie Mayron,"Meaghan Martin, Maiara Walsh, Jennifer Stone, ...",United States,"January 1, 2021",2011,PG-13,98 min,Comedies,The father of a high school pariah offers to p...
6711,s6712,Movie,Event Horizon,Paul W.S. Anderson,"Laurence Fishburne, Sam Neill, Kathleen Quinla...","United Kingdom, United States","January 1, 2020",1997,R,96 min,"Horror Movies, Sci-Fi & Fantasy",After a signal is received from a long-missing...
8196,s8197,Movie,The Art of War,Christian Duguay,"Wesley Snipes, Anne Archer, Maury Chaykin, Mar...","United States, Canada","July 1, 2020",2000,R,117 min,Action & Adventure,"Framed for the murder of an ambassador, a form..."


# Combinando os datasets para a segunda versão do aplicativo

In [21]:
# Load one catalogue per streaming platform (replace with the real paths).
# NOTE(review): this re-reads the Netflix CSV and rebinds df_netflix, discarding
# the cleaned frame (and its "combined_features" column) built earlier.
df_netflix = pd.read_csv("dataset/netflix_titles.csv")
df_disney = pd.read_csv("dataset/disney_plus_titles.csv")
df_amazon = pd.read_csv("dataset/amazon_prime_titles.csv")
df_hulu = pd.read_csv("dataset/hulu_titles.csv")

In [22]:
# Tag every row with the platform it came from, so the origin survives the
# concatenation below
df_netflix["platform"] = "Netflix"
df_disney["platform"] = "Disney+"
df_amazon["platform"] = "Amazon Prime"
df_hulu["platform"] = "Hulu"

In [23]:
# Stack the four catalogues into one frame; ignore_index=True renumbers the
# rows 0..n-1 instead of keeping each file's own index
df = pd.concat([df_netflix, df_disney, df_amazon, df_hulu], ignore_index=True)

In [24]:
# Fill the blanks in the four text columns, each with its own placeholder
df = df.fillna(
    {
        "director": "Unknown",
        "cast": "Unknown",
        "description": "No description available!",
        "listed_in": "Uncategorized",
    }
)

In [25]:
# Remove rows whose title already appeared (the first occurrence is kept);
# index labels of the surviving rows are left as they were
df = df.drop_duplicates(subset="title")

In [26]:
# Merge "description", "cast", "director" and "listed_in" (in that order) into
# a single space-separated text column.
df["combined_features"] = df["description"].str.cat(
    [df["cast"], df["director"], df["listed_in"]],
    sep=" ",
)

In [28]:
# Keep only the title, the combined text and the platform columns
df = df[["title", "combined_features", "platform"]]

In [29]:
# Display the reduced frame (index labels have gaps because duplicate titles
# were dropped without resetting the index)
df

Unnamed: 0,title,combined_features,platform
0,Dick Johnson Is Dead,"As her father nears the end of his life, filmm...",Netflix
1,Blood & Water,"After crossing paths at a party, a Cape Town t...",Netflix
2,Ganglands,To protect his family from a powerful drug lor...,Netflix
3,Jailbirds New Orleans,"Feuds, flirtations and toilet talk go down amo...",Netflix
4,Kota Factory,In a city of coaching centers known to train I...,Netflix
...,...,...,...
22988,Samurai Harem,His skill with a sword is magnificent but his ...,Hulu
22989,Scream Queens,SCREAM QUEENS is a new genre-bending comedy-ho...,Hulu
22993,Star Trek: The Original Series,The 23rd century adventures of Captain James T...,Hulu
22996,The Twilight Zone,Rod Serling's seminal anthology series focused...,Hulu


In [30]:
# Write the reduced table to CSV in the working directory; index=False leaves
# the row index out of the file
df.to_csv("system_recomendation.csv", index = False)