# Trabalho Prático 1 de Ciência de Dados

## Download e leitura

#### Download dos dados

In [37]:
# Download the file from Google Drive and save it locally as filmes.csv.
# NOTE(review): --no-check-certificate turns off TLS certificate validation;
# drop the flag if the download works without it.
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1AeYgV89TmYvNC__RDXr8hS0P6WOsChWg' -O filmes.csv

--2025-02-11 09:18:43--  https://docs.google.com/uc?export=download&id=1AeYgV89TmYvNC__RDXr8hS0P6WOsChWg
Resolving docs.google.com (docs.google.com)... 2800:3f0:4004:80b::200e, 142.251.133.174
Connecting to docs.google.com (docs.google.com)|2800:3f0:4004:80b::200e|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1AeYgV89TmYvNC__RDXr8hS0P6WOsChWg&export=download [following]
--2025-02-11 09:18:44--  https://drive.usercontent.google.com/download?id=1AeYgV89TmYvNC__RDXr8hS0P6WOsChWg&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 2800:3f0:4004:810::2001, 142.251.129.65
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|2800:3f0:4004:810::2001|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11380332 (11M) [application/octet-stream]
Saving to: ‘filmes.csv’


2025-02-11 09:18:52 (5,08 MB/s) - ‘filmes.csv’ saved [11380332/1

#### Leitura dos dados

In [1]:
import pandas as pd

# Load the downloaded CSV into a DataFrame and preview its first five rows.
filmes_df = pd.read_csv(filepath_or_buffer="filmes.csv")
filmes_df.head(n=5)

Unnamed: 0,genres,averageRating,numVotes,sinopse,primaryTitle,startYear,runtimeMinutes,actors_names,directors_names
0,Family,7.1,387992,When two kids find and play a magical board ga...,Jumanji,1995,104,['Robin Williams' 'Jonathan Hyde' 'Jonathan Hy...,['Joe Johnston']
1,Romance,6.7,30265,John and Max resolve to save their beloved bai...,Grumpier Old Men,1995,101,['Walter Matthau' 'Jack Lemmon' 'Burgess Mered...,['Howard Deutch']
2,Romance,6.0,12585,"Based on Terry McMillan's novel, this film fol...",Waiting to Exhale,1995,124,['Gregory Hines' 'Dennis Haysbert' 'Mykelti Wi...,['Forest Whitaker']
3,"Romance,Family",6.1,42555,George Banks must deal not only with his daugh...,Father of the Bride Part II,1995,106,['Steve Martin' 'Martin Short' 'George Newbern...,['Charles Shyer']
4,"Crime,Action",8.3,738636,A group of high-end professional thieves start...,Heat,1995,170,['Al Pacino' 'Robert De Niro' 'Val Kilmer' 'Jo...,['Michael Mann']


## Pré-processamento dos dados

In [2]:
import nltk
# Fetch the NLTK "punkt_tab" resource (presumably the data the NLTK tokenizers
# need at runtime; confirm for the installed NLTK version).
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /home/miguel/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
from nltk import sent_tokenize, word_tokenize

#### Extrair sinopse do dataframe

In [4]:
# Select the synopsis column as a Series and preview its first entries.
sinopses = filmes_df["sinopse"]
sinopses.head(n=5)

0    When two kids find and play a magical board ga...
1    John and Max resolve to save their beloved bai...
2    Based on Terry McMillan's novel, this film fol...
3    George Banks must deal not only with his daugh...
4    A group of high-end professional thieves start...
Name: sinopse, dtype: object

In [5]:
# Size of the synopsis Series (one entry per row of filmes_df).
sinopses.shape

(30358,)

#### Convertendo os tokens

In [6]:
!pip install unidecode



In [7]:
import string
import unidecode

Convertendo todos os tokens para letras minúsculas

In [8]:
def tokens_low(tokens):
  """Return a new list holding the lower-cased form of every token."""
  lowered = []
  for token in tokens:
    lowered.append(token.lower())
  return lowered

Removendo os símbolos de pontuação

In [9]:
# Translation table that deletes every ASCII punctuation character
# (each code point in string.punctuation is mapped to None).
table = {ord(symbol): None for symbol in string.punctuation}

In [10]:
def tokens_pontuacao(tokens):
  """Return a new list with each token passed through str.translate(table).

  Uses the module-level translation table ``table``. A token made only of
  characters deleted by that table becomes an empty string; it is not
  dropped here.
  """
  stripped = []
  for token in tokens:
    stripped.append(token.translate(table))
  return stripped

Convertendo caracteres especiais

In [11]:
def tokens_char_especical(tokens):
  """Transliterate every token with unidecode, then lower-case the result.

  Returns a new list with one output token per input token.
  """
  converted = []
  for token in tokens:
    transliterated = unidecode.unidecode(token)
    converted.append(transliterated.lower())
  return converted

#### Filtrando apenas palavras

Extraindo apenas as palavras da lista de tokens

In [12]:
def tokens_apenas_words(tokens):
  """Keep only the tokens for which str.isalpha() is true.

  Empty strings and tokens containing digits, spaces or symbols are dropped;
  the relative order of the remaining tokens is preserved.
  """
  words = []
  for token in tokens:
    if token.isalpha():
      words.append(token)
  return words

Removendo stop words

In [13]:
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus so that stopwords.words(...) can be read.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/miguel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# English stop words as a set, so membership checks are constant-time.
stop_words = {*stopwords.words('english')}

In [15]:
def tokens_stop_words(tokens):
  """Return the tokens that are not members of the module-level ``stop_words``.

  The order of the surviving tokens is preserved.
  """
  kept = []
  for token in tokens:
    if token not in stop_words:
      kept.append(token)
  return kept

#### Stemming

In [16]:
from nltk.stem.porter import PorterStemmer

In [17]:
# Single Porter stemmer instance, created once at module level.
porter = PorterStemmer()

In [18]:
def tokens_stemming(tokens):
  """Return a new list with each token stemmed by the module-level ``porter``."""
  stems = []
  for token in tokens:
    stems.append(porter.stem(token))
  return stems

#### Tokenizer Personalizado

In [19]:
def tokenizer_personalizado(texto):
  """Tokenize ``texto`` and run the tokens through the cleaning pipeline.

  The text is split with NLTK's word_tokenize, then each step below is
  applied in order to the whole token list. The final list of tokens is
  returned.
  """
  # Order matters: each step receives the output of the previous one.
  etapas = (
    tokens_low,             # lower-case
    tokens_pontuacao,       # strip punctuation characters
    tokens_char_especical,  # transliterate special characters
    tokens_apenas_words,    # keep alphabetic tokens only
    tokens_stop_words,      # drop stop words
    tokens_stemming,        # stem
  )

  tokens = word_tokenize(texto)
  for etapa in etapas:
    tokens = etapa(tokens)

  return tokens

## Construção da matriz de TF-IDF

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
# Cada linha do array é uma sinopse
D = list(filmes_df["sinopse"])

### Usando Tokenizer Personalizado

In [22]:
# The custom tokenizer replaces the default regex-based tokenisation, so the
# unused token_pattern is disabled explicitly; this avoids scikit-learn's
# "token_pattern will not be used" warning without changing the result.
vectorizer = TfidfVectorizer(tokenizer=tokenizer_personalizado, token_pattern=None)
# Learn the vocabulary and IDF weights and build the sparse TF-IDF matrix,
# one row per entry of D.
X = vectorizer.fit_transform(D)



In [23]:
# NOTE(review): todense() materialises the entire TF-IDF matrix as a dense
# array just to display it; on a large vocabulary this can use a lot of memory.
X.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
# Vocabulary terms learned by the vectorizer, in the column order of X.
vectorizer.get_feature_names_out()

array(['aa', 'aadhavan', 'aadhi', ..., 'zylberberg', 'zylberstein',
       'zyto'], dtype=object)

In [25]:
# Dense TF-IDF table: one row per document, one column per vocabulary term.
df = pd.DataFrame(
    data=X.todense(),
    columns=vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,aa,aadhavan,aadhi,aakansha,aakash,aaliya,aalst,aaltonen,aamir,aang,aarno,aaron,aarti,aarusaami,aaryan,aback,aballay,abandon,abarnabel,abatantuono,abb,abba,abberlin,abbess,abbey,abbi,abbot,abbott,abbrevi,abc,abdel,abdi,abdic,abdomen,abduct,abducte,abductor,abductorkil,abdul,abe,...,zootechnician,zoozi,zoppo,zor,zoran,zordon,zorel,zorg,zorin,zorina,zork,zorn,zoroark,zorro,zosia,zou,zouri,zoya,zsolt,zu,zubeida,zubeidaa,zuckerberg,zukhra,zuko,zulmiro,zulu,zum,zurich,zurikela,zuzu,zvezda,zweig,zydruna,zyga,zygon,zyl,zylberberg,zylberstein,zyto
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043675,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Usando o Padrão da Biblioteca

In [26]:
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(D)

In [27]:
# X.todense()

In [28]:
# vectorizer.get_feature_names_out()

In [29]:
# df = pd.DataFrame(X.todense(), columns = vectorizer.get_feature_names_out())
# df

## Redução de dimensionalidade

In [None]:
from sklearn.decomposition import TruncatedSVD

In [61]:
# TruncatedSVD uses a randomized solver by default; fixing random_state makes
# the reduced matrix identical across Restart & Run All.
svd = TruncatedSVD(n_components=1000, random_state=42)
# Project the TF-IDF matrix onto the 1000 retained components.
X_REDUCED = svd.fit_transform(X)

In [45]:
# Genre label column of every row, taken as-is from the dataset.
y = filmes_df["genres"]
y

0                Family
1               Romance
2               Romance
3        Romance,Family
4          Crime,Action
              ...      
30353         Biography
30354         Biography
30355      Crime,Horror
30356           Romance
30357            Action
Name: genres, Length: 30358, dtype: object

In [46]:
# Distinct values of y in order of first appearance.
# dict.fromkeys de-duplicates in a single pass while keeping insertion order,
# replacing the list-membership loop that rescanned the list for every row.
# NOTE(review): values are the raw strings from the genres column, so a
# comma-joined combination such as "Romance,Family" counts as its own type.
genres_types = list(dict.fromkeys(y))

In [47]:
# Wrap the SVD output in a DataFrame (columns are the component indices).
df_reduced = pd.DataFrame(data=X_REDUCED)
df_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0.136516,-0.025075,0.009145,-0.012148,0.019862,0.019905,-0.111566,-0.004848,-0.068118,-0.038359,0.098955,0.048759,0.073587,-0.061886,0.012190,-0.064609,0.066178,-0.029975,0.028165,0.021257,0.043241,0.008901,0.032041,0.008734,0.015121,0.000494,-0.057191,-0.012077,0.015632,0.031535,-0.011964,-0.008232,0.022766,0.008493,0.012462,0.004413,-0.017054,-0.035366,0.019444,-0.037123,...,0.016044,-0.019575,0.012538,0.021564,0.005525,-0.024682,0.006298,0.024673,0.002728,0.016071,-0.005771,0.029245,0.022977,-0.082333,-0.024621,-0.007539,-0.016589,-0.008046,-0.024337,-0.020270,-0.049304,0.009275,0.024453,0.019119,-0.012254,-0.006100,0.024970,-0.010298,-0.000430,-0.015465,-0.066068,0.059108,-0.025586,0.029435,0.023002,0.019333,-0.061731,-0.071202,0.074322,0.018060
1,0.064629,-0.026652,-0.018950,-0.012951,-0.028979,-0.042308,0.064509,0.023370,-0.048313,-0.019525,-0.008945,0.035210,-0.004412,0.004894,0.015475,-0.004881,-0.004797,0.009061,-0.021242,-0.001821,0.019131,-0.010068,0.001624,-0.010647,0.023697,-0.007276,-0.026101,0.024776,-0.008907,-0.015323,-0.005603,-0.011453,-0.001848,-0.001306,-0.002086,0.002536,-0.001361,0.012821,-0.020812,-0.020498,...,-0.012710,0.031054,-0.013179,-0.002644,-0.007833,0.008876,-0.000974,-0.012386,0.014869,0.022507,-0.031487,0.000494,0.025315,-0.002813,0.041214,-0.032981,0.044618,0.000879,-0.021693,0.000646,-0.032159,-0.001903,-0.021348,-0.008376,-0.011258,0.025443,-0.025089,0.014838,-0.024785,0.013966,-0.001695,-0.013042,-0.016216,0.044695,0.000887,-0.014827,0.052334,-0.014949,-0.045498,-0.021335
2,0.075667,0.006524,-0.053301,0.012716,0.042342,0.020595,-0.014383,0.037617,0.075698,-0.048810,-0.033375,0.046279,0.027344,-0.032586,-0.005559,0.075075,0.041843,-0.019517,0.010114,0.012139,0.016058,-0.037598,-0.010685,0.028574,0.013355,-0.019230,0.002198,-0.015054,-0.032959,-0.034756,-0.012617,0.035574,-0.004377,-0.078859,0.072273,0.027530,0.069461,0.017579,0.102544,-0.062303,...,0.055241,0.122074,-0.190943,0.092834,-0.077128,0.039208,0.082657,-0.092883,0.037230,-0.012736,-0.025966,0.094178,-0.025190,-0.027361,0.095491,0.015492,-0.064970,-0.071319,0.027870,-0.083869,0.012082,-0.037284,-0.057759,-0.027165,0.063889,0.050774,-0.033627,-0.029633,-0.057173,-0.079161,-0.010857,-0.011950,0.014355,-0.036770,0.027012,-0.018304,0.054549,-0.025695,0.024704,0.020798
3,0.089127,-0.036122,0.016222,0.008765,0.003601,-0.090113,-0.031313,0.001226,-0.036233,0.068233,-0.019967,-0.023705,-0.031604,-0.004427,0.007346,-0.017877,-0.023862,-0.040175,-0.053181,0.073182,-0.009643,-0.084058,-0.001391,-0.048651,0.023768,-0.074135,-0.012268,-0.053803,-0.163037,-0.050910,-0.113678,-0.006272,0.084574,0.035617,-0.058700,-0.052889,0.047448,-0.165194,0.032990,-0.001273,...,-0.002027,-0.057775,-0.096597,-0.002291,-0.008672,-0.011951,-0.002483,-0.011551,0.006823,-0.096913,-0.108310,-0.026958,-0.036248,0.007023,-0.048929,0.002826,0.000277,-0.051779,0.024913,0.008933,0.002288,0.024794,0.028995,-0.023762,-0.014749,-0.103868,0.033335,-0.027550,0.041960,0.018053,-0.032463,-0.078741,0.019445,-0.035155,-0.044283,-0.013340,0.019571,0.064038,0.060498,0.033238
4,0.048744,-0.032325,-0.015675,-0.007369,0.000447,0.011759,-0.001468,0.006373,-0.016757,-0.029458,-0.011505,-0.013166,-0.020970,0.008024,0.047492,0.022263,-0.019979,0.017093,0.028145,0.010924,0.003794,-0.014587,0.005518,-0.001308,0.013292,-0.013622,0.001279,-0.042280,0.072534,0.046520,-0.044927,0.009034,0.052825,-0.009499,0.014518,-0.029463,0.022877,-0.018070,-0.000014,0.039887,...,0.010364,0.002448,0.015952,-0.009006,-0.015320,-0.000250,0.007075,-0.021691,0.014537,-0.009117,0.011424,-0.009118,-0.012286,0.000484,-0.010353,0.007390,0.009682,0.020651,0.001250,-0.004427,-0.033300,-0.029426,-0.009642,-0.018134,0.016246,0.013741,0.084185,0.018657,-0.021536,0.012212,0.024174,-0.014883,0.047891,-0.035908,-0.000471,-0.021909,-0.011028,0.026980,-0.000394,-0.012760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30353,0.057663,0.013129,-0.020833,-0.001607,0.004480,-0.011803,0.020142,0.019665,0.009583,-0.022362,-0.010536,-0.005650,-0.006125,0.001473,-0.002102,0.015383,0.008447,-0.011098,-0.000839,0.008856,0.013702,-0.001929,0.004970,0.003112,-0.011265,0.004643,-0.020317,0.011792,0.003491,0.005474,0.007828,0.014418,0.014131,-0.025188,0.029832,-0.028379,0.004643,0.000055,0.039855,-0.020242,...,-0.007601,0.009516,0.015287,0.003646,0.002997,-0.001540,0.001460,0.005337,-0.004027,-0.002905,-0.018620,0.007100,0.002577,-0.011842,0.001396,0.009219,0.009959,0.000924,-0.011722,-0.001029,-0.018617,-0.008539,-0.002603,0.002300,-0.020595,-0.002952,0.017039,0.008938,-0.003108,-0.002384,-0.015397,0.002615,0.020446,0.005199,-0.023553,0.025607,-0.001610,-0.003889,0.010578,-0.004013
30354,0.026405,0.004534,-0.024934,-0.002837,0.014869,-0.015218,0.019689,0.001312,0.009330,-0.010267,-0.016996,-0.002151,0.010911,-0.005686,-0.010400,0.014477,0.003418,-0.003020,0.010266,0.019844,0.003026,-0.000139,-0.000684,-0.000951,-0.000595,-0.010349,-0.002450,0.005222,0.000876,0.001311,-0.006688,0.028746,0.005861,-0.013045,0.004849,-0.030208,0.004945,0.021630,0.032414,-0.019238,...,-0.031295,0.007311,0.005365,0.014342,0.005618,-0.003218,0.013663,0.013562,-0.004078,0.003684,0.008836,-0.020366,-0.004701,-0.004534,0.006570,0.014266,-0.015670,0.006041,0.008576,0.002884,0.002657,0.041568,0.017536,0.010051,0.008243,-0.025991,0.042816,0.048357,-0.008904,0.019113,-0.037699,0.009912,-0.008248,-0.035545,-0.030781,-0.039217,-0.002311,0.053385,0.065795,-0.030086
30355,0.037829,-0.031381,0.003972,-0.017872,0.036632,0.002411,0.003863,-0.025432,-0.016808,0.014805,-0.003867,0.000124,-0.010415,-0.003006,0.027257,-0.010980,-0.003312,-0.005554,-0.047477,0.010203,0.005770,-0.007696,0.028607,-0.021116,-0.012076,0.004945,-0.024608,0.003053,-0.023037,-0.026741,0.025278,-0.001592,-0.023042,-0.005374,0.039301,-0.006162,0.002696,-0.011815,0.011948,-0.000450,...,0.016288,-0.005978,0.011094,0.015648,-0.045122,-0.048558,-0.041756,-0.035018,0.042491,0.002235,0.007968,0.067415,-0.016857,0.060599,-0.087056,-0.026857,0.014162,0.006174,-0.028878,0.014233,-0.042417,-0.008160,0.017868,-0.003570,-0.048655,0.011265,0.006699,0.006104,-0.027528,-0.001272,-0.057414,0.013150,-0.027908,-0.051704,-0.004589,-0.002235,-0.040323,-0.036552,0.022460,0.041556
30356,0.155643,0.070355,0.108168,-0.080106,-0.054712,-0.009929,-0.080249,-0.066185,-0.008704,0.008319,-0.015699,0.035077,0.028645,-0.028835,-0.036274,0.022125,-0.002354,0.007160,-0.101165,0.044628,-0.037495,-0.010444,-0.051239,-0.035396,-0.047515,0.078990,0.015851,0.083472,0.053709,0.012689,-0.101401,-0.026007,0.054682,-0.060926,0.028837,0.004075,-0.049512,-0.001987,-0.094516,0.017204,...,-0.008333,0.040039,-0.002372,0.045576,-0.061276,-0.029679,0.003192,0.023543,-0.077398,0.018254,0.030235,-0.008607,0.050913,-0.009738,-0.022281,-0.022145,0.001485,0.054499,0.043702,-0.005432,0.009243,-0.014230,-0.013909,-0.048694,0.043606,-0.021457,-0.015016,0.012927,-0.048902,-0.016042,-0.004639,0.033289,0.027973,-0.003952,-0.030039,0.010639,0.012589,0.027335,0.001579,0.011857


## Agrupamento

#### K-means

In [62]:
from sklearn.cluster import KMeans

# K-means starts from random centroids; fixing random_state keeps the cluster
# ids stable between runs, so a filter such as `prediction == 1` selects the
# same group after a kernel restart.
kmeans = KMeans(n_clusters=73, random_state=42)
# fit_predict fits the model and returns the cluster index of each row, which
# is equivalent to fit(X) followed by predict(X) on the same data.
y_kmeans = kmeans.fit_predict(X_REDUCED)

In [63]:
# Build a NEW frame with the cluster id attached. Plain assignment
# (`kmeans_prediction_df = filmes_df`) only aliases the original DataFrame,
# so adding the column would silently modify filmes_df as well.
kmeans_prediction_df = filmes_df.assign(prediction=y_kmeans)
kmeans_prediction_df

Unnamed: 0,genres,averageRating,numVotes,sinopse,primaryTitle,startYear,runtimeMinutes,actors_names,directors_names,prediction
0,Family,7.1,387992,When two kids find and play a magical board ga...,Jumanji,1995,104,['Robin Williams' 'Jonathan Hyde' 'Jonathan Hy...,['Joe Johnston'],71
1,Romance,6.7,30265,John and Max resolve to save their beloved bai...,Grumpier Old Men,1995,101,['Walter Matthau' 'Jack Lemmon' 'Burgess Mered...,['Howard Deutch'],46
2,Romance,6.0,12585,"Based on Terry McMillan's novel, this film fol...",Waiting to Exhale,1995,124,['Gregory Hines' 'Dennis Haysbert' 'Mykelti Wi...,['Forest Whitaker'],53
3,"Romance,Family",6.1,42555,George Banks must deal not only with his daugh...,Father of the Bride Part II,1995,106,['Steve Martin' 'Martin Short' 'George Newbern...,['Charles Shyer'],46
4,"Crime,Action",8.3,738636,A group of high-end professional thieves start...,Heat,1995,170,['Al Pacino' 'Robert De Niro' 'Val Kilmer' 'Jo...,['Michael Mann'],28
...,...,...,...,...,...,...,...,...,...,...
30353,Biography,7.2,1623,"""I'll look at you, but not at the camera. It c...",Jane B. for Agnes V.,1988,99,['Jean-Pierre Léaud' 'Philippe Léotard' 'Phili...,['Agnès Varda'],46
30354,Biography,6.3,55,A musical biography of the great Russian class...,Rimskiy-Korsakov,1953,88,['Grigori Belov' 'Nikolay Cherkasov' 'Aleksand...,['Gennadiy Kazanskiy' 'Grigoriy Roshal'],46
30355,"Crime,Horror",5.3,391,An evil genius uses poison gas to avenge himse...,The Carpet of Horror,1962,85,['Joachim Fuchsberger' 'Antonio Casas' 'Fernan...,['Harald Reinl'],46
30356,Romance,7.5,2127,A young man narrates his past on how his fathe...,Santhosh Subramaniyam,2008,177,['Jayam Ravi' 'Prakash Raj' 'Sayaji Shinde' 'S...,['Mohan Raja'],71


In [64]:
# Rows assigned to cluster 1.
in_cluster_1 = kmeans_prediction_df["prediction"] == 1
group1 = kmeans_prediction_df[in_cluster_1]
group1

Unnamed: 0,genres,averageRating,numVotes,sinopse,primaryTitle,startYear,runtimeMinutes,actors_names,directors_names,prediction
263,Romance,6.9,2106,"Upon returning home after a ten year absence, ...",Colonel Chabert,1994,110,['Gérard Depardieu' 'Fabrice Luchini' 'André D...,['Yves Angelo'],1
292,Romance,6.4,8355,Carly moves with her military husband and thei...,Blue Sky,1994,101,"['Tommy Lee Jones' 'Powers Boothe' ""Chris O'Do...",['Tony Richardson'],1
408,Action,7.9,91913,An aging group of outlaws in 1913 Texas look f...,The Wild Bunch,1969,135,['William Holden' 'Ernest Borgnine' 'Robert Ry...,['Sam Peckinpah'],1
434,Action,6.6,60879,"A U.S. Army officer, despondent about a deadly...",Courage Under Fire,1996,116,['Denzel Washington' 'Lou Diamond Phillips' 'M...,['Edward Zwick'],1
456,Action,5.2,9884,To keep the loyalty of a village during the Vi...,Operation Dumbo Drop,1995,107,['Danny Glover' 'Ray Liotta' 'Denis Leary' 'Do...,['Simon Wincer'],1
...,...,...,...,...,...,...,...,...,...,...
29746,"Musical,Family",2.1,414,"Running away from his evil guardian Mombi, Tip...",The Wonderful Land of Oz,1969,72,['Chan Mahon' 'Allen Joseph' 'George Wadsworth...,['Barry Mahon'],1
29758,Horror,4.7,264,When a mysterious virus breaks out in an isola...,Zombiepura,2018,85,['Alaric' 'Benjamin Heng' 'Rayve Tay' 'Edward ...,['Jacen Tan'],1
29797,"Crime,Biography,Action",5.6,200,An Army ex-con electronics engineer sells his ...,Wiretapper,1955,80,['Bill Williams' 'Douglas Kennedy' 'Richard Be...,['Dick Ross'],1
29975,Action,7.5,1697,"Tanya Degurechaff continues her misadventures,...",Saga of Tanya the Evil - The Movie,2019,115,['Susumu Akagi' 'Takatsugu Awazu' 'Chô' 'Hayat...,['Yutaka Uemura'],1
