In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_20newsgroups_vectorized

In [17]:

data = fetch_20newsgroups_vectorized(remove=('headers', 'footers', 'quotes'))
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [18]:
X = data['data']
X.shape

(11314, 101631)

In [19]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=10, random_state=42)

U_times_S = svd.fit_transform(X)
S = svd.singular_values_
Vt = svd.components_

U_times_S.shape, S.shape, Vt.shape

((11314, 10), (10,), (10, 101631))

**Exercicio** Qual o significado de:

(a) Uma linha de `U_times_S`?

(b) Uma coluna de `Vt`

(c) Uma *coluna* de `U_times_S`?

(d) Uma *linha* de `Vt`?

(e) Os valores de `S`?

**Solução**

(a) Representação aproximada do conteudo de um documento == "id" do documento.

(b) Representação de uma palavra.

(c) Uma nivel de detalhe do corpus inteiro.

(d) Um nivel de detalhe do significado das palavras.

(e) As importancias de cada nível de detalhe. 

In [20]:
from sklearn.cluster import KMeans

In [32]:
clusterer = KMeans(n_clusters=20, random_state=42)

In [33]:
# Como Vt tem dimensões (n_components, n_palavras), precisamos transpor.
V = Vt.T

In [41]:
df_V = pd.DataFrame(V, index=data['feature_names'])

In [44]:
df_V_norm = df_V.div(df_V.sum(axis=0), axis=1)

In [45]:
clusterer.fit(df_V_norm)

In [46]:
df = pd.DataFrame({
    'word': data['feature_names'],
    'cluster': clusterer.labels_,
})

In [47]:
df['cluster'].value_counts()

cluster
1     9130
7     8149
0     6815
3     6680
18    5875
10    5826
11    5717
13    5577
17    5548
2     5356
9     5355
4     4966
16    4295
14    4293
12    3614
6     3536
19    3174
8     3115
5     2699
15    1911
Name: count, dtype: int64

In [48]:
df_distances = pd.DataFrame(
    clusterer.transform(df_V_norm),
    index=data['feature_names'],
)

In [49]:
df_distances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
00,0.000481,0.000353,0.000103,0.000774,0.000271,0.000493,0.000178,0.000204,0.000940,0.000346,0.000524,0.000207,0.000716,0.000609,0.000458,0.000706,0.000128,0.000346,0.000119,0.000256
000,0.000832,0.000178,0.000420,0.001123,0.000601,0.000171,0.000398,0.000203,0.001292,0.000067,0.000192,0.000546,0.001067,0.000961,0.000802,0.000359,0.000274,0.000694,0.000400,0.000207
0000,0.000157,0.000708,0.000323,0.000423,0.000191,0.000850,0.000393,0.000563,0.000576,0.000710,0.000894,0.000218,0.000355,0.000257,0.000142,0.001071,0.000470,0.000116,0.000384,0.000586
00000,0.000245,0.001003,0.000606,0.000205,0.000440,0.001131,0.000658,0.000854,0.000305,0.000996,0.001186,0.000498,0.000110,0.000151,0.000238,0.001358,0.000756,0.000355,0.000676,0.000861
000000,0.000286,0.001085,0.000694,0.000086,0.000538,0.001231,0.000764,0.000936,0.000199,0.001087,0.001271,0.000570,0.000106,0.000156,0.000341,0.001453,0.000843,0.000421,0.000744,0.000965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzzoh,0.000308,0.001110,0.000722,0.000025,0.000571,0.001259,0.000796,0.000963,0.000178,0.001115,0.001298,0.000596,0.000139,0.000186,0.000376,0.001480,0.000870,0.000448,0.000768,0.000994
zzzzzz,0.000520,0.000399,0.000179,0.000799,0.000255,0.000485,0.000090,0.000277,0.000951,0.000377,0.000568,0.000282,0.000721,0.000635,0.000448,0.000714,0.000210,0.000393,0.000259,0.000224
zzzzzzt,0.000526,0.001316,0.000925,0.000266,0.000751,0.001454,0.000976,0.001170,0.000101,0.001318,0.001506,0.000807,0.000275,0.000392,0.000544,0.001681,0.001076,0.000658,0.000983,0.001183
³ation,0.001027,0.000281,0.000614,0.001319,0.000794,0.000142,0.000582,0.000382,0.001489,0.000224,0.000096,0.000741,0.001263,0.001158,0.000997,0.000174,0.000466,0.000889,0.000586,0.000375


**Exercicio** O que cada cluster significa?

**Exercicio** Qual o melhor número de clusters? Como identificar?