In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_20newsgroups_vectorized

In [2]:
# Fetch the pre-vectorized 20 Newsgroups data, asking for headers, footers
# and quoted text to be removed.
data = fetch_20newsgroups_vectorized(remove=('headers', 'footers', 'quotes'))
# Display the keys available on the returned object.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [3]:
# Feature matrix of the dataset (presumably documents x vocabulary terms).
X = data['data']
# Display its dimensions.
X.shape

(11314, 101631)

In [4]:
# Total weight of every column across all rows, as a flat 1-D array.
# np.asarray accepts both the np.matrix and the plain ndarray that a sparse
# sum may return, whereas the `.A` attribute only exists on np.matrix.
np.asarray(X.sum(axis=0)).ravel()

array([33.87521141, 18.99172753,  0.23992855, ...,  0.07106691,
        0.10314212,  0.10314212])

In [5]:
# Per-term table indexed by the term itself:
#   word_count -> summed weight of the term's column over all rows
#   id         -> the term's column position in the original matrix
# The sums are taken from data['data'] rather than X so this cell still works
# when it is re-run after X has been narrowed to a subset of the columns.
# np.asarray(...).ravel() flattens the sparse sum whether it comes back as an
# np.matrix or as a plain ndarray.
df_word_count = pd.DataFrame(
    {
        'word_count': np.asarray(data['data'].sum(axis=0)).ravel(),
        'id': range(len(data['feature_names'])),
    },
    index=data['feature_names'],
)

In [6]:
# Keep the 20000 terms with the largest summed weight, heaviest first.
good_words = (
    df_word_count['word_count']
    .sort_values(ascending=False)
    .head(20000)
    .index
)

In [7]:
# Column positions of the selected terms, in the same order as good_words.
good_columns = df_word_count.loc[good_words, 'id'].tolist()

In [8]:
# Keep only the selected columns. Slicing from data['data'] instead of the
# current X makes this cell idempotent: good_columns holds positions in the
# original matrix, so applying them a second time to an already reduced X
# would index out of range or pick the wrong columns.
X = data['data'][:, good_columns]

In [9]:
# Confirm the dimensions after the column selection.
X.shape

(11314, 20000)

In [24]:
from sklearn.decomposition import TruncatedSVD

# Truncated SVD keeping 300 components; random_state is fixed so repeated
# runs give the same factorization.
svd = TruncatedSVD(n_components=300, random_state=42)

# Fit on X and keep the transformed rows (named U_times_S here, i.e. the
# left factor already scaled by the singular values — see sklearn docs).
U_times_S = svd.fit_transform(X)
# Singular values of the fitted decomposition.
S = svd.singular_values_
# Component matrix: one row per component, one column per retained term.
Vt = svd.components_

# Display the three shapes side by side as a sanity check.
U_times_S.shape, S.shape, Vt.shape

((11314, 300), (300,), (300, 20000))

**Exercício** Qual o significado de:

(a) Uma linha de `U_times_S`?

(b) Uma coluna de `Vt`?

(c) Uma *coluna* de `U_times_S`?

(d) Uma *linha* de `Vt`?

(e) Os valores de `S`?

**Solução**

(a) Representação aproximada do conteúdo de um documento == "id" do documento.

(b) Representação de uma palavra.

(c) Um nível de detalhe do corpus inteiro.

(d) Um nível de detalhe do significado das palavras.

(e) As importâncias de cada nível de detalhe.

In [25]:
from sklearn.cluster import KMeans

In [26]:
# K-means configured for 20 clusters, with a fixed random_state so repeated
# runs start from the same initialization.
clusterer = KMeans(n_clusters=20, random_state=42)

In [27]:
# Vt has dimensions (n_components, n_words), so transpose it to get one row
# per word.
V = Vt.T

In [28]:
# One row per word, labelled by the word itself.
word_vectors = pd.DataFrame(V, index=good_words)
# Euclidean length of every row.
row_norms = word_vectors.apply(np.linalg.norm, axis=1)
# Scale each row to unit length (presumably so that clustering compares the
# direction of the word vectors rather than their magnitude).
df_V = word_vectors.div(row_norms, axis=0)

In [29]:
# Fit the k-means model on the normalized word vectors.
clusterer.fit(df_V)

In [30]:
# Pair every selected word with the cluster label assigned to it.
df = pd.DataFrame(dict(word=good_words, cluster=clusterer.labels_))

In [31]:
# Number of words assigned to each cluster, largest cluster first.
df['cluster'].value_counts()

cluster
9     3125
15    2418
1     2370
0     2360
11    1631
14     852
16     840
13     784
4      762
17     743
10     556
3      537
18     475
6      455
5      454
19     443
7      394
8      321
12     293
2      187
Name: count, dtype: int64

In [32]:
# Map every word vector into cluster-distance space (one column per cluster),
# then label the rows with the corresponding words.
distances_to_centers = clusterer.transform(df_V)
df_distances = pd.DataFrame(distances_to_centers, index=good_words)

In [33]:
# Display the word-by-cluster distance table (pandas truncates the rendering).
df_distances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
the,1.008155,1.006825,1.250163,1.065924,1.040797,1.065292,1.088704,1.209348,1.070223,1.004807,1.047840,1.014551,1.075028,1.043963,1.033948,1.008031,1.042193,1.040243,1.051274,1.141266
to,1.008312,1.006470,1.249389,1.066081,1.040816,1.065000,1.088660,1.210171,1.070000,1.004514,1.050856,1.014710,1.074742,1.042546,1.031912,1.007429,1.041597,1.040179,1.051814,1.144626
of,1.006367,1.005997,1.248985,1.065108,1.041802,1.062912,1.086641,1.207232,1.066559,1.003449,1.043532,1.015188,1.076189,1.040643,1.027642,1.006088,1.039050,1.041282,1.048571,1.143501
and,1.006740,1.006121,1.248986,1.065416,1.039666,1.064084,1.087804,1.208224,1.068487,1.003696,1.045405,1.013771,1.073741,1.043038,1.029510,1.006307,1.038053,1.040870,1.048596,1.143293
is,1.008350,1.007376,1.249189,1.067688,1.040959,1.064933,1.090382,1.203650,1.069890,1.004931,1.049042,1.015529,1.074549,1.046245,1.030032,1.008088,1.040249,1.041251,1.050874,1.142457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
garry,1.035885,1.008098,0.869162,1.075341,0.923570,1.070723,1.091431,1.260793,1.064886,1.003859,1.061024,1.033338,1.082335,1.047386,1.045406,1.016069,1.057653,1.038869,1.042247,1.131691
som,1.005508,0.990295,1.252287,1.078215,1.050069,1.078737,1.096027,1.223874,1.075918,1.021062,1.058552,1.009506,1.078648,1.038932,1.041312,1.017181,1.058747,0.993371,1.068216,1.135568
decorative,0.993995,1.004585,1.256081,1.056726,1.044903,1.075158,1.100641,1.176989,1.105497,1.018595,1.062287,1.025581,1.082934,1.055105,1.052629,1.004512,1.056530,1.032252,1.061746,1.147469
mush,1.011609,1.016886,1.211304,1.069904,1.038133,1.068263,1.083508,1.225999,1.081430,0.999677,1.042172,1.014609,1.076908,1.025475,1.024685,1.015091,1.041973,1.052600,1.067078,1.155987


**Exercício** O que cada cluster significa?

In [34]:
# For every cluster, list the 20 words with the smallest distance to it,
# closest first.
# Iterating over df_distances.columns avoids hard-coding the number of
# clusters, and .items() yields each (word, distance) pair directly instead of
# looking the distance up again through chained indexing for every row.
for cluster in df_distances.columns:
    print(f'Cluster {cluster}:')
    for word, distance in df_distances[cluster].sort_values().head(20).items():
        print(f'   {distance}: {word}')


Cluster 0:
   0.9066882122988956: armenians
   0.9094550968392294: armenian
   0.914578315995887: turks
   0.9155212023638342: asala
   0.9157529650932505: sdpa
   0.9158130234673143: 1914
   0.9181238256271175: kars
   0.9204672461472384: dro
   0.9208420509886648: erzurum
   0.9208590698651609: turkiye
   0.9208883791469651: ottoman
   0.9213958158815804: genocide
   0.9230459019938245: exterminated
   0.9231545549208802: kurds
   0.9233274651663894: argic
   0.9246497326848931: turkish
   0.9260568690722947: massacres
   0.9260837183886963: extermination
   0.9266474770679719: karabakh
   0.9278549338475836: armenia
Cluster 1:
   0.9460126392204623: down
   0.9500094612086675: boskie
   0.9507506619078111: shawn
   0.9532970520676475: frisco
   0.9537481107028822: powers
   0.955727839442447: coated
   0.9581355992182551: high
   0.9582065203929282: meehan
   0.9593709857800622: tear
   0.9596091134089673: filtering
   0.9605080323989483: cotton
   0.9605395474172187: 680
   0.96079

**Exercício** Qual o melhor número de clusters? Como identificar?