In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import fetch_20newsgroups_vectorized

In [2]:
# Load the pre-vectorized 20 Newsgroups corpus, asking the loader to strip
# headers, footers and quoted replies from every post.
data = fetch_20newsgroups_vectorized(
    remove=('headers', 'footers', 'quotes'),
)
# Show which fields the returned container exposes.
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [16]:
# Document-term matrix: one row per document, one column per vocabulary term.
X = data.data
X.shape

(11314, 101631)

In [30]:
# Corpus-wide total of every term (column sums of X), stored as a one-column
# frame (column 0) indexed by the term itself.
# NOTE(review): if the loader normalizes each document vector (believed to be
# its default - confirm), these are summed weights, not raw occurrence counts.
term_totals = np.asarray(X.sum(axis=0)).ravel()
df_word_count = pd.DataFrame(term_totals, index=data['feature_names'])

In [34]:
# Labels of the 20000 terms with the largest corpus-wide totals.
# NOTE(review): no later cell visible here reads `good_words` - confirm whether
# it was meant to restrict the vocabulary before the SVD.
good_words = df_word_count[0].sort_values(ascending=False).index[:20000]

In [4]:
from sklearn.decomposition import TruncatedSVD

# Rank-10 truncated SVD of the document-term matrix; the seed fixes the
# randomized solver so the factors are repeatable.
svd = TruncatedSVD(n_components=10, random_state=42)

# fit_transform returns the documents expressed in the 10-component space.
U_times_S = svd.fit_transform(X)
# Singular values and the component matrix (one row per component, one column
# per vocabulary term).
S, Vt = svd.singular_values_, svd.components_

# Shapes of the three factors, in the same order as above.
tuple(factor.shape for factor in (U_times_S, S, Vt))

((11314, 10), (10,), (10, 101631))

**Exercício** Qual o significado de:

(a) Uma linha de `U_times_S`?

(b) Uma coluna de `Vt`?

(c) Uma *coluna* de `U_times_S`?

(d) Uma *linha* de `Vt`?

(e) Os valores de `S`?

**Solução**

(a) Representação aproximada do conteúdo de um documento == "id" do documento.

(b) Representação de uma palavra.

(c) Um nível de detalhe do corpus inteiro.

(d) Um nível de detalhe do significado das palavras.

(e) As importâncias de cada nível de detalhe.

In [5]:
from sklearn.cluster import KMeans

In [6]:
# K-means configured for 20 clusters; the fixed seed makes the centroid
# initialization repeatable between runs.
clusterer = KMeans(
    n_clusters=20,
    random_state=42,
)

In [7]:
# Vt has shape (n_components, n_words); transpose it so that each row holds
# the component coordinates of a single word.
V = Vt.transpose()

In [8]:
# Scale every word vector (row of V) to unit Euclidean length, then label the
# rows with their vocabulary term.
# The norms are computed in one vectorized call rather than a per-row
# DataFrame.apply, which invokes Python once for each of the ~100k words.
# A row whose norm is exactly zero would become NaN after the division.
row_norms = np.linalg.norm(V, axis=1, keepdims=True)
df_V = pd.DataFrame(V / row_norms, index=data['feature_names'])

In [9]:
# Fit K-means on the unit-length word vectors (one row of df_V per term).
clusterer.fit(df_V)

In [10]:
# One row per vocabulary term, paired with the cluster label K-means gave it.
df = pd.DataFrame({'word': data['feature_names']}).assign(
    cluster=clusterer.labels_,
)

In [11]:
# How many terms landed in each cluster, largest cluster first.
cluster_sizes = df['cluster'].value_counts()
cluster_sizes

cluster
4     8840
8     7020
1     6983
5     6293
14    6106
7     5658
19    5152
2     5060
11    4899
0     4884
13    4758
18    4728
16    4468
10    4242
17    4146
12    4001
3     3831
15    3811
9     3634
6     3117
Name: count, dtype: int64

In [12]:
# KMeans.transform maps each word vector to its distance from every centroid,
# giving one column per cluster; rows are labeled with the term.
distances_to_centroids = clusterer.transform(df_V)
df_distances = pd.DataFrame(distances_to_centroids, index=data['feature_names'])

In [13]:
# Display the word-to-centroid distance table (rows: terms, columns: clusters).
df_distances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
00,0.560492,1.025416,1.168969,1.377959,1.304439,1.285858,1.136710,0.454621,1.291081,1.182945,1.198947,0.960432,1.359697,0.962260,0.815522,1.146212,1.185086,1.440446,0.952837,1.255574
000,0.948902,0.896819,0.685934,1.166192,1.329607,1.154961,1.340965,0.917756,1.364949,1.018251,1.231149,1.043145,1.178419,1.195667,0.628417,1.294263,1.232069,1.229038,1.187283,1.252093
0000,0.939649,1.546745,1.204887,1.403809,1.524793,1.404985,1.061367,1.175291,1.271916,0.851759,1.108676,1.199062,1.085327,1.119034,1.427594,0.773015,1.137855,1.541121,1.543623,1.462054
00000,0.835602,1.637309,1.275744,1.328179,1.394276,1.389731,0.828891,1.121569,1.582750,1.022134,1.243593,1.129452,1.041157,0.958150,1.570395,0.690254,1.170369,1.343305,1.593140,1.720831
000000,0.657184,1.026079,1.117082,1.543906,1.006299,1.009237,1.309876,1.103442,1.416260,1.143527,0.937324,0.516046,1.228445,0.779833,1.189628,0.820741,1.186531,1.222220,1.154382,1.408044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzzoh,1.104676,1.053346,1.193036,1.496200,0.197937,1.331321,1.214215,1.182746,1.396808,1.460067,1.110612,1.088254,1.499131,0.627060,1.293888,1.112939,1.257026,1.019523,0.913883,1.190385
zzzzzz,1.260217,1.740825,1.494897,0.983961,1.709040,1.328603,0.640909,1.186568,1.362261,1.011845,1.270497,1.407751,0.995955,1.389732,1.688765,1.152020,1.279425,1.353038,1.673180,1.670196
zzzzzzt,0.798010,1.467738,1.321821,1.486993,1.286152,1.045942,1.104197,1.292188,1.450998,1.062821,1.177643,0.752978,1.074773,0.883426,1.595898,0.580896,1.006832,1.273155,1.538068,1.649999
³ation,1.104195,0.877424,0.608537,1.190866,1.354811,1.267956,1.400162,0.936934,1.313293,0.988908,1.200426,1.207888,1.295401,1.246776,0.478263,1.413981,1.286311,1.337871,1.186850,1.140723


**Exercício** O que cada cluster significa?

In [14]:
# For every cluster, print the 20 terms whose vectors lie closest to its
# centroid, together with that distance.
# Iterating over the frame's own columns keeps the loop in step with however
# many clusters were fitted, instead of repeating the literal 20.
for cluster in df_distances.columns:
    print(f'Cluster {cluster}:')
    nearest = df_distances[cluster].sort_values().head(20)
    # .items() yields (term, distance) pairs directly, so no second label
    # lookup into the frame is needed for each printed word.
    for word, distance in nearest.items():
        print(f'   {distance}: {word}')


Cluster 0:
   0.2886080113319972: gpc
   0.2890705320013189: teapot
   0.29025423558901986: 215
   0.29223452665166433: cyberware
   0.29223452665166433: ahpcrc
   0.29223452665166433: spaceball
   0.29223452665166433: clrview
   0.2934108688827475: wavefront
   0.2952501076789915: visualizer
   0.2952501076789915: softlab
   0.29735858100285595: avs
   0.29988108331817853: polyview
   0.29988108331817853: swedishchef
   0.29988108331817853: 80301
   0.29988108331817853: datafiles
   0.29988108331817853: omicron
   0.29988108331817853: scivi
   0.3025555405051017: gvl
   0.3025555405051017: icol
   0.3027042098511992: idl
Cluster 1:
   0.2101197085761219: 0_
   0.23858319935805694: pne
   0.24308261994532518: y0w
   0.24346731725513937: 1z6ei
   0.2474972336384418: z6e
   0.24794390308791966: bizw
   0.24924467946855913: zdk
   0.25129560354795505: m9f9fq
   0.2515395133791109: 6e1
   0.25368781645022614: 9f9f9f9f
   0.25633940223466645: bql
   0.25652292695037854: pnei4
   0.258044115

**Exercício** Qual o melhor número de clusters? Como identificar?