In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler

In [2]:
# Toy corpus: a small list of short documents used to illustrate the vectorizer options below.
example = [
    # UFAL
    "ufal ic",
    "curso de verao no ic da ufal so no verao de 2022",
    "ufal fale",
    "curso de inverno na fale ufal",
    # UFMG
    "ufmg ic",
    "curso de verao no ic da ufmg",
    "ufmg fale",
    "curso de inverno na fale ufmg",
    # UNICAMP
    "unicamp ic",
    "curso de inverno no ic unicamp",
    # Unrelated document
    "campus ac simoes",
]

In [3]:
# Build a bag of words (BoW) from the simulated documents.
# binary=False: each column holds the number of times the term occurs in the document.
# analyzer='word' with ngram_range=(1,1): features are single whole words (unigrams).
bow = CountVectorizer(binary=False, analyzer='word', ngram_range=(1,1))
# toarray() yields a plain ndarray; todense() would return the discouraged np.matrix type.
mx = bow.fit_transform(example).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) replaces it. tolist() keeps `terms` a plain Python list.
terms = bow.get_feature_names_out().tolist()
pd.DataFrame(mx, columns=terms, index=example)

Unnamed: 0,2022,ac,campus,curso,da,de,fale,ic,inverno,na,no,simoes,so,ufal,ufmg,unicamp,verao
ufal ic,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
curso de verao no ic da ufal so no verao de 2022,1,0,0,1,1,2,0,1,0,0,2,0,1,1,0,0,2
ufal fale,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
curso de inverno na fale ufal,0,0,0,1,0,1,1,0,1,1,0,0,0,1,0,0,0
ufmg ic,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
curso de verao no ic da ufmg,0,0,0,1,1,1,0,1,0,0,1,0,0,0,1,0,1
ufmg fale,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
curso de inverno na fale ufmg,0,0,0,1,0,1,1,0,1,1,0,0,0,0,1,0,0
unicamp ic,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
curso de inverno no ic unicamp,0,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1,0


In [4]:
# With binary=True the matrix only records presence: 1 if the word occurs in the
# document, 0 otherwise. How many times a word repeats in a document is ignored.
bow = CountVectorizer(binary=True, analyzer='word', ngram_range=(1,1))
# toarray() yields a plain ndarray; todense() would return the discouraged np.matrix type.
mx = bow.fit_transform(example).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) replaces it. tolist() keeps `terms` a plain Python list.
terms = bow.get_feature_names_out().tolist()
pd.DataFrame(mx, columns=terms, index=example)

Unnamed: 0,2022,ac,campus,curso,da,de,fale,ic,inverno,na,no,simoes,so,ufal,ufmg,unicamp,verao
ufal ic,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0
curso de verao no ic da ufal so no verao de 2022,1,0,0,1,1,1,0,1,0,0,1,0,1,1,0,0,1
ufal fale,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
curso de inverno na fale ufal,0,0,0,1,0,1,1,0,1,1,0,0,0,1,0,0,0
ufmg ic,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
curso de verao no ic da ufmg,0,0,0,1,1,1,0,1,0,0,1,0,0,0,1,0,1
ufmg fale,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
curso de inverno na fale ufmg,0,0,0,1,0,1,1,0,1,1,0,0,0,0,1,0,0
unicamp ic,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0
curso de inverno no ic unicamp,0,0,0,1,0,1,0,1,1,0,1,0,0,0,0,1,0


In [5]:
# stop_words lists words treated as uninformative; they are dropped entirely.
# The upper bound of ngram_range is raised to 2, so two-word sequences (bigrams)
# are now features as well, e.g. "ac simoes", "curso inverno", "ufal ic",
# none of which existed in the previous table.
bow = CountVectorizer(binary=False, analyzer='word', stop_words=['da', 'no', 'na', 'de'], ngram_range=(1,2))
# toarray() yields a plain ndarray; todense() would return the discouraged np.matrix type.
mx = bow.fit_transform(example).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) replaces it. tolist() keeps `terms` a plain Python list.
terms = bow.get_feature_names_out().tolist()
pd.DataFrame(mx, columns=terms, index=example)

Unnamed: 0,2022,ac,ac simoes,campus,campus ac,curso,curso inverno,curso verao,fale,fale ufal,...,ufal ic,ufal so,ufmg,ufmg fale,ufmg ic,unicamp,unicamp ic,verao,verao 2022,verao ic
ufal ic,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
curso de verao no ic da ufal so no verao de 2022,1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,2,1,1
ufal fale,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
curso de inverno na fale ufal,0,0,0,0,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
ufmg ic,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
curso de verao no ic da ufmg,0,0,0,0,0,1,0,1,0,0,...,0,0,1,0,0,0,0,1,0,1
ufmg fale,0,0,0,0,0,0,0,0,1,0,...,0,0,1,1,0,0,0,0,0,0
curso de inverno na fale ufmg,0,0,0,0,0,1,1,0,1,0,...,0,0,1,0,0,0,0,0,0,0
unicamp ic,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
curso de inverno no ic unicamp,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [6]:
# Switching analyzer to 'char' makes the vectorizer count characters instead of words.
bow = CountVectorizer(analyzer='char')
# toarray() yields a plain ndarray; todense() would return the discouraged np.matrix type.
mx = bow.fit_transform(example).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) replaces it. tolist() keeps `terms` a plain Python list.
terms = bow.get_feature_names_out().tolist()
pd.DataFrame(mx, columns=terms, index=example)

Unnamed: 0,Unnamed: 1,0,2,a,c,d,e,f,g,i,l,m,n,o,p,r,s,u,v
ufal ic,1,0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0,1,0
curso de verao no ic da ufal so no verao de 2022,11,1,3,4,2,3,4,1,0,1,1,0,2,6,0,3,2,2,2
ufal fale,1,0,0,2,0,0,1,2,0,0,2,0,0,0,0,0,0,1,0
curso de inverno na fale ufal,5,0,0,3,1,1,3,2,0,1,2,0,3,2,0,2,1,2,1
ufmg ic,1,0,0,0,1,0,0,1,1,1,0,1,0,0,0,0,0,1,0
curso de verao no ic da ufmg,6,0,0,2,2,2,2,1,1,1,0,1,1,3,0,2,1,2,1
ufmg fale,1,0,0,1,0,0,1,2,1,0,1,1,0,0,0,0,0,1,0
curso de inverno na fale ufmg,5,0,0,2,1,1,3,2,1,1,1,1,3,2,0,2,1,2,1
unicamp ic,1,0,0,1,2,0,0,0,0,2,0,1,1,0,1,0,0,1,0
curso de inverno no ic unicamp,5,0,0,1,3,1,2,0,0,3,0,1,4,3,1,2,1,2,1


In [7]:
# Load and display the dataset to work with: the first CSV column becomes the
# index and the 'dt' column is parsed as datetimes.
tweets = pd.read_csv(
    "data/nCoV_tweets.csv",
    index_col=0,
    parse_dates=['dt'],
)
tweets

Unnamed: 0,dt,txt
7,2020-02-05 08:27:07+00:00,what the actual -
9,2020-02-05 08:27:08+00:00,"@jason_om It's not Left-Right, not countries n..."
13,2020-02-05 08:27:10+00:00,Uh oh.. this isnt very good news
17,2020-02-05 08:27:12+00:00,Organisers will have contingency plans for hea...
24,2020-02-05 08:27:14+00:00,&lt;FACT&gt;Regardless of origin of #coronavir...
...,...,...
32748,2020-02-06 11:10:00+00:00,Researchers say the #coronavirus may be more ...
32750,2020-02-06 11:09:56+00:00,Is it like a peach? Rotten peaches contaminat...
32754,2020-02-06 11:10:03+00:00,"#coronavirus \nFeb. 06, 2020 07:10:01 PM GMT ..."
32757,2020-02-06 11:10:04+00:00,Our #commercial #healthandsafety and #ukemplaw...


In [8]:
# Build a bag of words that accepts only n-grams of size 3 (trigrams) and removes
# scikit-learn's built-in English stop-word list. min_df=4 drops trigrams that
# appear in fewer than 4 tweets.
bow = CountVectorizer(min_df=4, stop_words='english', ngram_range=(3,3))
# Use toarray() rather than todense(): todense() returns an np.matrix, which recent
# scikit-learn estimators refuse as input; a plain ndarray works everywhere.
mx = bow.fit_transform(tweets['txt']).toarray()
mx.shape

(6706, 618)

In [9]:
# Evaluate how well each number of clusters performs using the silhouette score,
# then report the best k in the range 2..10 (the one with the highest score).
# NOTE(review): k-means is fitted on MaxAbs-scaled features, but the silhouette is
# computed on the unscaled `mx` -- confirm this is intended.
best_sil = float("-inf")
best_k = None
for k in range(2,11):
    cluster = make_pipeline(MaxAbsScaler(), KMeans(n_clusters=k, random_state=0))
    cluster.fit(mx)
    p = cluster.predict(mx)

    sil = silhouette_score(mx, p)
    if sil > best_sil:
        # Keep the running maximum. Without updating best_sil every score beats
        # -inf, so best_k would always end up as the last k tried.
        best_sil = sil
        best_k = k
    print("K = {} - Silhouette: {}".format(k, sil))

print("\n\nMelhor K: ", best_k)

K = 2 - Silhouette: 0.7849594690794792
K = 3 - Silhouette: 0.7872649732846221
K = 4 - Silhouette: 0.7899806458433944
K = 5 - Silhouette: 0.7916851880937629
K = 6 - Silhouette: 0.7917029047299742
K = 7 - Silhouette: 0.7967897372340083
K = 8 - Silhouette: 0.7965854032241488
K = 9 - Silhouette: 0.7980160269702951
K = 10 - Silhouette: 0.7987810869805089


Melhor K:  10


In [10]:
# Show every trigram that is in the bag of words.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) replaces it. tolist() keeps `terms` a plain Python list,
# so the cell still displays a list rather than a NumPy array.
terms = bow.get_feature_names_out().tolist()
terms

['00 cases 24',
 '000 000 israelis',
 '000 confirmed cases',
 '000 employees weeks',
 '000 israelis perished',
 '018 confirmed cases',
 '0205414305or whatsapp 0555171905',
 '023 infections 24',
 '06 02 2020',
 '08 00 cases',
 '10 coronavirus cases',
 '10 infected coronavirus',
 '10 minute wedding',
 '10 websites track',
 '100 000 israelis',
 '12 cases coronavirus',
 '154 023 infections',
 '15th person australia',
 '20 days surprised',
 '2019 ncov advice',
 '2019 ncov coronavirus',
 '2019 ncov https',
 '2019 novel coronavirus',
 '2020 current statistics',
 '24 000 people',
 '24 324 confirmed',
 '24 589 deaths',
 '241 dua session',
 '24x7 control room',
 '25 outside china',
 '27 000 employees',
 '28 018 confirmed',
 '30 hours birth',
 '324 confirmed cases',
 '37 year old',
 '4th february 2020',
 '564 infant infections',
 '700 cases coronavirus',
 '9orx4j6buu virus coronavirus',
 '9orx4j6buu virus https',
 'aboard cruise ship',
 'accidentally leak true',
 'accidentally leaked real',
 'add

In [11]:
# Run k-means with k = 10, the value that scored best in the silhouette evaluation.
k = 10
cluster = make_pipeline(MaxAbsScaler(), KMeans(n_clusters=k, random_state=0))
p = cluster.fit(mx).predict(mx)

# For each cluster, print its size and the 20 trigrams with the highest mean count
# among its members (ascending order, so the strongest terms come last).
for c in np.unique(p):
    members = (p == c)
    print("\nCluster {} - Size {}".format(c, members.sum()))
    mean_counts = np.array(mx[members].mean(axis=0)).squeeze()
    rank = pd.Series(mean_counts, index=terms).sort_values().tail(20)
    print(rank)


Cluster 0 - Size 6453
coronavirus update wuhan                 0.002015
coronavirus death toll                   0.002170
30 hours birth                           0.002325
accidentally leaked real                 0.002479
guan zhuang bing                         0.002634
tencent accidentally leaked              0.002634
zhuang bing du                           0.002634
just 30 hours                            0.002789
novel coronavirus 2019                   0.002944
cruise ship japan                        0.003099
2019 novel coronavirus                   0.003254
news china coronavirus                   0.003409
world health organization                0.003719
coronavirus coronaoutbreak coronanews    0.004184
coronaoutbreak coronanews ncov2019       0.004184
coronavirus asiannetwalking https        0.004339
health coronavirus asiannetwalking       0.004339
amid coronavirus outbreak                0.005114
coronavirus 2019 ncov                    0.005114
coronavirus outbreak https 

In [12]:
# Fetch the centroid of every cluster from the fitted k-means step and, for each one,
# print its 20 largest coordinates labelled by trigram (ascending order).
centroids = cluster.named_steps['kmeans'].cluster_centers_
for c, center in enumerate(centroids):
    print("\nCluster {}".format(c))
    rank = pd.Series(center, index=terms).sort_values().tail(20)
    print(rank)


Cluster 0
coronavirus outbreak china               0.002015
coronavirus death toll                   0.002170
30 hours birth                           0.002325
accidentally leaked real                 0.002479
tencent accidentally leaked              0.002634
zhuang bing du                           0.002634
guan zhuang bing                         0.002634
just 30 hours                            0.002789
novel coronavirus 2019                   0.002944
cruise ship japan                        0.003099
2019 novel coronavirus                   0.003254
news china coronavirus                   0.003409
coronavirus outbreak https               0.003719
world health organization                0.003719
coronavirus coronaoutbreak coronanews    0.004184
coronaoutbreak coronanews ncov2019       0.004184
coronavirus asiannetwalking https        0.004339
health coronavirus asiannetwalking       0.004339
coronavirus 2019 ncov                    0.005114
amid coronavirus outbreak              

In [13]:
# Add a 'cluster' column to the table holding the cluster label predicted for each
# tweet (one entry of `p` per row, in row order).
tweets['cluster'] = p

In [14]:
# Print one message per cluster: the text of the first tweet (in row order)
# assigned to that cluster, followed by a blank line.
for c in np.unique(p):
    first_txt = tweets.loc[tweets['cluster'] == c, 'txt'].iloc[0]
    print('Cluster {} = {}'.format(c, first_txt))
    print()

Cluster 0 = what the actual -

Cluster 1 = What is coronavirus? The biggest questions about the outbreak, answered. SURVIVE SEE DETAILS AT ==&gt;... https://t.co/21AGGnBj58

Cluster 2 = @IsChinar OxyBreath Pro
Highly Effective Anti-Pollution Clean Air Breathing Mask.
Full details please click on a li... https://t.co/uWq9DZ9XI9

Cluster 3 = A 37-year-old woman has become the 15th person in Australia diagnosed with coronavirus - the fifth in Queensland.... https://t.co/UtfPvHwjvR

Cluster 4 = Vals Is Here Surprise That Special Someone Now

You can reach Us On 0205414305or WhatsApp 0555171905 
For The Bes... https://t.co/E9CSZvxFCT

Cluster 5 = 

Can you help us by publishing this link to help raise Awareness of the suffering in #Syria

#Syrie
C'est dur a r... https://t.co/FnS9hjdXFo

Cluster 6 = Did China's Tencent Accidentally Leak The True Terrifying #Coronavirus Statistics https://t.co/gyd8C00Pxg

Cluster 7 = Coronavirus Latest Updates: Everything You Need to Know SEE DETAILS AT ==&gt