# Topic Detection General
## In this notebook:
For each time-specific dataset:
* We import the matrix of raw count C (tweet-term matrix) and the fitted Count Vectorizer (the vocabulary) cv;
* We fit the TFIDF method on the raw count matrix C;
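* We do idf normalization: scikit-learn adds a constant + 1 to every idf value (so that terms occurring in all documents are not ignored entirely; division by zero is already prevented by the idf smoothing), and to remove this offset we subtract 1 from all the values in the idf vector;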
* We obtain the matrix X from the TFIDF and we fit the NMF method on the X;
* We save to file the NMF, the W and the H and we print the topics.

In [1]:
import pandas as pd
import numpy as np
from sklearn import decomposition
import joblib
from sklearn import feature_extraction
import scipy.sparse

In [2]:
def phrase_analyzer(text):
    """Turn a raw text into the list of tokens used by the CountVectorizers.

    The text is lower-cased, split with ``token_pattern.findall``, stripped of
    every token found in ``stop_words`` and finally passed through ``bigram``.

    The loaded vectorizers reference this function as their ``analyzer``, so it
    presumably has to be defined under this exact name before they are unpickled.

    NOTE(review): ``token_pattern``, ``stop_words`` and ``bigram`` are not
    defined anywhere in this notebook; calling the function raises NameError
    unless they are created first (``bigram`` is presumably a phrase model that
    joins frequent word pairs with "_" -- confirm).
    """
    tokens = token_pattern.findall(text.lower())
    kept = []
    for token in tokens:
        if token not in stop_words:
            kept.append(token)
    return bigram[kept]

## pre-COVID

In [3]:
# Load the raw tweet-term count matrix and its fitted CountVectorizer (pre-COVID window).
# joblib.load unpickles arbitrary objects: only open files you produced yourself.
# NOTE(review): '/../data' is an absolute path (resolves to /data on POSIX) -- confirm '../data' was not intended.
C0, cv0 = joblib.load('/../data/counts_vocabulary_i.joblib')

In [4]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C0)

CPU times: user 9.13 ms, sys: 732 µs, total: 9.86 ms
Wall time: 9.12 ms


TfidfTransformer()

In [5]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C0)

CPU times: user 57.3 ms, sys: 83 µs, total: 57.4 ms
Wall time: 56.6 ms


In [6]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

CPU times: user 28.5 s, sys: 3.83 s, total: 32.3 s
Wall time: 18.8 s


In [7]:
# Sanity check: W is (tweets, topics) and H is (topics, vocabulary terms).
W.shape, H.shape

((204275, 20), (20, 7458))

In [8]:
# Persist the fitted tf-idf transformer of the pre-COVID window (compressed joblib file).
joblib.dump(value=tfidf, filename='/../data/tfidf_i.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/tfidf_preCOVID.joblib']

In [9]:
# Persist the factor matrices and the fitted NMF model together, in the order [W, H, nmf].
joblib.dump(value=[W, H, nmf], filename='/../data/WHnmf_i.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/WHnmf_preCOVID.joblib']

In [10]:
# Display the fitted CountVectorizer of the pre-COVID window; it holds the
# vocabulary used below to label the columns of H.
cv0

CountVectorizer(analyzer=<function phrase_analyzer at 0x7fbea8286c10>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [11]:
# Print the topics, strongest first, each with its 10 highest-weighted terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(cv0, "get_feature_names_out"):
    feature_names = np.asarray(cv0.get_feature_names_out())
else:
    feature_names = np.array(cv0.get_feature_names())
# Topic strength = column sum of W, i.e. the topic's total weight over all tweets.
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    # H[i] is indexed like feature_names; keep the 10 largest entries.
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

[T 15] Stenght: 1146.24, words:  anni,influenza,bambini,stato,medici,legge,due,antinfluenzale,salute,bimbo
[T 1] Stenght: 625.56, words:  info,trova,kg,mesi,maschio,affida,chip,adozione,taglia_media,futura_taglia
[T 2] Stenght: 608.47, words:  chip_gratuiti,abruzzo,tel,nord,canile,giorni,nord_arriva,ovunque,tg_media,cuccioli
[T 9] Stenght: 545.02, words:  castrato,sano,socievole,francesca,microchip,preaffido,anno,compatibile,box,info
[T 0] Stenght: 513.95, words:  figlie,ucciderli,disagio,scema,spieghiamo,incatenata,crea,spiegare,vaccina,contatto
[T 6] Stenght: 417.79, words:  sciopero,fame,decenni,male,studi,creare,figli,gente,ivrea,asilo
[T 4] Stenght: 404.99, words:  roma,fine,coccole,macchia,gioca,maschi_femmine,moglie,troppo,canile,chippato
[T 3] Stenght: 390.77, words:  morbillo,congo,epidemia,casi,morti,robertoburioni,mila,bambini,finora,ideologia
[T 12] Stenght: 376.39, words:  pd,obbligo,gt,mes,tav,governo,tap,stopmes,approvata,tap_tav
[T 14] Stenght: 370.60, words:  cucciolo,

In [12]:
# Drop the count matrix and vectorizer of this window before the next one is loaded.
del C0, cv0

## early-COVID

In [13]:
# Load the raw tweet-term count matrix and its fitted CountVectorizer (early-COVID window).
# joblib.load unpickles arbitrary objects: only open files you produced yourself.
# NOTE(review): '/../data' is an absolute path (resolves to /data on POSIX) -- confirm '../data' was not intended.
C1, cv1 = joblib.load('/../data/counts_vocabulary_ii.joblib')

In [14]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C1)

CPU times: user 6.15 ms, sys: 0 ns, total: 6.15 ms
Wall time: 5.28 ms


TfidfTransformer()

In [15]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C1)

CPU times: user 29.6 ms, sys: 0 ns, total: 29.6 ms
Wall time: 29.1 ms


In [16]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

CPU times: user 32.1 s, sys: 2.6 s, total: 34.7 s
Wall time: 24.3 s


In [17]:
# Sanity check: W is (tweets, topics) and H is (topics, vocabulary terms).
W.shape, H.shape

((125887, 20), (20, 6171))

In [18]:
# Persist the fitted tf-idf transformer of the early-COVID window (compressed joblib file).
joblib.dump(value=tfidf, filename='/../data/tfidf_ii.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/tfidf_earlyCOVID.joblib']

In [19]:
# Persist the factor matrices and the fitted NMF model together, in the order [W, H, nmf].
joblib.dump(value=[W, H, nmf], filename='/../data/WHnmf_ii.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/WHnmf_earlyCOVID.joblib']

In [20]:
# Display the fitted CountVectorizer of the early-COVID window; it holds the
# vocabulary used below to label the columns of H.
cv1

CountVectorizer(analyzer=<function phrase_analyzer at 0x7fbea8286c10>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [21]:
# Print the topics, strongest first, each with its 10 highest-weighted terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(cv1, "get_feature_names_out"):
    feature_names = np.asarray(cv1.get_feature_names_out())
else:
    feature_names = np.array(cv1.get_feature_names())
# Topic strength = column sum of W, i.e. the topic's total weight over all tweets.
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    # H[i] is indexed like feature_names; keep the 10 largest entries.
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

[T 16] Stenght: 676.96, words:  coronavirus,tempo,possibile,breve,spera,trovino,test,australiano_supera,laboratorio,aggiornamento
[T 14] Stenght: 639.21, words:  info,adozione,preaffido,sverminato,socievole,affida,taglia_media,castrato,cerca_casa,affido
[T 8] Stenght: 538.81, words:  virus,esiste,cinese,anni,gravi,trovare,gira,chiama,nuovo,paura
[T 0] Stenght: 458.99, words:  vite_umane,desidererei,implorare,ceci,trovasse,piacere,immediatamente,tante,ginocchio,motivi
[T 1] Stenght: 449.43, words:  mandare_scuola,garantire,cavernicoli,colpisce,improvvisamente,diventare,prevenzione,libertà,figli,vedere
[T 19] Stenght: 445.28, words:  salvini,morbillo,obbligo,scusate,abolizione,epidemia,chiudere_porti,difenderci,vera,corso
[T 13] Stenght: 417.75, words:  bambini,scuola,cina,esclusione,tornano,inclusiva,italiani,isolamento,andare_scuola,bambini_cinesi
[T 2] Stenght: 417.18, words:  arroganti,suggerito,cialtroni,egoisti,ignoranti,difficile,conferma,questione,troppi,paese
[T 18] Stenght: 374

In [22]:
# Drop the count matrix and vectorizer of this window before the next one is loaded.
del C1, cv1

## pre-VAX

In [23]:
# Load the raw tweet-term count matrix and its fitted CountVectorizer (pre-VAX window).
# joblib.load unpickles arbitrary objects: only open files you produced yourself.
# NOTE(review): '/../data' is an absolute path (resolves to /data on POSIX) -- confirm '../data' was not intended.
C2, cv2 = joblib.load('/../data/counts_vocabulary_iii.joblib')

In [24]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C2)

CPU times: user 37.2 ms, sys: 8.16 ms, total: 45.4 ms
Wall time: 44.7 ms


TfidfTransformer()

In [25]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C2)

CPU times: user 290 ms, sys: 31.9 ms, total: 321 ms
Wall time: 320 ms


In [26]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

CPU times: user 3min 19s, sys: 21.2 s, total: 3min 40s
Wall time: 3min


In [27]:
# Sanity check: W is (tweets, topics) and H is (topics, vocabulary terms).
W.shape, H.shape

((1036177, 20), (20, 32842))

In [28]:
# Persist the fitted tf-idf transformer of the pre-VAX window (compressed joblib file).
joblib.dump(value=tfidf, filename='/../data/tfidf_iii.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/tfidf_preVAX.joblib']

In [29]:
# Persist the factor matrices and the fitted NMF model together, in the order [W, H, nmf].
joblib.dump(value=[W, H, nmf], filename='/../data/WHnmf_iii.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/WHnmf_preVAX.joblib']

In [30]:
# Display the fitted CountVectorizer of the pre-VAX window; it holds the
# vocabulary used below to label the columns of H.
cv2

CountVectorizer(analyzer=<function phrase_analyzer at 0x7fbea8286c10>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [31]:
# Print the topics, strongest first, each with its 10 highest-weighted terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(cv2, "get_feature_names_out"):
    feature_names = np.asarray(cv2.get_feature_names_out())
else:
    feature_names = np.array(cv2.get_feature_names())
# Topic strength = column sum of W, i.e. the topic's total weight over all tweets.
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    # H[i] is indexed like feature_names; keep the 10 largest entries.
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

[T 14] Stenght: 3265.23, words:  virus,prima,esiste,serve,nessuno,vogliono,cura,governo,fino,niente
[T 1] Stenght: 2307.05, words:  info,mesi,trova,kg,anni,chippato,cucciolo,famiglia,chip,centro_nord
[T 3] Stenght: 2031.60, words:  coronavirus,test,ansa,pre_clinici,pronto,israele,anti,primo,italiano,istituto_migal
[T 5] Stenght: 1689.58, words:  sperimentazione,uomo,italiano,spallanzani,anti_covid,ricerca,iniziata,notizia,test,volontari
[T 15] Stenght: 1686.86, words:  bill_gates,oms,governo,astrazeneca,effetti_collaterali,schöning,persone,bambini,milioni,italiani
[T 19] Stenght: 1478.21, words:  influenza,cinese,vinto,gara,lombardia_azienda,certificato_aifa,lombardia,anziani,stato,bambini
[T 16] Stenght: 1332.31, words:  antinfluenzali,euro,dosi,prezzo,marzo,rifiutato,milioni,comprato,farmaco,cinque_volte
[T 4] Stenght: 1284.49, words:  putin,russia,figlia,russo,primo,esprimono_dubbi,scacco_matto,trasformato_fanatici,decantavano_fino,cartello_criminale
[T 9] Stenght: 1276.31, words:  

In [32]:
# Drop the count matrix and vectorizer of this window before the next one is loaded.
del C2, cv2

## early-VAX

In [33]:
# Load the raw tweet-term count matrix and its fitted CountVectorizer (early-VAX window).
# joblib.load unpickles arbitrary objects: only open files you produced yourself.
# NOTE(review): '/../data' is an absolute path (resolves to /data on POSIX) -- confirm '../data' was not intended.
C3, cv3 = joblib.load('/../data/counts_vocabulary_iv.joblib')

In [34]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C3)

CPU times: user 205 ms, sys: 32.1 ms, total: 237 ms
Wall time: 235 ms


TfidfTransformer()

In [35]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C3)

CPU times: user 1.72 s, sys: 120 ms, total: 1.84 s
Wall time: 1.84 s


In [36]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

CPU times: user 16min 7s, sys: 1min 10s, total: 17min 18s
Wall time: 15min 45s


In [37]:
# Sanity check: W is (tweets, topics) and H is (topics, vocabulary terms).
W.shape, H.shape

((5137559, 20), (20, 95428))

In [38]:
# Persist the fitted tf-idf transformer of the early-VAX window (compressed joblib file).
joblib.dump(value=tfidf, filename='/../data/tfidf_iv.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/tfidf_earlyVAX.joblib']

In [39]:
# Persist the factor matrices and the fitted NMF model together, in the order [W, H, nmf].
joblib.dump(value=[W, H, nmf], filename='/../data/WHnmf_iv.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/WHnmf_earlyVAX.joblib']

In [40]:
# Display the fitted CountVectorizer of the early-VAX window; it holds the
# vocabulary used below to label the columns of H.
cv3

CountVectorizer(analyzer=<function phrase_analyzer at 0x7fbea8286c10>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [41]:
# Print the topics, strongest first, each with its 10 highest-weighted terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(cv3, "get_feature_names_out"):
    feature_names = np.asarray(cv3.get_feature_names_out())
else:
    feature_names = np.array(cv3.get_feature_names())
# Topic strength = column sum of W, i.e. the topic's total weight over all tweets.
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    # H[i] is indexed like feature_names; keep the 10 largest entries.
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

[T 4] Stenght: 10539.78, words:  stato,obbligo,medici,draghi,vogliono,governo,nessuno,mesi,casa,gente
[T 1] Stenght: 8450.46, words:  covid,morto,muore,positivi,vaccinoanticovid,gennaio,muori,decessi,positivo,vaccinocovid
[T 2] Stenght: 6475.15, words:  astrazeneca,ema,aifa,trombosi,somministrazione,sospeso,sicuro,effetti_collaterali,lotto,sospensione
[T 17] Stenght: 5476.49, words:  virus,lockdown,varianti,serve,malattia,anno,rna,efficacia,trasmissione,dati
[T 18] Stenght: 5039.77, words:  campagna,anticovid,corso,mattina,roma,precedenti,istituto_spallanzani,presidente_mattarella,nati,coloro
[T 7] Stenght: 4840.33, words:  anni,riflettete,antipolio,diversa,foto,potuto,morto,muore,due,mesi
[T 11] Stenght: 4807.17, words:  prima,giorni,vaccinata,persona,giornalisti,volta,scanzi,anziani,secondo,notizia
[T 5] Stenght: 4677.28, words:  pfizer,funziona,moderna,efficace,dati,efficacia,israele,ceo,effetti_collaterali,seconda_dose
[T 3] Stenght: 4498.25, words:  milioni,dosi,ue,due,somministra

In [42]:
# Drop the count matrix and vectorizer of this window before the next one is loaded.
del C3, cv3

## VAX-drive

In [111]:
# Load the raw tweet-term count matrix and its fitted CountVectorizer (VAX-drive window).
# joblib.load unpickles arbitrary objects: only open files you produced yourself.
# NOTE(review): '/../data' is an absolute path (resolves to /data on POSIX) -- confirm '../data' was not intended.
C4, cv4 = joblib.load('/../data/counts_vocabulary_v.joblib')

In [44]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C4)

CPU times: user 162 ms, sys: 23.8 ms, total: 186 ms
Wall time: 184 ms


TfidfTransformer()

In [45]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C4)

CPU times: user 1.33 s, sys: 92.2 ms, total: 1.43 s
Wall time: 1.43 s


In [46]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

CPU times: user 15min 32s, sys: 1min 6s, total: 16min 39s
Wall time: 15min 12s


In [47]:
# Sanity check: W is (tweets, topics) and H is (topics, vocabulary terms).
W.shape, H.shape

((4160533, 20), (20, 85922))

In [48]:
# Persist the fitted tf-idf transformer of the VAX-drive window (compressed joblib file).
joblib.dump(value=tfidf, filename='/../data/tfidf_v.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/tfidf_VAXdrive.joblib']

In [53]:
# Persist the factor matrices and the fitted NMF model together, in the order [W, H, nmf].
joblib.dump(value=[W, H, nmf], filename='/../data/WHnmf_v.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/WHnmf_VAXdrive.joblib']

In [54]:
# Display the fitted CountVectorizer of the VAX-drive window; it holds the
# vocabulary used below to label the columns of H.
# NOTE(review): the saved output of this cell is a NameError left by an
# out-of-order run (the load cell was executed later); re-run it after the load cell.
cv4

NameError: name 'cv4' is not defined

In [113]:
# Print the topics, strongest first, each with its 10 highest-weighted terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(cv4, "get_feature_names_out"):
    feature_names = np.asarray(cv4.get_feature_names_out())
else:
    feature_names = np.array(cv4.get_feature_names())
# Topic strength = column sum of W, i.e. the topic's total weight over all tweets.
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    # H[i] is indexed like feature_names; keep the 10 largest entries.
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

[T 5] Stenght: 6250.78, words:  covid,ricoverati,bambini,ospedale,casi,giugno,decessi,positivi,preso,cura
[T 0] Stenght: 5519.50, words:  greenpass,andare,vogliono,casa,gente,niente,pass,nessuno,draghi,basta
[T 17] Stenght: 4943.84, words:  persone,giorno,mila,morte,decessi,ricevuto,contagiose,migliaia,completamente,possono
[T 16] Stenght: 4665.76, words:  virus,variante_delta,varianti,serve,circolazione,rischio,contrarre,mattarella,fauci,dovere_morale
[T 12] Stenght: 4102.39, words:  pfizer,seconda_dose,israele,efficacia,dosi,giorni,variante_delta,prima_dose,muore,dose
[T 18] Stenght: 3974.52, words:  stato,dose,almeno,vaiolo,emergenza,diritto,sperimentale,popolazione,completamente,muori
[T 13] Stenght: 3853.36, words:  prima,seconda_dose,giorno,vaccinatevi,tampone,volta,adesso,nomask,nopass,pragmatica
[T 9] Stenght: 3692.78, words:  milioni,italiani,dosi,dose,stati,coronavirus,ricevuto_almeno,figliuolo,dati_aggiornati,colpiti
[T 4] Stenght: 3636.65, words:  anni,muore,morta,due,foto,

In [None]:
# Drop the count matrix and vectorizer of this window before the next one is loaded.
# NOTE(review): this cell has no execution count in the saved notebook, i.e. it was not run.
del C4, cv4

## late-VAX

In [55]:
# Load the raw tweet-term count matrix and its fitted CountVectorizer (late-VAX window).
# joblib.load unpickles arbitrary objects: only open files you produced yourself.
# NOTE(review): '/../data' is an absolute path (resolves to /data on POSIX) -- confirm '../data' was not intended.
C5, cv5 = joblib.load('/../data/counts_vocabulary_vi.joblib')

In [56]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C5)

CPU times: user 239 ms, sys: 32.1 ms, total: 271 ms
Wall time: 269 ms


TfidfTransformer()

In [57]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C5)

CPU times: user 1.79 s, sys: 148 ms, total: 1.94 s
Wall time: 1.94 s


In [58]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_



CPU times: user 26min 42s, sys: 1min 50s, total: 28min 32s
Wall time: 26min 54s


In [59]:
# Sanity check: W is (tweets, topics) and H is (topics, vocabulary terms).
W.shape, H.shape

((5450244, 20), (20, 100285))

In [60]:
# Persist the fitted tf-idf transformer of the late-VAX window (compressed joblib file).
joblib.dump(value=tfidf, filename='/../data/tfidf_vi.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/tfidf_lateVAX.joblib']

In [61]:
# Persist the factor matrices and the fitted NMF model together, in the order [W, H, nmf].
joblib.dump(value=[W, H, nmf], filename='/../data/WHnmf_vi.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/WHnmf_lateVAX.joblib']

In [62]:
# Display the fitted CountVectorizer of the late-VAX window; it holds the
# vocabulary used below to label the columns of H.
cv5

CountVectorizer(analyzer=<function phrase_analyzer at 0x7fbea8286c10>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [63]:
# Print the topics, strongest first, each with its 10 highest-weighted terms.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(cv5, "get_feature_names_out"):
    feature_names = np.asarray(cv5.get_feature_names_out())
else:
    feature_names = np.array(cv5.get_feature_names())
# Topic strength = column sum of W, i.e. the topic's total weight over all tweets.
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    # H[i] is indexed like feature_names; keep the 10 largest entries.
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

[T 18] Stenght: 14124.08, words:  virus,serve,gente,paura,cazzo,capito,medico,nessuno,funziona,vero
[T 15] Stenght: 6507.83, words:  tamponi,tampone,gratis,lavoro,lavoratori,pagare,vogliono,settimana,lavorare,ore
[T 5] Stenght: 5847.54, words:  green_pass,obbligatorio,governo,giorno,nogreenpass,trieste,contagi,possesso,strumento,nulla
[T 3] Stenght: 5735.92, words:  obbligo,passaporto_sanitario,draghi,manifestazione,governo,grande,operatori_sanitari,legge,massa,sanitari
[T 13] Stenght: 5229.42, words:  prima,dose,almeno,ricevuto,morto,seconda_dose,volta,enne,seconda,popolazione
[T 17] Stenght: 5147.79, words:  mesi,due,protezione,efficacia,anno,casa,immunità,durata,dosi,richiamo
[T 9] Stenght: 4757.94, words:  greenpass,nogreenpass,greenpassobbligatorio,governo,obbligatorio,draghi,serve,gran_bretagna,contrari,ricatto
[T 19] Stenght: 4594.20, words:  persone,morte,migliaia,muoiono,causa,manifestazioni,vogliono,possono,decessi,miliardi
[T 4] Stenght: 4395.50, words:  anni,muore,medico,mo

In [64]:
# Drop the count matrix and vectorizer of the last window; they are no longer used.
del C5, cv5