# Topic Detection General
## In this notebook:
For each time-specific dataset:
* We import the matrix of raw count C (tweet-term matrix) and the fitted Count Vectorizer (the vocabulary) cv;
* We fit the TFIDF method on the raw count matrix C;
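* We adjust the idf: scikit-learn adds a constant + 1 to every idf value (so that terms occurring in all documents are not ignored entirely; division by zero is already prevented by the idf smoothing), and we remove this offset by subtracting 1 from all the values in the idf vector;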
* We obtain the matrix X from the TFIDF and we fit the NMF method on the X;
* We save to file the fitted TFIDF transformer, the fitted NMF model and the matrices W and H, and we print the topics.

In [None]:
import pandas as pd
import numpy as np
from sklearn import decomposition
import joblib
from sklearn import feature_extraction
import scipy.sparse

In [None]:
def phrase_analyzer(text):
    """Tokenize *text* and return its tokens after phrase (bigram) merging.

    The text is lower-cased, split with ``token_pattern.findall``, filtered
    against ``stop_words`` and finally indexed into ``bigram``.

    NOTE(review): ``token_pattern``, ``stop_words`` and ``bigram`` are globals
    that are not defined in the visible part of this notebook; presumably this
    function only needs to exist so that the saved vectorizers can be
    unpickled -- confirm before calling it directly.
    """
    tokens = token_pattern.findall(text.lower())
    kept = [tok for tok in tokens if tok not in stop_words]
    return bigram[kept]

## pre-COVID

In [None]:
[C0, cv0] = joblib.load('/../data/counts_vocabulary_i.joblib')

In [None]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C0)

In [None]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C0)

In [None]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

In [None]:
W.shape, H.shape

In [None]:
#save the fitted tfidf object
joblib.dump(tfidf, '/../data/tfidf_i.joblib', compress=6)

In [None]:
#saving nmf results
joblib.dump([W,H,nmf], '/../data/WHnmf_i.joblib', compress=6)

In [None]:
#contains the vocabulary
cv0

In [None]:
#printing topics
feature_names = np.array(cv0.get_feature_names())
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

In [None]:
del C0, cv0

## early-COVID

In [None]:
[C1, cv1] = joblib.load('/../data/counts_vocabulary_ii.joblib')

In [None]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C1)

In [None]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C1)

In [None]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

In [None]:
W.shape, H.shape

In [None]:
#save the fitted tfidf object
joblib.dump(tfidf, '/../data/tfidf_ii.joblib', compress=6)

In [None]:
#saving nmf results
joblib.dump([W,H,nmf], '/../data/WHnmf_ii.joblib', compress=6)

In [None]:
#contains the vocabulary
cv1

In [None]:
#printing topics
feature_names = np.array(cv1.get_feature_names())
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

In [None]:
del C1, cv1

## pre-VAX

In [None]:
[C2, cv2] = joblib.load('/../data/counts_vocabulary_iii.joblib')

In [None]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C2)

In [None]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C2)

In [None]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

In [None]:
W.shape, H.shape

In [None]:
#save the fitted tfidf object
joblib.dump(tfidf, '/../data/tfidf_iii.joblib', compress=6)

In [None]:
#saving nmf results
joblib.dump([W,H,nmf], '/../data/WHnmf_iii.joblib', compress=6)

In [None]:
#contains the vocabulary
cv2

In [None]:
#printing topics
feature_names = np.array(cv2.get_feature_names())
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

In [None]:
del C2, cv2

## early-VAX

In [None]:
[C3, cv3] = joblib.load('/../data/counts_vocabulary_iv.joblib')

In [None]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C3)

In [None]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C3)

In [None]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

In [None]:
W.shape, H.shape

In [None]:
#save the fitted tfidf object
joblib.dump(tfidf, '/../data/tfidf_iv.joblib', compress=6)

In [None]:
#saving nmf results
joblib.dump([W,H,nmf], '/../data/WHnmf_iv.joblib', compress=6)

In [None]:
#contains the vocabulary
cv3

In [None]:
#printing topics
feature_names = np.array(cv3.get_feature_names())
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

In [None]:
del C3, cv3

## VAX-drive

In [None]:
[C4, cv4] = joblib.load('/../data/counts_vocabulary_v.joblib')

In [None]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C4)

In [None]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C4)

In [None]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

In [None]:
W.shape, H.shape

In [None]:
#save the fitted tfidf object
joblib.dump(tfidf, '/../data/tfidf_v.joblib', compress=6)

In [None]:
#saving nmf results
joblib.dump([W,H,nmf], '/../data/WHnmf_v.joblib', compress=6)

In [None]:
#contains the vocabulary
cv4

In [None]:
#printing topics
feature_names = np.array(cv4.get_feature_names())
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

In [None]:
del C4, cv4

## late-VAX

In [None]:
[C5, cv5] = joblib.load('/../data/counts_vocabulary_vi.joblib')

In [None]:
%%time
#apply the tfidf weighting
tfidf = feature_extraction.text.TfidfTransformer()
tfidf.fit(C5)

In [None]:
%%time
#tfidf normalization: to avoid the + 1 of the idf term (added to avoid division by zero)
tfidf.idf_ = tfidf.idf_ - 1
X = tfidf.transform(C5)

In [None]:
%%time
#fit the non-negative matrix factorization
nr_topics = 20
nmf = decomposition.NMF(nr_topics,
                        beta_loss='frobenius', solver='cd',
                        init='nndsvd', random_state=42)
W = nmf.fit_transform(X)
H = nmf.components_

In [None]:
W.shape, H.shape

In [None]:
#save the fitted tfidf object
joblib.dump(tfidf, '/../data/tfidf_vi.joblib', compress=6)

In [None]:
#saving nmf results
joblib.dump([W,H,nmf], '/../data/WHnmf_vi.joblib', compress=6)

In [None]:
#contains the vocabulary
cv5

In [None]:
#printing topics
feature_names = np.array(cv5.get_feature_names())
topic_strength = W.sum(axis=0)
for i in topic_strength.argsort()[::-1]:
    topic_words = feature_names[np.argsort(H[i])[::-1][:10]]
    print("[T %d] Stenght: %.2f, words: " % (i, topic_strength[i]), ",".join(topic_words))

In [None]:
del C5, cv5