# Preliminary Topic Detection General 
## In this notebook:
* We import and manipulate the tweets dataframe by adding the master text and sorting tweets by date;
* We clean the text and add the respective field to the dataframe (we do not remove hashtags and mentions);
* For each time window:
    * We build the vocabulary by fitting the count vectorizer on the clean_text field (WITHOUT TEXT DUPLICATES). Bigrams are also included in the vocabulary;
    * We compute the Tweet-Term Matrix (C2) over all the tweets (not just the ones with unique text);
    * We save the matrix C2 and the fitted count vectorizer (vocabulary) to file.

In [None]:
import pandas as pd
import numpy as np
from IPython.display import clear_output
import random
import scipy.sparse
from scipy.sparse import hstack, coo_matrix, vstack
from sklearn import feature_extraction
import joblib

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import regex as re
import nltk
from nltk.corpus import stopwords

In [None]:
from gensim.models.phrases import Phraser, Phrases

In [None]:
%%time
#import the tweets_df

tweets_df = pd.read_csv('/home/gcrupi/6_time_windows/github/tweets_example.csv').drop(['Unnamed: 0'],axis=1)

In [None]:
# Number of non-missing values in each column of the loaded table.
tweets_df.count()

### Transforming 'created_at' fields into datetime objects and sorting tweets by date

In [None]:
# Parse the 'created_at' strings into datetime values. The format string treats
# "+0000" as literal text, so every row is expected to carry exactly that offset.
# The resulting Series keeps the index of tweets_df.
cr_at_series = pd.to_datetime(tweets_df['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')

# Replace the string column with the parsed one, re-inserted at column position 2.
tweets_df = tweets_df.drop(columns=['created_at'])
tweets_df.insert(loc=2, column='created_at', value=cr_at_series, allow_duplicates=True)

In [None]:
# Chronological ordering: earliest tweet first.
sort_tweets_df = tweets_df.sort_values('created_at')

In [None]:
# The unsorted frame is no longer referenced below; only sort_tweets_df is used from here on.
del tweets_df

In [None]:
# Boundary dates (midnight) used to cut the timeline into time windows.
sep_5th_19 = pd.Timestamp(year=2019, month=9, day=5)
jan_1st_20 = pd.Timestamp(year=2020, month=1, day=1)
mar_9th_20 = pd.Timestamp(year=2020, month=3, day=9)
nov_1st_20 = pd.Timestamp(year=2020, month=11, day=1)
apr_17_21 = pd.Timestamp(year=2021, month=4, day=17)
aug_1st_21 = pd.Timestamp(year=2021, month=8, day=1)
nov_8th_21 = pd.Timestamp(year=2021, month=11, day=8)

In [None]:
# Row count before self-loops are removed.
len(sort_tweets_df)

In [None]:
# Remove self-loops: rows whose 'id_usr' equals 'id_usr_rt'.
is_self_loop = sort_tweets_df['id_usr'] == sort_tweets_df['id_usr_rt']
self_index = sort_tweets_df.loc[is_self_loop].index
sort_tweets_df = sort_tweets_df.drop(index=self_index)

In [None]:
# Row count after self-loops have been removed.
len(sort_tweets_df)

In [None]:
# Renumber the rows 0..len(sort_tweets_df)-1 after sorting and dropping self-loops.
# reset_index(drop=True) discards the old labels and avoids building a throwaway
# numpy array / Series just to overwrite the index.
sort_tweets_df = sort_tweets_df.reset_index(drop=True)

In [None]:
# 'id_usr_rt' was only needed to detect self-loops; discard the column.
sort_tweets_df = sort_tweets_df.drop(columns=['id_usr_rt'])

## Defining stopwords

In [None]:
# Display NLTK's Italian stop-word list. The trailing comment lists extra words that the
# built-in list does not need to contain; they are added by hand in `more_more` below.
stopwords.words('italian') #'forse', 'qualche', 'qualcosa', 'chissà', 'po', 'stata', 'fatta', 'fatto', 'alcuni', 
#'quasi', 'oltre', 'fate', 'to', 'farne', 'far', 'ecco', 'però', 'sì', 'circa', 'state', 'ok', 'magari', 'so', 
#'ieri', 'oggi', 'stare', 'perchè', 'eh', 'ah', 'vabbè', 'ce', 'fra', 'proprio', 'te', 'pensa', 'vuoi', 'sai', 
#'puoi', 'devi', 'vai', 'fatti', 'guarda', 'dico', 'sa', 'sti', 'allora', 'tutte','altre', 'comunque', 'avere', 'deve'

In [None]:
# Stop-word lists shipped with NLTK.
stop = stopwords.words('italian')
en_stop = stopwords.words('english')

# Vaccine-related terms that are later merged into `stop_words`.
# NOTE(review): presumably the keywords used to collect the tweets — confirm.
# 'vaccine' appears twice; harmless because the list ends up in a set.
query = [
    'vacc', 'vaccinale', 'vaccinali', 'vaccinano', 'vaccinarci', 'vaccinare', 'vaccinarsi',
    'vaccinate', 'vaccinati', 'vaccinato', 'vaccinaz', 'vaccinazione', 'vaccinazioni',
    'vaccines', 'vax', 'vaccine',
    'vaccini', 'vaccinista', 'vaccinisti', 'vaccino', 'antivaccinisti', 'freevax', 'iovaccino',
    'nonvaccinato', 'novax', 'obbligovaccinale', 'provax', 'ridacciilvaccino', 'vaccine',
]

# Pre-compiled patterns used by cleantext().
# A URL is the scheme plus the run of non-whitespace characters that follows it, so
# any text after a link on the same line is preserved (a trailing `.*` would delete it).
re_url = re.compile(r'https?://\S+', flags=re.U)
re_htg = re.compile(r'#', flags=re.U)        # hashtag sign; deliberately NOT applied in cleantext
re_hnd = re.compile(r'@', flags=re.U)        # mention sign (the handle itself is kept)
re_wrd = re.compile(r'[^\w]+ ', flags=re.U)  # run of non-word characters followed by a space
re_num = re.compile(r'[0-9]+', flags=re.U)   # runs of digits


def cleantext(txt):
    """Normalise the raw text of a tweet.

    Steps, in order: remove URLs, replace '@' with a space, collapse every run of
    non-word characters that is followed by a space into a single space, replace
    digit runs with a space, then strip and lower-case. The '#' sign is kept.

    Parameters
    ----------
    txt : str
        Raw text. Any non-string value (e.g. None or NaN from a missing field)
        is treated as empty text.

    Returns
    -------
    str
        The cleaned, lower-cased text ('' for non-string input).
    """
    if not isinstance(txt, str):
        return ''
    t = re_url.sub('', txt)
    t = re_hnd.sub(' ', t)
    t = re_wrd.sub(' ', t)
    t = re_num.sub(' ', t)
    return t.strip().lower()

In [None]:
# Hand-picked additions to the NLTK lists: frequent Italian filler words plus
# Twitter/URL residue such as 'rt', 'co' and 'https'.
more_stop = [
    'già', 'poi', 'solo', 'no', 'fa', 'può', 'quindi', 'quando', 'x', 'ogni', 'altro', 'così',
    'mai', 'tutta', 'ancora',
    'ora', 'molto', 'd', 'via', 'sempre', 'rt', 'co', 'https', 'dopo', 'fare', 'fatto', 'italia',
    'essere', 'cosa',
    'oggi', 'bene', 'dire', 'dice', 'vuole', 'vaccinati', 'vaccino', 'vaccini', 'vaccinato',
    'senza', 'altri', 'me',
    'detto', 'meno', 'invece', 'va', 'grazie',
]

# A second batch of extra stop words.
more_more = [
    'forse', 'qualche', 'qualcosa', 'chissà', 'po', 'stata', 'fatta', 'fatto', 'alcuni',
    'quasi', 'oltre', 'fate', 'to', 'farne', 'far', 'ecco', 'però', 'sì', 'circa', 'state',
    'ok', 'magari', 'so',
    'ieri', 'oggi', 'stare', 'perchè', 'eh', 'ah', 'vabbè', 'ce', 'fra', 'proprio', 'te',
    'pensa', 'vuoi', 'sai',
    'puoi', 'devi', 'vai', 'fatti', 'guarda', 'dico', 'sa', 'sti', 'allora', 'tutte', 'altre',
    'comunque', 'avere', 'deve',
]

# Union of every list; duplicates across lists collapse in the set.
stop_words = {*stop, *query, *en_stop, *more_stop, *more_more}

# pre-COVID

In [None]:
# pre-COVID window: every tweet created before jan_1st_20.
# .copy() yields an independent frame, so adding the 'clean_text' column to it
# later does not raise SettingWithCopyWarning.
tw0_df = sort_tweets_df[sort_tweets_df['created_at'] < jan_1st_20].copy()

In [None]:
# Non-missing values per column in the pre-COVID window.
tw0_df.count()

## Text cleaning

In [None]:
%%time
#cleaning the text and adding a 'clean_text' field
tw0_df['clean_text'] = tw0_df['master_text'].apply(lambda txt: cleantext(txt))

In [None]:
# Preview the first rows, now including 'clean_text'.
tw0_df.head()

In [None]:
# (number of tweets, number of tweets with distinct master_text) for this window.
# Note the vocabulary below de-duplicates on clean_text, not master_text.
len(tw0_df), len(tw0_df.drop_duplicates(subset=['master_text']))

## Phraser
In this section we add bigrams to the vocabulary, namely pairs of words that often occur together (e.g. "green_pass" or "new_york").

In [None]:
def phrase_analyzer(text):
    """Turn a document into the token sequence used by the count vectorizer.

    The text is lower-cased and tokenised with the module-level ``token_pattern``;
    tokens found in ``stop_words`` are discarded and the remaining ones are passed
    through the module-level ``bigram`` model, whose result is returned as is.
    All three globals are read at call time, so they must exist before the first call.
    """
    kept = []
    for token in token_pattern.findall(text.lower()):
        if token in stop_words:
            continue
        kept.append(token)
    return bigram[kept]

In [None]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw0_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

In [None]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

## Fitting the count vectorizer on the clean_text field

In [None]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw0_df.clean_text.drop_duplicates())

In [None]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw0_df.clean_text)

In [None]:
# Release the window dataframe; the remaining cells of this section only use C2 and cv2.
del tw0_df

In [None]:
# Display the repr of the tweet-term count matrix for this window.
C2

In [None]:
# Save the raw count matrix C2 and the fitted vectorizer cv2 together, as a two-element list.
# NOTE(review): the path starts with '/..', i.e. it is absolute — confirm the intended output directory.
# NOTE(review): cv2 wraps phrase_analyzer, which reads the globals bigram / token_pattern /
# stop_words; those objects are not part of this dump — confirm loaders re-create them.

joblib.dump([C2,cv2], '/../data/counts_vocabulary_i.joblib', compress=6)

# early-COVID

In [None]:
# early-COVID window: jan_1st_20 <= created_at < mar_9th_20.
# A boolean mask is label-aligned, so the selection does not depend on the index being
# exactly 0..n-1 (np.where returns positions, which .loc would read as labels).
# .copy() yields an independent frame, so adding 'clean_text' later raises no
# SettingWithCopyWarning.
in_window = (sort_tweets_df['created_at'] >= jan_1st_20) & (sort_tweets_df['created_at'] < mar_9th_20)
tw1_df = sort_tweets_df.loc[in_window].copy()

In [None]:
# Non-missing values per column in the early-COVID window.
tw1_df.count()

## Text cleaning

In [None]:
%%time
#cleaning the text and adding a 'clean_text' field
tw1_df['clean_text'] = tw1_df['master_text'].apply(lambda txt: cleantext(txt))

In [None]:
# Preview the first rows, now including 'clean_text'.
tw1_df.head()

In [None]:
# (number of tweets, number of tweets with distinct master_text) for this window.
# Note the vocabulary below de-duplicates on clean_text, not master_text.
len(tw1_df), len(tw1_df.drop_duplicates(subset=['master_text']))

## Phraser
In this section we add bigrams to the vocabulary, namely pairs of words that often occur together (e.g. "green_pass" or "new_york").

In [None]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw1_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

In [None]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

## Fitting the count vectorizer on the clean_text field

In [None]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw1_df.clean_text.drop_duplicates())

In [None]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw1_df.clean_text)

In [None]:
# Release the window dataframe; the remaining cells of this section only use C2 and cv2.
del tw1_df

In [None]:
# Display the repr of the tweet-term count matrix for this window.
C2

In [None]:
# Save the raw count matrix C2 and the fitted vectorizer cv2 together, as a two-element list.
# NOTE(review): the path starts with '/..', i.e. it is absolute — confirm the intended output directory.
# NOTE(review): cv2 wraps phrase_analyzer, which reads the globals bigram / token_pattern /
# stop_words; those objects are not part of this dump — confirm loaders re-create them.

joblib.dump([C2,cv2], '/../data/counts_vocabulary_ii.joblib', compress=6)

# pre-VAX

In [None]:
# pre-VAX window: mar_9th_20 <= created_at < nov_1st_20.
# A boolean mask is label-aligned, so the selection does not depend on the index being
# exactly 0..n-1 (np.where returns positions, which .loc would read as labels).
# .copy() yields an independent frame, so adding 'clean_text' later raises no
# SettingWithCopyWarning.
in_window = (sort_tweets_df['created_at'] >= mar_9th_20) & (sort_tweets_df['created_at'] < nov_1st_20)
tw2_df = sort_tweets_df.loc[in_window].copy()

In [None]:
# Non-missing values per column in the pre-VAX window.
tw2_df.count()

## Text cleaning

In [None]:
%%time
#cleaning the text and adding a 'clean_text' field
tw2_df['clean_text'] = tw2_df['master_text'].apply(lambda txt: cleantext(txt))

In [None]:
# Preview the first rows, now including 'clean_text'.
tw2_df.head()

In [None]:
# (number of tweets, number of tweets with distinct master_text) for this window.
# Note the vocabulary below de-duplicates on clean_text, not master_text.
len(tw2_df), len(tw2_df.drop_duplicates(subset=['master_text']))

## Phraser
In this section we add bigrams to the vocabulary, namely pairs of words that often occur together (e.g. "green_pass" or "new_york").

In [None]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw2_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

In [None]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

## Fitting the count vectorizer on the clean_text field

In [None]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw2_df.clean_text.drop_duplicates())

In [None]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw2_df.clean_text)

In [None]:
# Release the window dataframe; the remaining cells of this section only use C2 and cv2.
del tw2_df

In [None]:
# Display the repr of the tweet-term count matrix for this window.
C2

In [None]:
# Save the raw count matrix C2 and the fitted vectorizer cv2 together, as a two-element list.
# NOTE(review): the path starts with '/..', i.e. it is absolute — confirm the intended output directory.
# NOTE(review): cv2 wraps phrase_analyzer, which reads the globals bigram / token_pattern /
# stop_words; those objects are not part of this dump — confirm loaders re-create them.

joblib.dump([C2,cv2], '/../data/counts_vocabulary_iii.joblib', compress=6)

# early-VAX

In [None]:
# early-VAX window: nov_1st_20 <= created_at < apr_17_21.
# A boolean mask is label-aligned, so the selection does not depend on the index being
# exactly 0..n-1 (np.where returns positions, which .loc would read as labels).
# .copy() yields an independent frame, so adding 'clean_text' later raises no
# SettingWithCopyWarning.
in_window = (sort_tweets_df['created_at'] >= nov_1st_20) & (sort_tweets_df['created_at'] < apr_17_21)
tw3_df = sort_tweets_df.loc[in_window].copy()

In [None]:
# Non-missing values per column in the early-VAX window.
tw3_df.count()

## Text cleaning

In [None]:
%%time
#cleaning the text and adding a 'clean_text' field
tw3_df['clean_text'] = tw3_df['master_text'].apply(lambda txt: cleantext(txt))

In [None]:
# Preview the first rows, now including 'clean_text'.
tw3_df.head()

In [None]:
# (number of tweets, number of tweets with distinct master_text) for this window.
# Note the vocabulary below de-duplicates on clean_text, not master_text.
len(tw3_df), len(tw3_df.drop_duplicates(subset=['master_text']))

## Phraser
In this section we add bigrams to the vocabulary, namely pairs of words that often occur together (e.g. "green_pass" or "new_york").

In [None]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw3_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

In [None]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

## Fitting the count vectorizer on the clean_text field

In [None]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw3_df.clean_text.drop_duplicates())

In [None]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw3_df.clean_text)

In [None]:
# Release the window dataframe; the remaining cells of this section only use C2 and cv2.
del tw3_df

In [None]:
# Display the repr of the tweet-term count matrix for this window.
C2

In [None]:
# Save the raw count matrix C2 and the fitted vectorizer cv2 together, as a two-element list.
# NOTE(review): the path starts with '/..', i.e. it is absolute — confirm the intended output directory.
# NOTE(review): cv2 wraps phrase_analyzer, which reads the globals bigram / token_pattern /
# stop_words; those objects are not part of this dump — confirm loaders re-create them.

joblib.dump([C2,cv2], '/../data/counts_vocabulary_iv.joblib', compress=6)

# VAX-drive

In [None]:
# VAX-drive window: apr_17_21 <= created_at < aug_1st_21.
# A boolean mask is label-aligned, so the selection does not depend on the index being
# exactly 0..n-1 (np.where returns positions, which .loc would read as labels).
# .copy() yields an independent frame, so adding 'clean_text' later raises no
# SettingWithCopyWarning.
in_window = (sort_tweets_df['created_at'] >= apr_17_21) & (sort_tweets_df['created_at'] < aug_1st_21)
tw4_df = sort_tweets_df.loc[in_window].copy()

In [None]:
# Non-missing values per column in the VAX-drive window.
tw4_df.count()

## Text cleaning

In [None]:
%%time
#cleaning the text and adding a 'clean_text' field
tw4_df['clean_text'] = tw4_df['master_text'].apply(lambda txt: cleantext(txt))

In [None]:
# Preview the first rows, now including 'clean_text'.
tw4_df.head()

In [None]:
# (number of tweets, number of tweets with distinct master_text) for this window.
# Note the vocabulary below de-duplicates on clean_text, not master_text.
len(tw4_df), len(tw4_df.drop_duplicates(subset=['master_text']))

## Phraser
In this section we add bigrams to the vocabulary, namely pairs of words that often occur together (e.g. "green_pass" or "new_york").

In [None]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw4_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

In [None]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

## Fitting the count vectorizer on the clean_text field

In [None]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw4_df.clean_text.drop_duplicates())

In [None]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw4_df.clean_text)

In [None]:
# Release the window dataframe; the remaining cells of this section only use C2 and cv2.
del tw4_df

In [None]:
# Display the repr of the tweet-term count matrix for this window.
C2

In [None]:
# Save the raw count matrix C2 and the fitted vectorizer cv2 together, as a two-element list.
# NOTE(review): the path starts with '/..', i.e. it is absolute — confirm the intended output directory.
# NOTE(review): cv2 wraps phrase_analyzer, which reads the globals bigram / token_pattern /
# stop_words; those objects are not part of this dump — confirm loaders re-create them.

joblib.dump([C2,cv2], '/../data/counts_vocabulary_v.joblib', compress=6)

# late-VAX

In [None]:
# late-VAX window: every tweet created on or after aug_1st_21.
# .copy() yields an independent frame, so adding the 'clean_text' column to it
# later does not raise SettingWithCopyWarning.
tw5_df = sort_tweets_df[sort_tweets_df['created_at'] >= aug_1st_21].copy()

In [None]:
# Non-missing values per column in the late-VAX window.
tw5_df.count()

## Text cleaning

In [None]:
%%time
#cleaning the text and adding a 'clean_text' field
tw5_df['clean_text'] = tw5_df['master_text'].apply(lambda txt: cleantext(txt))

In [None]:
# Preview the first rows, now including 'clean_text'.
tw5_df.head()

In [None]:
# (number of tweets, number of tweets with distinct master_text) for this window.
# Note the vocabulary below de-duplicates on clean_text, not master_text.
len(tw5_df), len(tw5_df.drop_duplicates(subset=['master_text']))

## Phraser
In this section we add bigrams to the vocabulary, namely pairs of words that often occur together (e.g. "green_pass" or "new_york").

In [None]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw5_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

In [None]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

## Fitting the count vectorizer on the clean_text field

In [None]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw5_df.clean_text.drop_duplicates())

In [None]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw5_df.clean_text)

In [None]:
# Release the window dataframe; the remaining cells of this section only use C2 and cv2.
del tw5_df

In [None]:
# Display the repr of the tweet-term count matrix for this window.
C2

In [None]:
# Save the raw count matrix C2 and the fitted vectorizer cv2 together, as a two-element list.
# NOTE(review): the path starts with '/..', i.e. it is absolute — confirm the intended output directory.
# NOTE(review): cv2 wraps phrase_analyzer, which reads the globals bigram / token_pattern /
# stop_words; those objects are not part of this dump — confirm loaders re-create them.

joblib.dump([C2,cv2], '/../data/counts_vocabulary_vi.joblib', compress=6)