# Preliminary Topic Detection General 
## In this notebook:
* We import and manipulate the tweets dataframe by adding the master text and sorting tweets by date;
* We clean the text and add the respective field to the dataframe (we do not remove hashtags and mentions);
* For each time window:
    * We build the vocabulary by fitting the count vectorizer on the clean_text field (WITHOUT TEXT DUPLICATES). We also include bigrams in the vocabulary;
    * We obtain the Tweet-Term-Matrix (C2) with all the tweets (not just the ones with unique piece of text) and we save it to file;
    * We save the count vectorizer (vocabulary) to file;

In [1]:
import pandas as pd
import numpy as np
from IPython.display import clear_output
import random
import scipy.sparse
from scipy.sparse import hstack, coo_matrix, vstack
from sklearn import feature_extraction
import joblib

In [2]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import regex as re
import nltk
from nltk.corpus import stopwords

In [3]:
from gensim.models.phrases import Phraser, Phrases

In [7]:
%%time
#import the tweets_df

tweets_df = pd.read_csv('/home/gcrupi/6_time_windows/github/tweets_example.csv').drop(['Unnamed: 0'],axis=1)

CPU times: user 0 ns, sys: 36.7 ms, total: 36.7 ms
Wall time: 581 ms


In [8]:
tweets_df.count()

id             100
id_usr         100
id_usr_rt       55
created_at     100
master_text    100
dtype: int64

### Transforming 'created_at' fields into datetime objects and sorting tweets by date

In [10]:
#to_datetime transforms strings containing dates into datetime objects. to_datetime returns a pd Series with indices
#the same indices of the rt_id_df and with values datetime objects
#So I first get two pd Series containing infos of 'created_at' and 'created_at_rt' fields
cr_at_series = pd.to_datetime(tweets_df['created_at'], format = '%a %b %d %H:%M:%S +0000 %Y')

#then I turn the two series into two temporary dataframes.
temp_df1 = cr_at_series.to_frame()
temp_df1.columns = ['created_at_datetime']

#substituting the old string-form fields with new datetime-form fields
tweets_df = tweets_df.drop(['created_at'],axis=1)

tweets_df.insert(loc=2, column='created_at',value=temp_df1['created_at_datetime'],allow_duplicates=True)

del temp_df1

In [11]:
sort_tweets_df = tweets_df.sort_values(by=['created_at'])

In [12]:
del tweets_df

In [13]:
#defining the splitting dates
sep_5th_19 = pd.Timestamp(2019,9,5)
jan_1st_20 = pd.Timestamp(2020,1,1)
mar_9th_20 = pd.Timestamp(2020,3,9)
nov_1st_20 = pd.Timestamp(2020,11,1)
apr_17_21 = pd.Timestamp(2021,4,17)
aug_1st_21 = pd.Timestamp(2021,8,1)
nov_8th_21 = pd.Timestamp(2021,11,8)

In [14]:
len(sort_tweets_df)

100

In [15]:
#removing self loops
self_index = sort_tweets_df[sort_tweets_df['id_usr']==sort_tweets_df['id_usr_rt']].index
sort_tweets_df = sort_tweets_df.drop(self_index)

In [16]:
len(sort_tweets_df)

99

In [14]:
#changing the indices of the sorted df so that they go from 0 to len(sort_tweets_df)-1
ni = np.arange(len(sort_tweets_df)) #new indices     
s = pd.Series(ni) #I turn my 'new indices' numpy array into a pandas series
sort_tweets_df = sort_tweets_df.set_index([s]) #and use this series to change the indices of the dataframe 

In [15]:
#removing the 'id_usr_rt' field
sort_tweets_df = sort_tweets_df.drop(['id_usr_rt'],axis=1)

## Defining stopwords

In [19]:
stopwords.words('italian') #'forse', 'qualche', 'qualcosa', 'chissà', 'po', 'stata', 'fatta', 'fatto', 'alcuni', 
#'quasi', 'oltre', 'fate', 'to', 'farne', 'far', 'ecco', 'però', 'sì', 'circa', 'state', 'ok', 'magari', 'so', 
#'ieri', 'oggi', 'stare', 'perchè', 'eh', 'ah', 'vabbè', 'ce', 'fra', 'proprio', 'te', 'pensa', 'vuoi', 'sai', 
#'puoi', 'devi', 'vai', 'fatti', 'guarda', 'dico', 'sa', 'sti', 'allora', 'tutte','altre', 'comunque', 'avere', 'deve'

['ad',
 'al',
 'allo',
 'ai',
 'agli',
 'all',
 'agl',
 'alla',
 'alle',
 'con',
 'col',
 'coi',
 'da',
 'dal',
 'dallo',
 'dai',
 'dagli',
 'dall',
 'dagl',
 'dalla',
 'dalle',
 'di',
 'del',
 'dello',
 'dei',
 'degli',
 'dell',
 'degl',
 'della',
 'delle',
 'in',
 'nel',
 'nello',
 'nei',
 'negli',
 'nell',
 'negl',
 'nella',
 'nelle',
 'su',
 'sul',
 'sullo',
 'sui',
 'sugli',
 'sull',
 'sugl',
 'sulla',
 'sulle',
 'per',
 'tra',
 'contro',
 'io',
 'tu',
 'lui',
 'lei',
 'noi',
 'voi',
 'loro',
 'mio',
 'mia',
 'miei',
 'mie',
 'tuo',
 'tua',
 'tuoi',
 'tue',
 'suo',
 'sua',
 'suoi',
 'sue',
 'nostro',
 'nostra',
 'nostri',
 'nostre',
 'vostro',
 'vostra',
 'vostri',
 'vostre',
 'mi',
 'ti',
 'ci',
 'vi',
 'lo',
 'la',
 'li',
 'le',
 'gli',
 'ne',
 'il',
 'un',
 'uno',
 'una',
 'ma',
 'ed',
 'se',
 'perché',
 'anche',
 'come',
 'dov',
 'dove',
 'che',
 'chi',
 'cui',
 'non',
 'più',
 'quale',
 'quanto',
 'quanti',
 'quanta',
 'quante',
 'quello',
 'quelli',
 'quella',
 'quelle',
 'q

In [20]:
stop = stopwords.words('italian')
en_stop = stopwords.words('english')
query = ['vacc', 'vaccinale', 'vaccinali', 'vaccinano', 'vaccinarci', 'vaccinare', 'vaccinarsi',
               'vaccinate', 'vaccinati', 'vaccinato', 'vaccinaz', 'vaccinazione', 'vaccinazioni', 'vaccines','vax','vaccine',
               'vaccini', 'vaccinista', 'vaccinisti', 'vaccino', 'antivaccinisti', 'freevax', 'iovaccino', 
               'nonvaccinato', 'novax', 'obbligovaccinale', 'provax', 'ridacciilvaccino','vaccine']

re_url = re.compile(r'https?:\/\/.*[\r\n]*', flags=re.U)
#re_rtw = re.compile(r'RT', flags=re.U)
re_htg = re.compile(r'#', flags=re.U) # remove hashtag sign
#re_htg = re.compile(r'#[\w]+ ?', flags=re.U)   # remove hashtags
re_hnd = re.compile(r'@', flags=re.U)
#re_hnd = re.compile(r'@\w+ ?', flags=re.U)
re_wrd = re.compile(r'[^\w]+ ', flags=re.U)
re_num = re.compile(r'[0-9]+', flags=re.U)

def cleantext(txt):
    t = txt
    t = re_url.sub('', t)
    #t = re_htg.sub('', t)
    #t = re_rtw.sub(' ', t)
    t = re_hnd.sub(' ', t)
    t = re_wrd.sub(' ', t)
    t = re_num.sub(' ', t)
    return t.strip().lower()

In [21]:
#defining stop words
more_stop = ['già','poi','solo','no','fa','può','quindi','quando','x','ogni','altro','così','mai','tutta','ancora',
            'ora', 'molto','d', 'via','sempre','rt','co','https','dopo','fare','fatto','italia','essere','cosa',
            'oggi','bene','dire','dice','vuole','vaccinati','vaccino','vaccini','vaccinato','senza','altri','me',
             'detto','meno','invece','va','grazie']
            
more_more = ['forse', 'qualche', 'qualcosa', 'chissà', 'po', 'stata', 'fatta', 'fatto', 'alcuni', 
            'quasi', 'oltre', 'fate', 'to', 'farne', 'far', 'ecco', 'però', 'sì', 'circa', 'state', 'ok', 'magari', 'so', 
            'ieri', 'oggi', 'stare', 'perchè', 'eh', 'ah', 'vabbè', 'ce', 'fra', 'proprio', 'te', 'pensa', 'vuoi', 'sai', 
            'puoi', 'devi', 'vai', 'fatti', 'guarda', 'dico', 'sa', 'sti', 'allora', 'tutte','altre', 'comunque', 'avere', 'deve']

stop_words = set(stop+query+en_stop+more_stop+more_more)

# pre-COVID

In [16]:
tw0_df = sort_tweets_df[sort_tweets_df['created_at'] < jan_1st_20]

In [17]:
tw0_df.count()

id             204275
id_usr         204275
created_at     204275
master_text    204275
dtype: int64

## Text cleaning

In [22]:
%%time
#cleaning the text and adding a 'clean_text' field
tw0_df['clean_text'] = tw0_df['master_text'].apply(lambda txt: cleantext(txt))

CPU times: user 3.09 s, sys: 1.17 s, total: 4.26 s
Wall time: 4.67 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [23]:
tw0_df.head()

Unnamed: 0,id,id_usr,created_at,master_text,clean_text
0,1169560932198039552,939822944,2019-09-05 10:40:30,@carlosibilia @GiuseppeConteIT QUINDI SETTIMA...,carlosibilia giuseppeconteit quindi settimana ...
1,1169561004151382016,939822944,2019-09-05 10:40:47,@SkyTG24 POTRESTE CHIEDERE X CORTESIA SE SET...,skytg potreste chiedere x cortesia se settima...
2,1169561087475421184,939822944,2019-09-05 10:41:07,@F_Boccia @pdnetwork POTRESTE CHIEDERE X CORTE...,f_boccia pdnetwork potreste chiedere x cortesi...
3,1169561110388850688,80200885,2019-09-05 10:41:12,Nel 2018 votai 5 stelle per due motivi: 1) abo...,nel votai stelle per due motivi abolizio...
4,1169561453701083136,939822944,2019-09-05 10:42:34,@RepubblicaTv POTRESTE CHIEDERE X CORTESIA S...,repubblicatv potreste chiedere x cortesia se s...


In [24]:
len(tw0_df), len(tw0_df.drop_duplicates(subset=['master_text']))

(204275, 43001)

## Phraser
In this section we add bigrams to the vocabulary: namely couple of words which often go together (i.e. "green_pass" or "new_york" etc...).

In [32]:
def phrase_analyzer(text):
    words = [w for w in token_pattern.findall(text.lower()) if w not in stop_words]
    return bigram[words]

In [30]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw0_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

CPU times: user 744 ms, sys: 116 ms, total: 861 ms
Wall time: 859 ms


In [31]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

CPU times: user 7.02 s, sys: 11.5 ms, total: 7.04 s
Wall time: 7.03 s


## Fitting the count vectorizer on the clean_text field

In [33]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw0_df.clean_text.drop_duplicates())



CPU times: user 2.97 s, sys: 99.6 ms, total: 3.07 s
Wall time: 3.27 s


CountVectorizer(analyzer=<function phrase_analyzer at 0x7f3f30319550>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [34]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw0_df.clean_text)

CPU times: user 16.1 s, sys: 592 ms, total: 16.7 s
Wall time: 16.7 s


In [35]:
del tw0_df

In [36]:
C2

<204275x7458 sparse matrix of type '<class 'numpy.int64'>'
	with 2399096 stored elements in Compressed Sparse Row format>

In [37]:
#saving the raw count matrix C2 and the vocabulary

joblib.dump([C2,cv2], '/../data/counts_vocabulary_i.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/counts_vocabulary_preCOVID.joblib']

# early-COVID

In [38]:
idx = np.where((sort_tweets_df['created_at'] >= jan_1st_20) & (sort_tweets_df['created_at'] < mar_9th_20))

tw1_df = sort_tweets_df.loc[idx]

In [41]:
tw1_df.count()

id             125887
id_usr         125887
created_at     125887
master_text    125887
dtype: int64

## Text cleaning

In [42]:
%%time
#cleaning the text and adding a 'clean_text' field
tw1_df['clean_text'] = tw1_df['master_text'].apply(lambda txt: cleantext(txt))

CPU times: user 1.96 s, sys: 724 ms, total: 2.68 s
Wall time: 2.68 s


In [43]:
tw1_df.head()

Unnamed: 0,id,id_usr,created_at,master_text,clean_text
204275,1212161556798152706,1045707647440302086,2020-01-01 00:00:10,@RobertoBurioni 🧐non riesce ad ammettere i rea...,robertoburioni 🧐non riesce ad ammettere i real...
204276,1212162656574066688,4160203479,2020-01-01 00:04:32,Quando ho iniziato a dissentire sull'obbligo v...,quando ho iniziato a dissentire sull'obbligo v...
204277,1212162722198151168,1087376422140817409,2020-01-01 00:04:48,❌❌❤️❤️💔💔 I CUCCIOLI IN CANILE NON DOVREBBERO M...,❌❌❤️❤️ i cuccioli in canile non dovrebbero mai...
204278,1212162901433339904,884873425,2020-01-01 00:05:31,"Il Messaggero: Alzheimer e demenza, vaccino vi...",il messaggero alzheimer e demenza vaccino vici...
204279,1212164483432493056,700901262,2020-01-01 00:11:48,@nopost3b @osvaldoluci @intuslegens Ma i vacci...,nopost b osvaldoluci intuslegens ma i vaccini ...


In [44]:
len(tw1_df), len(tw1_df.drop_duplicates(subset=['master_text']))

(125887, 35181)

## Phraser
In this section we add bigrams to the vocabulary: namely couple of words which often go together (i.e. "green_pass" or "new_york" etc...).

In [45]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw1_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

CPU times: user 775 ms, sys: 91.6 ms, total: 867 ms
Wall time: 865 ms


In [46]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

CPU times: user 6.06 s, sys: 7.14 ms, total: 6.06 s
Wall time: 6.06 s


## Fitting the count vectorizer on the clean_text field

In [47]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw1_df.clean_text.drop_duplicates())



CPU times: user 2.43 s, sys: 88.1 ms, total: 2.52 s
Wall time: 2.52 s


CountVectorizer(analyzer=<function phrase_analyzer at 0x7f3f30319550>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [48]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw1_df.clean_text)

CPU times: user 9.73 s, sys: 291 ms, total: 10 s
Wall time: 10 s


In [49]:
del tw1_df

In [50]:
C2

<125887x6171 sparse matrix of type '<class 'numpy.int64'>'
	with 1332630 stored elements in Compressed Sparse Row format>

In [37]:
#saving the raw count matrix C2 and the vocabulary

joblib.dump([C2,cv2], '/../data/counts_vocabulary_ii.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/counts_vocabulary_preCOVID.joblib']

# pre-VAX

In [52]:
idx = np.where((sort_tweets_df['created_at'] >= mar_9th_20) & (sort_tweets_df['created_at'] < nov_1st_20))

tw2_df = sort_tweets_df.loc[idx]

In [53]:
tw2_df.count()

id             1036177
id_usr         1036177
created_at     1036177
master_text    1036177
dtype: int64

## Text cleaning

In [54]:
%%time
#cleaning the text and adding a 'clean_text' field
tw2_df['clean_text'] = tw2_df['master_text'].apply(lambda txt: cleantext(txt))

CPU times: user 16 s, sys: 5.71 s, total: 21.7 s
Wall time: 21.7 s


In [55]:
tw2_df.head()

Unnamed: 0,id,id_usr,created_at,master_text,clean_text
330162,1236804065906110465,128242325,2020-03-09 00:00:42,Sarebbe interessante sapere se quelli deceduti...,sarebbe interessante sapere se quelli deceduti...
330163,1236804086521106434,1004943776199028738,2020-03-09 00:00:47,I medici cinesi lavorano a pieno ritmo per un ...,i medici cinesi lavorano a pieno ritmo per un ...
330164,1236804171799633926,955107625393418241,2020-03-09 00:01:07,Italia. Il Paese il cui Premier ha un comunica...,italia il paese il cui premier ha un comunicat...
330165,1236804401148432384,1093535050757476355,2020-03-09 00:02:02,@MinervaMcGrani1 Vaccini non ne faccio assolut...,minervamcgrani vaccini non ne faccio assoluta...
330166,1236804433595465729,3010677376,2020-03-09 00:02:10,Scoppia un’epidemia nazionale quando al govern...,scoppia un’epidemia nazionale quando al govern...


In [56]:
len(tw2_df), len(tw2_df.drop_duplicates(subset=['master_text']))

(1036177, 349401)

## Phraser
In this section we add bigrams to the vocabulary: namely couple of words which often go together (i.e. "green_pass" or "new_york" etc...).

In [57]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw2_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

CPU times: user 6.29 s, sys: 812 ms, total: 7.1 s
Wall time: 7.1 s


In [58]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

CPU times: user 50.1 s, sys: 31.6 ms, total: 50.1 s
Wall time: 50.1 s


## Fitting the count vectorizer on the clean_text field

In [59]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw2_df.clean_text.drop_duplicates())



CPU times: user 22.3 s, sys: 791 ms, total: 23.1 s
Wall time: 23.1 s


CountVectorizer(analyzer=<function phrase_analyzer at 0x7f3f30319550>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [60]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw2_df.clean_text)

CPU times: user 1min 17s, sys: 2.71 s, total: 1min 19s
Wall time: 1min 19s


In [61]:
del tw2_df

In [62]:
C2

<1036177x32842 sparse matrix of type '<class 'numpy.int64'>'
	with 12215954 stored elements in Compressed Sparse Row format>

In [37]:
#saving the raw count matrix C2 and the vocabulary

joblib.dump([C2,cv2], '/../data/counts_vocabulary_iii.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/counts_vocabulary_preCOVID.joblib']

# early-VAX

In [64]:
idx = np.where((sort_tweets_df['created_at'] >= nov_1st_20) & (sort_tweets_df['created_at'] < apr_17_21))

tw3_df = sort_tweets_df.loc[idx]

In [65]:
tw3_df.count()

id             5137559
id_usr         5137559
created_at     5137559
master_text    5137559
dtype: int64

## Text cleaning

In [66]:
%%time
#cleaning the text and adding a 'clean_text' field
tw3_df['clean_text'] = tw3_df['master_text'].apply(lambda txt: cleantext(txt))

CPU times: user 1min 13s, sys: 26.9 s, total: 1min 40s
Wall time: 1min 40s


In [67]:
tw3_df.head()

Unnamed: 0,id,id_usr,created_at,master_text,clean_text
1366339,1322689824499916801,28414121,2020-11-01 00:00:03,Non ho detto di no a un cocktail fluorescente ...,non ho detto di no a un cocktail fluorescente ...
1366340,1322689942624063491,1088147570961068034,2020-11-01 00:00:31,VALIGIA E CORREDINO PRONTI MA LA FAMIGLIA HA R...,valigia e corredino pronti ma la famiglia ha r...
1366341,1322690040020013056,805913988240474115,2020-11-01 00:00:54,Provincia di Lecce\n\nQuesto cucciolone di 1 a...,provincia di lecce\n\nquesto cucciolone di a...
1366342,1322690740267425792,790152050948575233,2020-11-01 00:03:41,@Clubchakama @MDragonil @MassimoGalli51 Quello...,clubchakama mdragonil massimogalli quello che...
1366343,1322691045549813763,121443333,2020-11-01 00:04:54,"Urbano #Cairo, proprietario de #la7, positivo ...",urbano #cairo proprietario de #la positivo al...


In [68]:
len(tw3_df), len(tw3_df.drop_duplicates(subset=['master_text']))

(5137559, 1868058)

## Phraser
In this section we add bigrams to the vocabulary: namely couple of words which often go together (i.e. "green_pass" or "new_york" etc...).

In [69]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw3_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

CPU times: user 36.2 s, sys: 4.46 s, total: 40.6 s
Wall time: 40.6 s


In [70]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

CPU times: user 4min 1s, sys: 812 ms, total: 4min 1s
Wall time: 4min 1s


## Fitting the count vectorizer on the clean_text field

In [71]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw3_df.clean_text.drop_duplicates())



CPU times: user 1min 59s, sys: 4.85 s, total: 2min 4s
Wall time: 2min 4s


CountVectorizer(analyzer=<function phrase_analyzer at 0x7f3f30319550>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [72]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw3_df.clean_text)

CPU times: user 6min 14s, sys: 13.4 s, total: 6min 27s
Wall time: 6min 27s


In [73]:
del tw3_df

In [74]:
C2

<5137559x95428 sparse matrix of type '<class 'numpy.int64'>'
	with 57800490 stored elements in Compressed Sparse Row format>

In [37]:
#saving the raw count matrix C2 and the vocabulary

joblib.dump([C2,cv2], '/../data/counts_vocabulary_iv.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/counts_vocabulary_preCOVID.joblib']

# VAX-drive

In [76]:
idx = np.where((sort_tweets_df['created_at'] >= apr_17_21) & (sort_tweets_df['created_at'] < aug_1st_21))

tw4_df = sort_tweets_df.loc[idx]

In [77]:
tw4_df.count()

id             4160533
id_usr         4160533
created_at     4160533
master_text    4160533
dtype: int64

## Text cleaning

In [78]:
%%time
#cleaning the text and adding a 'clean_text' field
tw4_df['clean_text'] = tw4_df['master_text'].apply(lambda txt: cleantext(txt))

CPU times: user 59.9 s, sys: 25.7 s, total: 1min 25s
Wall time: 1min 26s


In [79]:
tw4_df.head()

Unnamed: 0,id,id_usr,created_at,master_text,clean_text
6503898,1383208730687311878,884460743453798401,2021-04-17 00:00:34,"Caro @demagistris, nella scuola di mia figlia,...",caro demagistris nella scuola di mia figlia ie...
6503899,1383208742301360134,2175887381,2021-04-17 00:00:37,"Lorenzo Costa, consigliere comunale a Catanzar...",lorenzo costa consigliere comunale a catanzaro...
6503900,1383208810320367625,1241210117657432064,2021-04-17 00:00:53,@luigidimaio Attenzione perche i vaccinati son...,luigidimaio attenzione perche i vaccinati sono...
6503901,1383208820718120960,1024702799215509504,2021-04-17 00:00:56,Se non ve ne foste accorti\nnessuno dice che i...,se non ve ne foste accorti\nnessuno dice che i...
6503902,1383208847054098432,1699728120,2021-04-17 00:01:02,"Sputnik V, ""nessun caso di trombosi"". Scienzia...","sputnik v ""nessun caso di trombosi scienziati ..."


In [80]:
len(tw4_df), len(tw4_df.drop_duplicates(subset=['master_text']))

(4160533, 1507488)

## Phraser
In this section we add bigrams to the vocabulary: namely couple of words which often go together (i.e. "green_pass" or "new_york" etc...).

In [81]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw4_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

CPU times: user 34.6 s, sys: 4.18 s, total: 38.8 s
Wall time: 40 s


In [82]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

CPU times: user 3min 27s, sys: 698 ms, total: 3min 27s
Wall time: 3min 27s


## Fitting the count vectorizer on the clean_text field

In [83]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw4_df.clean_text.drop_duplicates())



CPU times: user 1min 41s, sys: 4.07 s, total: 1min 45s
Wall time: 1min 45s


CountVectorizer(analyzer=<function phrase_analyzer at 0x7f3f30319550>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [84]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw4_df.clean_text)

CPU times: user 5min 3s, sys: 11.1 s, total: 5min 14s
Wall time: 5min 14s


In [85]:
del tw4_df

In [86]:
C2

<4160533x85922 sparse matrix of type '<class 'numpy.int64'>'
	with 45843757 stored elements in Compressed Sparse Row format>

In [37]:
#saving the raw count matrix C2 and the vocabulary

joblib.dump([C2,cv2], '/../data/counts_vocabulary_v.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/counts_vocabulary_preCOVID.joblib']

# late-VAX

In [89]:
tw5_df = sort_tweets_df[sort_tweets_df['created_at'] >= aug_1st_21]

In [90]:
tw5_df.count()

id             5450244
id_usr         5450244
created_at     5450244
master_text    5450244
dtype: int64

## Text cleaning

In [91]:
%%time
#cleaning the text and adding a 'clean_text' field
tw5_df['clean_text'] = tw5_df['master_text'].apply(lambda txt: cleantext(txt))

CPU times: user 1min 22s, sys: 30.2 s, total: 1min 52s
Wall time: 1min 52s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [92]:
tw5_df.head()

Unnamed: 0,id,id_usr,created_at,master_text,clean_text
10664431,1421621702924988418,2691538351,2021-08-01 00:00:01,@FoxNews Vaccinate,foxnews vaccinate
10664432,1421621729185595400,632537304,2021-08-01 00:00:07,Sono passati 5 giorni dopo la prima dose del v...,sono passati giorni dopo la prima dose del v...
10664433,1421621731769393153,1252333704996888576,2021-08-01 00:00:07,@AlfioKrancic Sito che lavora per nato e milit...,alfiokrancic sito che lavora per nato e milita...
10664434,1421621744251592706,1214181278334865408,2021-08-01 00:00:10,@Leonard48598239 Sì tu la fai parlare e non ri...,leonard sì tu la fai parlare e non rispondere...
10664435,1421621745270902786,22602699,2021-08-01 00:00:11,"Dico a voi, #novax: guardate che anche la vita...",dico a voi #novax guardate che anche la vita è...


In [93]:
len(tw5_df), len(tw5_df.drop_duplicates(subset=['master_text']))

(5450244, 1731499)

## Phraser
In this section we add bigrams to the vocabulary: namely couple of words which often go together (i.e. "green_pass" or "new_york" etc...).

In [94]:
%%time
token_pattern = re.compile(r'(?u)\b[A-Za-z]\w+\b')
text_sentences = []
#building the dictionary with unique pieces of text (in other words dropping duplicated on the clean_text column)
for doc in tw5_df.clean_text.drop_duplicates():
    text_sentences.extend([token_pattern.findall(sent.lower()) for sent in doc.split('\n') if len(sent) > 0])

CPU times: user 36.3 s, sys: 4.99 s, total: 41.3 s
Wall time: 41.3 s


In [95]:
%%time
#min_count is the minimal nuimber of times that a single bigram has to appear in order to be considered a real bigram
#threshold is linked to the probability of observing the words of the bigram together and the probability of 
#observing them separately
phrases = Phrases(text_sentences, min_count=10, threshold=20., common_terms=stop_words) #, scoring='npmi')
bigram = Phraser(phrases)

CPU times: user 4min 12s, sys: 903 ms, total: 4min 13s
Wall time: 4min 13s


## Fitting the count vectorizer on the clean_text field

In [96]:
%%time
cv2 = feature_extraction.text.CountVectorizer(min_df=10, max_df=0.5, stop_words=stop_words, analyzer=phrase_analyzer)
#building the vocabulary with unique pieces of text (in other words dropping duplicates in the clean_text column)
cv2.fit(tw5_df.clean_text.drop_duplicates())



CPU times: user 2min 1s, sys: 4.5 s, total: 2min 6s
Wall time: 2min 6s


CountVectorizer(analyzer=<function phrase_analyzer at 0x7f3f30319550>,
                max_df=0.5, min_df=10,
                stop_words={'a', 'abbia', 'abbiamo', 'abbiano', 'abbiate',
                            'about', 'above', 'ad', 'after', 'again', 'against',
                            'agl', 'agli', 'ah', 'ai', 'ain', 'al', 'alcuni',
                            'all', 'alla', 'alle', 'allo', 'allora', 'altre',
                            'altri', 'altro', 'am', 'an', 'anche', 'ancora', ...})

In [97]:
%%time
#here I just obtain the matrix of counts of all the tweets, but with the vocabulary built with the unique 
#pieces of text only
C2 = cv2.transform(tw5_df.clean_text)

CPU times: user 6min 53s, sys: 17.9 s, total: 7min 11s
Wall time: 7min 12s


In [98]:
del tw5_df

In [99]:
C2

<5450244x100285 sparse matrix of type '<class 'numpy.int64'>'
	with 63817315 stored elements in Compressed Sparse Row format>

In [37]:
#saving the raw count matrix C2 and the vocabulary

joblib.dump([C2,cv2], '/../data/counts_vocabulary_vi.joblib', compress=6)

['/home/gcrupi/6_time_windows/sparse_matrices/top_model_timewindow/counts_vocabulary_preCOVID.joblib']