# GHS (General hesitant supporters) 
## In this notebook:
* We import the whole dataset and split it in 6 dataframes, one per time segment.
* We import the sets of supporters and hesitant users for every time window and obtain the indices of the tweets posted by supporters and by hesitant users.
* For every time window we run the following steps:
    * we import the raw count matrix $C$, the vocabulary $cv$ (of length $|V|$), the fitted $tfidf$ and the fitted $NMF$ (that contains $W$ and $H$ as well).
    * we transform $C$ with the $tfidf$ and obtain $X$.
    * we extract supporters and hesitant tweets (rows) to form $X_P$ and $X_A$ ($|P|$ and $|A|$ are the respective number of supporters and hesitant tweets).
    * we run topic discovery on both $X_P$ and $X_A$, to obtain $NMF_P$ and $NMF_A$ (that contain $W_P$, $H_P$ and $W_A$, $H_A$ as well).
    * we transform $X_P$ with $NMF_A$ to obtain $W^P_A$ and we transform $X_A$ with $NMF_P$ to obtain $W^A_P$. Note that $W^P_A$ has shape $|P| \times K$ and that $W^A_P$ has shape $|A| \times K$, where $K$ is the number of topics (20 here).
    * we define the hesitant strength of general topics as $S^A = W[hesitant].sumcols()$ and the supporters strength of general topics as $S^P = W[supporters].sumcols()$ and compute the hesitant share of **general** topics as $\frac{|P| \times S^A}{|P| \times S^A + |A| \times S^P}$.
    * we define the hesitant strength of hesitant topics as $S_A^A = W_A.sumcols()$ and the supporters strength of hesitant topics as $S_A^P = W_A^P.sumcols()$ and compute the hesitant share of **hesitant** topics as $\frac{|P| \times S_A^A}{|P| \times S_A^A + |A| \times S_A^P}$.
    * we define the hesitant strength of supporters topics as $S_P^A = W_P^A.sumcols()$ and the supporters strength of supporters topics as $S_P^P = W_P.sumcols()$ and compute the hesitant share of **supporters** topics as $\frac{|P| \times S_P^A}{|P| \times S_P^A + |A| \times S_P^P}$.
    * in the end we have 60 topics per time window: 20 general topics, 20 hesitant and 20 supporters topics.
    * we compute the strength of the topics like $NMF.transform(X).sumcols()$, $NMF_A.transform(X).sumcols()$, $NMF_P.transform(X).sumcols()$ where $X$ contains all the tweets of the time window.
    * we save all these results to file in dataframes like: **topic number, topic words, strength, hesitant share**.
    
* **NB: the vaccine hesitant group is represented by an "A", whereas the vaccine supporters group is represented by a "P".**

In [None]:
import pandas as pd
import numpy as np
from IPython.display import clear_output
import random
import scipy.sparse
from scipy.sparse import hstack, coo_matrix, vstack
from sklearn import feature_extraction
import joblib
from sklearn import decomposition

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import regex as re
import nltk
from nltk.corpus import stopwords

In [None]:
def phrase_analyzer(text):
    """Lower-case ``text``, tokenise it and pass the kept tokens through ``bigram``.

    Tokens come from ``token_pattern.findall`` and every token found in
    ``stop_words`` is discarded; the remaining list is returned as
    ``bigram[tokens]``.

    NOTE(review): ``token_pattern``, ``stop_words`` and ``bigram`` are read as
    globals but are not defined in this notebook - presumably this function is
    only kept so that the pickled vocabularies can be loaded; confirm before
    calling it directly.
    """
    lowered = text.lower()
    kept = []
    for token in token_pattern.findall(lowered):
        if token not in stop_words:
            kept.append(token)
    return bigram[kept]

In [None]:
%%time
#import the tweets_df; the 'Unnamed: 0' column is dropped right after reading
#(presumably the index column written by an earlier to_csv - TODO confirm)
tweets_df = pd.read_csv('/../data/tweets_example.csv').drop(['Unnamed: 0'],axis=1)
tweets_df.head()

In [None]:
#keep only the four columns that are used in the rest of the notebook
tweets_df = tweets_df[['id','id_usr','id_usr_rt','created_at']]

In [None]:
#number of non-null values per column
tweets_df.count()

## Transforming 'created_at' fields into datetime objects and sorting tweets by date

In [None]:
#to_datetime parses the 'created_at' strings into datetime objects and returns a pd Series
#that shares its index with tweets_df
cr_at_series = pd.to_datetime(tweets_df['created_at'], format = '%a %b %d %H:%M:%S +0000 %Y')

#swap the string column for the parsed one, inserting it at position 2
tweets_df = tweets_df.drop(columns=['created_at'])
tweets_df.insert(loc=2, column='created_at', value=cr_at_series, allow_duplicates=True)

In [None]:
#sort chronologically; the rows keep their original labels (they are renumbered further below)
sort_tweets_df = tweets_df.sort_values(by=['created_at'])

In [None]:
#the unsorted frame is not used any more: drop the reference to it
del tweets_df

In [None]:
#number of tweets before the self loops are removed
len(sort_tweets_df)

In [None]:
#removing self loops, i.e. rows whose 'id_usr' equals their 'id_usr_rt'
is_self_loop = sort_tweets_df['id_usr'].eq(sort_tweets_df['id_usr_rt'])
self_index = sort_tweets_df.index[is_self_loop]
sort_tweets_df = sort_tweets_df.drop(index=self_index)

In [None]:
#number of tweets left after the self loops have been removed
len(sort_tweets_df)

In [None]:
#renumbering the rows of the sorted df so that the index goes from 0 to len(sort_tweets_df)-1;
#the old labels are discarded (drop=True) instead of being kept as a column
sort_tweets_df = sort_tweets_df.reset_index(drop=True)

In [None]:
#the 'id_usr_rt' field was only needed to detect self loops: remove it
sort_tweets_df = sort_tweets_df.drop(columns=['id_usr_rt'])

## Separating dataframes and importing NMFs

In [None]:
#defining the splitting dates (midnight of the given day)
sep_5th_19 = pd.Timestamp('2019-09-05')
jan_1st_20 = pd.Timestamp('2020-01-01')
mar_9th_20 = pd.Timestamp('2020-03-09')
nov_1st_20 = pd.Timestamp('2020-11-01')
apr_17_21 = pd.Timestamp('2021-04-17')
aug_1st_21 = pd.Timestamp('2021-08-01')
nov_8th_21 = pd.Timestamp('2021-11-08')

In [None]:
#first window: everything before jan_1st_20; last window: everything from aug_1st_21 on.
#reset_index() keeps the old row labels in an 'index' column
tw0_df = sort_tweets_df.loc[lambda frame: frame['created_at'] < jan_1st_20].reset_index()
tw5_df = sort_tweets_df.loc[lambda frame: frame['created_at'] >= aug_1st_21].reset_index()

In [None]:
def _tweets_between(start, end):
    """Rows of sort_tweets_df with start <= created_at < end.

    The rows are selected with a boolean mask, so the result does not depend on
    the index of sort_tweets_df being exactly 0..n-1. reset_index() keeps the
    old row labels in an 'index' column, as for tw0_df and tw5_df.
    """
    created = sort_tweets_df['created_at']
    return sort_tweets_df[(created >= start) & (created < end)].reset_index()


#the four intermediate time windows, each one half-open: [start, end)
tw1_df = _tweets_between(jan_1st_20, mar_9th_20)
tw2_df = _tweets_between(mar_9th_20, nov_1st_20)
tw3_df = _tweets_between(nov_1st_20, apr_17_21)
tw4_df = _tweets_between(apr_17_21, aug_1st_21)

## Importing users dataframes

In [None]:
#n is a dataframe like: id_usr, RMC, community, time_window (every column is read as a string)
n = pd.read_csv('/../data/nodes_example_i_vi.csv',dtype=str).drop(columns=['Unnamed: 0'])

#one node table per time window, in the order i, ii, iii, iv, v, vi
n0, n1, n2, n3, n4, n5 = (
    n[n['time_window'] == window]
    for window in ('i', 'ii', 'iii', 'iv', 'v', 'vi')
)

#vaccine hesitant community label per time window:
#   i: 1, ii: 3, iii: 1, iv: 1, v: 1, vi: 1
av0, av1, av2, av3, av4, av5 = (
    set(nodes.loc[nodes['community'] == label, 'id_usr'])
    for nodes, label in zip((n0, n1, n2, n3, n4, n5), ('1', '3', '1', '1', '1', '1'))
)

#vaccine supporters community label per time window:
#   i: 3, ii: 1, iii: 2, iv: 2, v: 2, vi: 2
pv0, pv1, pv2, pv3, pv4, pv5 = (
    set(nodes.loc[nodes['community'] == label, 'id_usr'])
    for nodes, label in zip((n0, n1, n2, n3, n4, n5), ('3', '1', '2', '2', '2', '2'))
)

In [None]:
#replace the old 'index' column of every time window with a fresh one produced by reset_index()
tw0_df, tw1_df, tw2_df, tw3_df, tw4_df, tw5_df = (
    frame.drop(columns=['index']).reset_index()
    for frame in (tw0_df, tw1_df, tw2_df, tw3_df, tw4_df, tw5_df)
)

In [None]:
#peek at the first two rows of the first time window
tw0_df.head(2)

In [None]:
#peek at the last two rows of the last time window
tw5_df.tail(2)

In [None]:
#I want to store the indices of the tweets from supporters/hesitant users for every time window.
#I store them in supporters_/hesitant_idx (one integer array per time window)

hesitant = [av0,av1,av2,av3,av4,av5]
supporters = [pv0,pv1,pv2,pv3,pv4,pv5]
dfs = [tw0_df,tw1_df,tw2_df,tw3_df,tw4_df,tw5_df]
hesitant_idx, supporters_idx = [], []

for av,pv,df in zip(hesitant,supporters,dfs):

    #av and pv are plain Python sets, which pd.merge does not accept: select the rows with isin instead.
    #The user ids in the sets were read with dtype=str, so the tweet authors are compared as strings.
    #NOTE(review): assumes 'id_usr' holds integers or strings; float ids would turn into '123.0' - confirm.
    authors = df['id_usr'].astype(str)

    m_av = df.loc[authors.isin(av), 'index'].to_numpy()

    m_pv = df.loc[authors.isin(pv), 'index'].to_numpy()

    print('# hesitant tweets:',len(m_av),'# supporters tweets:',len(m_pv),'# total tweets:',len(df))

    hesitant_idx.append(m_av)
    supporters_idx.append(m_pv)

## Defining functions

In [None]:
def topic_share_df(cv, nmf, X, H, nr_topics, sa_norm):
    """Build the summary table of the topics of one fitted NMF.

    Parameters
    ----------
    cv : fitted vectorizer exposing ``get_feature_names_out()`` (or, on old
        scikit-learn versions, ``get_feature_names()``).
    nmf : fitted model; ``nmf.transform(X)`` is summed over the rows to get
        the strength of every topic.
    X : matrix of all the tweets the strength is computed on.
    H : topic-word matrix; row ``i`` holds the word weights of topic ``i``.
    nr_topics : number of topics (rows of ``H``) to describe.
    sa_norm : vector containing the hesitant share of the topics.

    Returns
    -------
    DataFrame with columns ``topic_index``, ``topic_words`` (the 10 words with
    the largest weight, heaviest first, space separated), ``strength`` and
    ``hesitant_s``, sorted by decreasing strength.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new name, fall back to the old one
    get_names = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
    feature_names = np.asarray(get_names()) #vocabulary

    #strength of the topics computed over the total dataset X
    topic_strength = np.asarray(nmf.transform(X).sum(axis=0), dtype=float).ravel()

    #string containing the top 10 words of every topic
    topwords = [
        ' '.join(feature_names[np.argsort(H[i])[::-1][:10]])
        for i in range(nr_topics)
    ]

    #create a df like topic number, topic words, strength ,hesitant share
    df = pd.concat([pd.Series(np.arange(nr_topics)),
                    pd.Series(topwords, dtype=object),
                    pd.Series(topic_strength),
                    pd.Series(np.ravel(sa_norm))],axis=1)
    df.columns = ['topic_index','topic_words','strength','hesitant_s']
    df['strength'] = df['strength'].astype(float)
    return df.sort_values(by=['strength'],ascending=False)
    

#G: general
#H: hesitant 
#S: supporters 
def GHS(C0, cv0, tfidf0, W0, H0, nmf0, tw_idx, time_window, nr_topics = 20):
    """Compute the general, hesitant and supporters topic tables of one time window.

    The rows of the hesitant / supporters tweets are read from the global lists
    ``hesitant_idx`` and ``supporters_idx`` at position ``tw_idx``.

    Parameters
    ----------
    C0 : raw count matrix of the time window.
    cv0 : vocabulary object handed to ``topic_share_df``.
    tfidf0 : fitted tfidf, used to turn ``C0`` into the tfidf matrix.
    W0, H0, nmf0 : document-topic matrix, topic-word matrix and fitted NMF of
        the general topics.
    tw_idx : position of the time window in ``hesitant_idx`` / ``supporters_idx``.
    time_window : label of the time window; not used inside the function.
    nr_topics : number of topics of the two group-specific NMFs.

    Returns
    -------
    (df_G, df_A, df_P) : the tables built by ``topic_share_df`` for the general,
    the hesitant and the supporters topics.

    For every family of topics the hesitant share is
    ``n_P * S_A / (n_P * S_A + n_A * S_P)``, where ``S_A`` / ``S_P`` are the
    column sums of the hesitant / supporters rows of the document-topic matrix
    and ``n_A`` / ``n_P`` are the numbers of hesitant / supporters tweets.
    A topic with zero weight in both groups yields NaN (0/0).
    """
    a_rows = hesitant_idx[tw_idx]
    p_rows = supporters_idx[tw_idx]

    X0 = tfidf0.transform(C0) #general tfidf matrix
    X0A = X0[a_rows] #tfidf matrix of the hesitant tweets only
    X0P = X0[p_rows] #tfidf matrix of the supporters tweets only

    def _discover_topics(X_group, banner):
        #fit a fresh NMF on the tweets of one group; returns the model and its W
        print(banner)
        model = decomposition.NMF(nr_topics,
                                  beta_loss='frobenius', solver='cd',
                                  init='nndsvd', random_state=42)
        W_group = model.fit_transform(X_group)
        clear_output()
        return model, W_group

    nmf0A, W0A = _discover_topics(X0A, 'Running hesitant Topic Discovery...')
    nmf0P, W0P = _discover_topics(X0P, 'Running supporters Topic Discovery...')

    #cross W
    W0A_p = nmf0A.transform(X0P) #supporters matrix transformed with the hesitant fitted nmf
    W0P_a = nmf0P.transform(X0A) #hesitant matrix transformed with the supporters fitted nmf

    #rows of the general W that belong to each group, and the two group sizes
    W0_a = W0[a_rows]
    W0_p = W0[p_rows]
    r_A = W0_a.shape[0] #number of hesitant tweets
    r_P = W0_p.shape[0] #number of supporters tweets

    def _hesitant_share(W_hesitant, W_supporters):
        #each group is weighted by the size of the other one, so that unequal group sizes cancel out
        weighted_a = W_hesitant.sum(axis=0)*r_P
        weighted_p = W_supporters.sum(axis=0)*r_A
        return weighted_a/(weighted_a + weighted_p)

    #####################################|G|####################################
    df_G = topic_share_df(cv0, nmf0, X0, H0, nr_topics, _hesitant_share(W0_a, W0_p))

    #####################################|H|####################################
    df_A = topic_share_df(cv0, nmf0A, X0, nmf0A.components_, nr_topics, _hesitant_share(W0A, W0A_p))

    #####################################|S|####################################
    df_P = topic_share_df(cv0, nmf0P, X0, nmf0P.components_, nr_topics, _hesitant_share(W0P_a, W0P))

    return df_G, df_A, df_P

## Importing NMFs

In [None]:
#importing the vocabularies and the NMFs, three files per time window (i ... vi):
#  counts_vocabulary_*.joblib -> [raw count matrix C, vocabulary cv]
#  WHnmf_*.joblib             -> [W, H, fitted nmf]
#  tfidf_*.joblib             -> fitted tfidf
#NOTE(review): joblib.load unpickles arbitrary objects - only load files coming from a trusted source.
#NOTE(review): presumably the pickled vocabularies reference phrase_analyzer, which would then have
#to be defined before this cell runs - TODO confirm.
[C0, cv0] = joblib.load('/../data/counts_vocabulary_i.joblib')
[W0,H0,nmf0] = joblib.load('/../data/WHnmf_i.joblib')
tfidf0 = joblib.load('/../data/tfidf_i.joblib')

[C1, cv1] = joblib.load('/../data/counts_vocabulary_ii.joblib')
[W1,H1,nmf1] = joblib.load('/../data/WHnmf_ii.joblib')
tfidf1 = joblib.load('/../data/tfidf_ii.joblib')

[C2, cv2] = joblib.load('/../data/counts_vocabulary_iii.joblib')
[W2,H2,nmf2] = joblib.load('/../data/WHnmf_iii.joblib')
tfidf2 = joblib.load('/../data/tfidf_iii.joblib')

[C3, cv3] = joblib.load('/../data/counts_vocabulary_iv.joblib')
[W3,H3,nmf3] = joblib.load('/../data/WHnmf_iv.joblib')
tfidf3 = joblib.load('/../data/tfidf_iv.joblib')

[C4, cv4] = joblib.load('/../data/counts_vocabulary_v.joblib')
[W4,H4,nmf4] = joblib.load('/../data/WHnmf_v.joblib')
tfidf4 = joblib.load('/../data/tfidf_v.joblib')

[C5, cv5] = joblib.load('/../data/counts_vocabulary_vi.joblib')
[W5,H5,nmf5] = joblib.load('/../data/WHnmf_vi.joblib')
tfidf5 = joblib.load('/../data/tfidf_vi.joblib')

## preCOVID

In [None]:
%%time
g0,h0,s0 = GHS(C0, cv0, tfidf0, W0, H0, nmf0, tw_idx=0, time_window='preCOVID',save=False)

In [None]:
#write the three topic tables of time window i to csv (to_csv also writes the row index as first column)
g0.to_csv('/../data/general_i.csv')
h0.to_csv('/../data/hesitant_i.csv')
s0.to_csv('/../data/supporters_i.csv')

#the tables are on disk: drop the references to them
del g0, h0, s0

## earlyCOVID

In [None]:
%%time
#general / hesitant / supporters topic tables of time window ii (tw_idx=1)
g1,h1,s1 = GHS(C1, cv1, tfidf1, W1, H1, nmf1, tw_idx=1, time_window='earlyCOVID')

In [None]:
#write the three topic tables of time window ii to csv (to_csv also writes the row index as first column)
g1.to_csv('/../data/general_ii.csv')
h1.to_csv('/../data/hesitant_ii.csv')
s1.to_csv('/../data/supporters_ii.csv')

#the tables are on disk: drop the references to them
del g1, h1, s1

## preVAX

In [None]:
%%time
#general / hesitant / supporters topic tables of time window iii (tw_idx=2)
g2,h2,s2 = GHS(C2, cv2, tfidf2, W2, H2, nmf2, tw_idx=2, time_window='preVAX')

In [None]:
#write the three topic tables of time window iii to csv (to_csv also writes the row index as first column)
g2.to_csv('/../data/general_iii.csv')
h2.to_csv('/../data/hesitant_iii.csv')
s2.to_csv('/../data/supporters_iii.csv')

#the tables are on disk: drop the references to them
del g2, h2, s2

## earlyVAX

In [None]:
%%time
#general / hesitant / supporters topic tables of time window iv (tw_idx=3)
g3,h3,s3 = GHS(C3, cv3, tfidf3, W3, H3, nmf3, tw_idx=3, time_window='earlyVAX')

In [None]:
#write the three topic tables of time window iv to csv (to_csv also writes the row index as first column)
g3.to_csv('/../data/general_iv.csv')
h3.to_csv('/../data/hesitant_iv.csv')
s3.to_csv('/../data/supporters_iv.csv')

#the tables are on disk: drop the references to them
del g3, h3, s3

## VAXdrive

In [None]:
%%time
#general / hesitant / supporters topic tables of time window v (tw_idx=4)
g4,h4,s4 = GHS(C4, cv4, tfidf4, W4, H4, nmf4, tw_idx=4, time_window='VAXdrive')

In [None]:
#write the three topic tables of time window v to csv (to_csv also writes the row index as first column)
g4.to_csv('/../data/general_v.csv')
h4.to_csv('/../data/hesitant_v.csv')
s4.to_csv('/../data/supporters_v.csv')

#the tables are on disk: drop the references to them
del g4, h4, s4

## lateVAX

In [None]:
%%time
#general / hesitant / supporters topic tables of time window vi (tw_idx=5)
g5,h5,s5 = GHS(C5, cv5, tfidf5, W5, H5, nmf5, tw_idx=5, time_window='lateVAX')

In [None]:
#write the three topic tables of time window vi to csv (to_csv also writes the row index as first column)
g5.to_csv('/../data/general_vi.csv')
h5.to_csv('/../data/hesitant_vi.csv')
s5.to_csv('/../data/supporters_vi.csv')

#the tables are on disk: drop the references to them
del g5, h5, s5