## Loading the Dataset

In [None]:
import pandas as pd
import csv
import numpy as np

# Loading the dataset
# Google Drive sharing link; the file id is the second-to-last path segment.
url_dataset = 'https://drive.google.com/file/d/187OUIh_ATswZHFskChn5jzfB41Od4gVp/view?usp=sharing'
# Rebuild the link in the direct-download form that pandas can read.
url_dataset = 'https://drive.google.com/uc?id={}'.format(url_dataset.rsplit('/', 2)[1])
main_df = pd.read_csv(url_dataset, sep=',')
main_df

Unnamed: 0,ID,Timestamp (DD/MM/YY H:M:S),Tweet URL,Group,Collector,Category,Topic,Keywords,Account Handle,Account Name,...,"No. of Days since Arrival of First Batch of COVID-19 Vaccine Doses Committed by the COVAX Facility (Mar 04, 2021)","No. of Days since First Detected Cases of Omicron Variant in the Philippines (Aug 02, 2022)",Mentioned or Referenced COVID-19 Vaccine Brand,Mentioned or Referenced Other Vaccine/Drugs,Peddled Medical Adverse Side Effect,Distrust in Vaccine Development,"Racial, Religious, Cultural, Economic, or Socio-Political Keywords",Oppressive Keywords,Reviewer,Review
0,27-Jan,30/03/2023 10:23,https://twitter.com/ReplyCarlos1988/status/142...,27,"Castañeda, Rain",HLTH,"COVID-19 vaccines are not effective, can cause...","patay, bakuna",@ReplyCarlos1988,jcarlos88,...,166,-350,Sinovac/CoronaVac,,INEFFICIENCY,YES; Vaccine Trials,Duterte,salot,,
1,27-Feb,30/03/2023 11:43,https://twitter.com/DugAllci/status/1336856170...,27,"Castañeda, Rain",HLTH,"COVID-19 vaccines are not effective, can cause...","bakuna, COVID, bakit",@DugAllci,Đoug |☝️,...,-84,-600,,,ALLERGY,YES; Other Conspiracy,,,,
2,27-Mar,30/03/2023 12:40,https://twitter.com/frenchfernande2/status/136...,27,"Castañeda, Rain",HLTH,"COVID-19 vaccines are not effective, can cause...","astra, COVID, bakuna",@frenchfernande2,Morning Girl,...,0,-516,Sinovac/CoronaVac,,INEFFICIENCY,YES; Vaccine Distribution,"WHO, Covax Facility",inutil,,
3,27-Apr,30/03/2023 14:18,https://twitter.com/PaladinWars/status/1395323...,27,"Castañeda, Rain",HLTH,"COVID-19 vaccines are not effective, can cause...","astra, COVID, bakuna",@PaladinWars,Narcos Nightbreaker,...,77,-439,Sinovac/CoronaVac,,DEATH,YES; Vaccine Trials,"mga bansa, sa Pinas",,,
4,27-May,30/03/2023 15:00,https://twitter.com/6Vixxlight/status/12661783...,27,"Castañeda, Rain",HLTH,"COVID-19 vaccines are not effective, can cause...","side effect, bakuna",@6Vixxlight\n,🌱 VIXXlight ⁶ #VIXXIsSIX 🌺,...,-279,-795,,,OTHER AILMENT skin-turned-black,YES; Vaccine Trials,China,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,27-152,19/04/2023 11:07,https://twitter.com/k4ats/status/1490686441084...,27,"Castañeda, Rain",HLTH,"COVID-19 vaccines are not effective, can cause...","vaccine, masama, itigil",@k4ats,klk,...,340,-176,,,OTHER AILMENT health-degradation,YES; Other Conspiracy,Filipinos,,,
152,27-153,19/04/2023 11:15,https://twitter.com/k4ats/status/1490683118486...,27,"Castañeda, Rain",HLTH,"COVID-19 vaccines are not effective, can cause...","vax, not safe, COVID",@k4ats,klk,...,340,-176,,,INEFFICIENCY,YES; Vaccine Trials,,,,
153,27-154,19/04/2023 11:21,https://twitter.com/k4ats/status/1486089988026...,27,"Castañeda, Rain",HLTH,"COVID-19 vaccines are not effective, can cause...","pfizer, moderna, vaccines",@k4ats,klk,...,328,-188,Multiple,,DEATH,YES; Other Conspiracy,Bill Gates,,,
154,27-155,19/04/2023 11:39,https://twitter.com/k4ats/status/1484804948487...,27,"Castañeda, Rain",HLTH,"COVID-19 vaccines are not effective, can cause...","vaccine, death, COVID",@k4ats,klk,...,324,-192,,,DEATH,YES; Other Conspiracy,"our country, politician",criminals,,


## Preparing the Data

In [None]:
#Copying the main DataFrame without its empty rows
r = main_df.shape[0]
print(r)
# Rows without a timestamp are treated as empty. The index is renumbered
# afterwards because the later cells address rows as 0..len(df)-1; leaving
# gaps in the labels would make those lookups fail or append new rows.
df = (
    main_df
    .dropna(subset=['Timestamp (DD/MM/YY H:M:S)'])
    .reset_index(drop=True)
)
#dropping the negligible columns from the main DataFrame copy
df = df.drop(columns=['ID', 'Timestamp (DD/MM/YY H:M:S)', 'Tweet URL', 'Group', 'Collector', 'Category', 'Topic', 'Screenshot', 'Reviewer', 'Review'])

#function for handling missing values
def missing_values_handler():
    """Fill the known gaps of the global ``df`` in place.

    Prints the names of the columns that contain missing values, fills
    free-text columns with an empty string and the listed nominal and
    count columns with 0, then prints the remaining null count per column.
    """
    has_gaps = df.isna().any()
    print(list(df.columns[has_gaps]))

    #each group of columns is paired with the value used to fill its gaps
    fill_plan = (
        #unstructured textual columns
        (['Account Bio', 'Tweet Translated', 'Remarks', 'Racial, Religious, Cultural, Economic, or Socio-Political Keywords', 'Oppressive Keywords'], ''),
        #nominal columns
        (['Mentioned or Referenced COVID-19 Vaccine Brand', 'Mentioned or Referenced Other Vaccine/Drugs'], 0),
        #ratio (count) columns
        (['Quote Tweets', 'Views'], 0),
    )
    for column_group, filler in fill_plan:
        df[column_group] = df[column_group].fillna(filler)

    print(df.isna().sum())
missing_values_handler()

#function for ensuring formatting consistency
def formatting_handler():
    """Give every column of the global ``df`` a consistent dtype, in place.

    * nominal and free-text columns become pandas ``string`` columns,
    * "days since ..." interval columns become ``int64``,
    * count columns have their thousands separators removed and become
      ``float64``.

    'Joined (MM/YYYY)' and 'Date Posted (DD/MM/YY H:M:S)' are left as loaded;
    no datetime parsing is applied here.
    """
    #categorizing list of all columns according to data levels
    unstruct_textual_data_columns = ['Keywords', 'Account Handle', 'Account Name', 'Account Bio', 'Tweet', 'Tweet Translated', 'Reasoning', 'Remarks', 'Racial, Religious, Cultural, Economic, or Socio-Political Keywords', 'Oppressive Keywords']
    nominal_data_columns = ['Account Type', 'Location', 'Tweet Type', 'Content Type', 'Rating', 'Mentioned or Referenced COVID-19 Vaccine Brand', 'Mentioned or Referenced Other Vaccine/Drugs', 'Peddled Medical Adverse Side Effect', 'Distrust in Vaccine Development']
    interval_data_columns = ['No. of Days since Philippines Joined the COVAX Facility (Jul 24, 2020)', 'No. of Days since FDA Approved the First COVID-19 Vaccine (Dec 11, 2020)', 'No. of Days since Arrival of First Batch of COVID-19 Vaccine Doses Committed by the COVAX Facility (Mar 04, 2021)', 'No. of Days since First Detected Cases of Omicron Variant in the Philippines (Aug 02, 2022)']
    rational_data_columns = ['Following', 'Followers', 'Likes', 'Replies', 'Retweets', 'Quote Tweets', 'Views']

    #converting and ensuring string columns
    df[nominal_data_columns] = df[nominal_data_columns].astype('string')

    #converting and ensuring numeric columns
    df[interval_data_columns] = df[interval_data_columns].astype('int64')
    # Strip thousands separators column-wise. Working on whole columns avoids
    # writing strings cell by cell into numeric columns and does not depend
    # on the row labels being 0..n-1.
    df[rational_data_columns] = (
        df[rational_data_columns]
        .apply(lambda counts: counts.astype(str).str.replace(',', '', regex=False))
        .astype('float64')
    )

    #converting and ensuring unstructured textual data columns
    df[unstruct_textual_data_columns] = df[unstruct_textual_data_columns].astype('string')
formatting_handler()
print("Number of tweets:", len(df), "\n")

#python dictionaries for mapping nominal data to int values
#(each one is looked up by categ_data_encoder for the column it is named after)

#account type label -> code
account_types_dict = {'Identified': 1, 'Anonymous': 2, 'Media': 3}
#location label -> code; several spellings of the same place share one code.
#categ_data_encoder assigns 18 to any location that is not listed here and
#handles 'NCR, <city>' values by looking up the part before the comma.
locations_dict = {'Philippines': 0, 'Republic of the Philippines': 0,
                  'NCR': 1, 'National Capital Region': 1, 'National Capital Region, Repub': 1,
                  'CAR': 2, 'Cordillera Administrative Region': 2,
                  'Region I': 3, 'Urdaneta, Pangasinan': 3, 'Ilocos Region': 3,
                  'Region II': 4, 'Cagayan Valley': 4,
                  'Region III': 5, 'Central Luzon': 5, 'San Antonio, Central Luzon': 5,
                  'Region IV-A': 6, 'CALABARZON': 6, 'Cavite Philippines': 6,
                  'Region IV-B': 7, 'MIMAROPA': 7,
                  'Region V': 8, 'Bicol Region': 8,
                  'Region VI': 9,
                  'Region VII': 10, 'Cebu City, Central Visayas': 10,
                  'Region VIII': 11,
                  'Region IX': 12, 'Zamboanga Peninsula': 12,
                  'Region X': 13, 'Region XI': 14,
                  'Region XII': 15, 'Isulan, SOCCSKSARGEN': 15, 'SOCCSKSARGEN': 15,
                  'Region XIII': 16, 'Caraga Region': 16,
                  'BARMM': 17, 'ARMM': 17, 'Bangsamoro': 17}
#tweet type label -> code
tweet_types_dict = {'Text': 1, 'Image': 2, 'Video': 3, 'URL': 4, 'Reply': 5}
#content type label -> code (multi-label cells are encoded label by label)
content_types_dict = {'Rational': 1, 'Emotional': 2, 'Transactional': 3}
#rating label -> code
#NOTE(review): the '0' key presumably covers a stringified fill value - confirm
ratings_dict = {'0': 0, 'FAKE': 1, 'FALSE': 2,
                'MISLEADING': 3, 'UNPROVEN': 4,
                'INACCURATE': 5, 'NEED CONTEXT': 6,
                'FLIPFLOP': 7}
#vaccine brand label -> code; '0' is what the fillna(0) applied by
#missing_values_handler becomes once the column is cast to string
vaccine_brands_dict = {'None': 0, '0': 0, 'Pfizer': 1,
                       'AstraZeneca': 2, 'Sinopharm': 3, 'Covaxin': 4,
                       'Sinovac/CoronaVac': 5, 'Gamaleya/Sputnik': 6, 'Janssen/Jcovden': 7,
                       'Moderna/Spikevax': 8, 'Novavax/Covovax': 9, 'Multiple': 10}
#other vaccine/drug label -> code ('0' has the same origin as above);
#'Antibiotics' and 'Amoxicillin' share a code
other_vaccine_drugs_dict = {'None': 0, '0': 0, 'Dengvaxia': 1,
                            'Ivermectin': 2, 'Flu Vaccine': 3,
                            'Antibiotics': 4, 'Amoxicillin': 4}
#peddled side effect label -> code; 'POISONING' and 'POISONOUS' share a code
vaccine_side_effects_dict = {'INFECTION': 1, 'ALLERGY': 2,
                             'DEATH': 3, 'COVID ITSELF': 4,
                             'POISONING': 5, 'POISONOUS': 5,
                             'OTHER AILMENT': 6, 'INEFFICIENCY': 7}
#distrust label -> code
vaccine_distrusts_dict = {'NO': 0, 'YES; Vaccine Content': 1, 'YES; Vaccine Trials': 2,
                          'YES; Vaccine Distribution': 3, 'YES; Other Conspiracy': 4}

#lists for storing substrings under specific categorical data
#(appended to by categ_data_encoder while it encodes the columns)
addntl_locs_NCR_cities = []          #city part of 'NCR, <city>' locations
addntl_locs_international = []       #locations that are not in locations_dict
addntl_peddled_side_effects = []     #words following 'OTHER AILMENT'

#function for categorical data encoding
df_non_categ = df.copy()
def categ_data_encoder():
    """Replace the nominal labels of the global ``df`` with integer codes.

    Every nominal column is encoded cell by cell through its lookup
    dictionary and finally cast to ``int64``. Side information that has no
    code of its own is appended to the module-level lists
    ``addntl_locs_NCR_cities``, ``addntl_locs_international`` and
    ``addntl_peddled_side_effects``.
    """
    nominal_data_columns = ['Account Type', 'Location', 'Tweet Type', 'Content Type', 'Rating', 'Mentioned or Referenced COVID-19 Vaccine Brand', 'Mentioned or Referenced Other Vaccine/Drugs', 'Peddled Medical Adverse Side Effect', 'Distrust in Vaccine Development']

    #columns whose cells are encoded by one plain dictionary lookup
    plain_lookups = {
        'Account Type': account_types_dict,
        'Tweet Type': tweet_types_dict,
        'Rating': ratings_dict,
        'Mentioned or Referenced COVID-19 Vaccine Brand': vaccine_brands_dict,
        'Mentioned or Referenced Other Vaccine/Drugs': other_vaccine_drugs_dict,
        'Distrust in Vaccine Development': vaccine_distrusts_dict,
    }

    def encode_location(place):
        #anything that is neither a known label nor 'NCR, <city>' gets code 18
        if place not in locations_dict and 'NCR, ' not in place:
            addntl_locs_international.append(place)
            return 18
        if 'NCR, ' in place:
            #keep the city aside and encode the part before the comma
            pieces = place.split(', ')
            addntl_locs_NCR_cities.append(pieces[1])
            return locations_dict.get(pieces[0])
        return locations_dict.get(place)

    def encode_side_effect(label):
        #'OTHER AILMENT <words>': store the words, encode the generic label
        if 'OTHER AILMENT' in label:
            addntl_peddled_side_effects.extend(label.split(' ')[2:])
            return vaccine_side_effects_dict.get('OTHER AILMENT')
        return vaccine_side_effects_dict.get(label)

    def join_codes(cell, encode_one):
        #a multi-label cell ('A, B') becomes the concatenation of its codes
        return ''.join(str(encode_one(label)) for label in cell.split(', '))

    for column in nominal_data_columns:
        #NOTE(review): rows are addressed by the labels 0..len(df)-1
        for row in range(len(df.index)):
            cell = df[column][row]

            if column == 'Location':
                code = encode_location(cell)
            elif column == 'Content Type':
                code = join_codes(cell, content_types_dict.get)
            elif column == 'Peddled Medical Adverse Side Effect':
                code = join_codes(cell, encode_side_effect)
            else:
                code = plain_lookups[column].get(cell)

            df.loc[row, column] = str(code)
    df[nominal_data_columns] = df[nominal_data_columns].astype('int64')
categ_data_encoder()

# String Series of the extra labels collected while encoding
ser_addntl_locs_NCR_cities = pd.Series(list(addntl_locs_NCR_cities)).astype('string')
ser_addntl_locs_international = pd.Series(list(addntl_locs_international)).astype('string')
ser_addntl_peddled_side_effects = pd.Series(list(addntl_peddled_side_effects)).astype('string')
df.head()

156
['Account Bio', 'Tweet Translated', 'Quote Tweets', 'Views', 'Remarks']
Keywords                                                                                                             0
Account Handle                                                                                                       0
Account Name                                                                                                         0
Account Bio                                                                                                          0
Account Type                                                                                                         0
Joined (MM/YYYY)                                                                                                     0
Following                                                                                                            0
Followers                                                                                  

Unnamed: 0,Keywords,Account Handle,Account Name,Account Bio,Account Type,Joined (MM/YYYY),Following,Followers,Location,Context,...,"No. of Days since Philippines Joined the COVAX Facility (Jul 24, 2020)","No. of Days since FDA Approved the First COVID-19 Vaccine (Dec 11, 2020)","No. of Days since Arrival of First Batch of COVID-19 Vaccine Doses Committed by the COVAX Facility (Mar 04, 2021)","No. of Days since First Detected Cases of Omicron Variant in the Philippines (Aug 02, 2022)",Mentioned or Referenced COVID-19 Vaccine Brand,Mentioned or Referenced Other Vaccine/Drugs,Peddled Medical Adverse Side Effect,Distrust in Vaccine Development,"Racial, Religious, Cultural, Economic, or Socio-Political Keywords",Oppressive Keywords
0,"patay, bakuna",@ReplyCarlos1988,jcarlos88,,2,Apr-09,824.0,591.0,0,Socio-political,...,389,249,166,-350,5,0,7,2,Duterte,salot
1,"bakuna, COVID, bakit",@DugAllci,Đoug |☝️,Patriot Games,2,Nov-11,396.0,358.0,6,,...,139,-1,-84,-600,0,0,2,4,,
2,"astra, COVID, bakuna",@frenchfernande2,Morning Girl,I love my country second to our Lord God,2,Jan-20,3295.0,2212.0,0,Socio-political,...,223,83,0,-516,5,0,7,3,"WHO, Covax Facility",inutil
3,"astra, COVID, bakuna",@PaladinWars,Narcos Nightbreaker,Shagidi Shagidi #ShaBongbong,2,Sep-12,467.0,51.0,0,Economic,...,300,160,77,-439,5,0,3,2,"mga bansa, sa Pinas",
4,"side effect, bakuna",@6Vixxlight,🌱 VIXXlight ⁶ #VIXXIsSIX 🌺,I'm the MatPat of the Starlight Community and ...,2,Aug-17,331.0,373.0,0,Racial,...,-56,-196,-279,-795,0,0,6,2,China,


In [None]:
#functions for normalization, standardization, and scaling
def data_minmax_scaler(datafr, columns=None):
    """Rescale numeric columns to the [0, 1] range (min-max normalization).

    Parameters
    ----------
    datafr : pandas.DataFrame
        Frame to scale. It is modified in place, so pass a copy to keep the
        original values.
    columns : list of str, optional
        Columns to scale. Defaults to the interval ("No. of Days since ...")
        and ratio (engagement count) columns of the tweet dataset.

    Returns
    -------
    pandas.DataFrame
        The same ``datafr`` object, with the selected columns as floats.
    """
    if columns is None:
        interval_data_columns = ['No. of Days since Philippines Joined the COVAX Facility (Jul 24, 2020)', 'No. of Days since FDA Approved the First COVID-19 Vaccine (Dec 11, 2020)', 'No. of Days since Arrival of First Batch of COVID-19 Vaccine Doses Committed by the COVAX Facility (Mar 04, 2021)', 'No. of Days since First Detected Cases of Omicron Variant in the Philippines (Aug 02, 2022)']
        rational_data_columns = ['Following', 'Followers', 'Likes', 'Replies', 'Retweets', 'Quote Tweets', 'Views']
        columns = interval_data_columns + rational_data_columns

    for col in columns:
        x_min = datafr[col].min()
        value_range = datafr[col].max() - x_min
        shifted = datafr[col] - x_min
        # Whole-column arithmetic works for any row index and replaces the
        # column, so integer columns become float without per-cell writes.
        # A constant column has zero range; map it to 0.0 instead of 0/0.
        datafr[col] = shifted * 0.0 if value_range == 0 else shifted / value_range
    return datafr

def data_standardizer(datafr, columns=None):
    """Standardize numeric columns to z-scores: (x - mean) / std.

    The z-score is applied to every selected column regardless of how its
    values are distributed. ``std`` is the pandas default (sample standard
    deviation, ``ddof=1``).

    Parameters
    ----------
    datafr : pandas.DataFrame
        Frame to standardize. It is modified in place, so pass a copy to
        keep the original values.
    columns : list of str, optional
        Columns to standardize. Defaults to the interval ("No. of Days
        since ...") and ratio (engagement count) columns of the tweet dataset.

    Returns
    -------
    pandas.DataFrame
        The same ``datafr`` object, with the selected columns as floats.
    """
    if columns is None:
        interval_data_columns = ['No. of Days since Philippines Joined the COVAX Facility (Jul 24, 2020)', 'No. of Days since FDA Approved the First COVID-19 Vaccine (Dec 11, 2020)', 'No. of Days since Arrival of First Batch of COVID-19 Vaccine Doses Committed by the COVAX Facility (Mar 04, 2021)', 'No. of Days since First Detected Cases of Omicron Variant in the Philippines (Aug 02, 2022)']
        rational_data_columns = ['Following', 'Followers', 'Likes', 'Replies', 'Retweets', 'Quote Tweets', 'Views']
        columns = interval_data_columns + rational_data_columns

    for col in columns:
        centered = datafr[col] - datafr[col].mean()
        std = datafr[col].std()
        # Whole-column arithmetic works for any row index and replaces the
        # column, so integer columns become float without per-cell writes.
        # A constant column has zero spread; map it to 0.0 instead of 0/0.
        datafr[col] = centered * 0.0 if std == 0 else centered / std
    return datafr

def data_power_transformer(datafr):
    """Apply a Yeo-Johnson power transform to the numeric columns.

    Each interval ("No. of Days since ...") and ratio (engagement count)
    column is replaced by the output of ``scipy.stats.yeojohnson`` called on
    that column alone; the fitted lambda is discarded. ``datafr`` is
    modified in place and also returned.
    """
    from scipy.stats import yeojohnson

    numeric_columns = [
        # interval columns
        'No. of Days since Philippines Joined the COVAX Facility (Jul 24, 2020)',
        'No. of Days since FDA Approved the First COVID-19 Vaccine (Dec 11, 2020)',
        'No. of Days since Arrival of First Batch of COVID-19 Vaccine Doses Committed by the COVAX Facility (Mar 04, 2021)',
        'No. of Days since First Detected Cases of Omicron Variant in the Philippines (Aug 02, 2022)',
        # ratio columns
        'Following', 'Followers', 'Likes', 'Replies', 'Retweets', 'Quote Tweets', 'Views',
    ]

    for column in numeric_columns:
        transformed, _fitted_lambda = yeojohnson(datafr[column])
        datafr[column] = transformed
    return datafr

def data_unit_vector_scaler(datafr, columns=None):
    """Scale numeric columns to unit Euclidean (L2) length.

    Each selected column is divided by its own ``np.linalg.norm``.

    Parameters
    ----------
    datafr : pandas.DataFrame
        Frame to scale. It is modified in place, so pass a copy to keep the
        original values.
    columns : list of str, optional
        Columns to scale. Defaults to the interval ("No. of Days since ...")
        and ratio (engagement count) columns of the tweet dataset.

    Returns
    -------
    pandas.DataFrame
        The same ``datafr`` object, with the selected columns as floats.
    """
    if columns is None:
        interval_data_columns = ['No. of Days since Philippines Joined the COVAX Facility (Jul 24, 2020)', 'No. of Days since FDA Approved the First COVID-19 Vaccine (Dec 11, 2020)', 'No. of Days since Arrival of First Batch of COVID-19 Vaccine Doses Committed by the COVAX Facility (Mar 04, 2021)', 'No. of Days since First Detected Cases of Omicron Variant in the Philippines (Aug 02, 2022)']
        rational_data_columns = ['Following', 'Followers', 'Likes', 'Replies', 'Retweets', 'Quote Tweets', 'Views']
        columns = interval_data_columns + rational_data_columns

    for col in columns:
        euclid_norm = np.linalg.norm(datafr[col])
        # Whole-column arithmetic works for any row index and replaces the
        # column, so integer columns become float without per-cell writes.
        # An all-zero column has zero norm; keep it at 0.0 instead of 0/0.
        datafr[col] = datafr[col] * 0.0 if euclid_norm == 0 else datafr[col] / euclid_norm
    return datafr

# Each scaler receives its own copy, so `df` itself keeps the unscaled values.
df_minmax_scaled = data_minmax_scaler(df.copy())
df_standardized = data_standardizer(df.copy())
df_power_transformed = data_power_transformer(df.copy())
df_unit_vector_scaled = data_unit_vector_scaler(df.copy())

# Preview one interval and one ratio column of every scaled frame.
for scaled_frame in (df_minmax_scaled, df_standardized, df_power_transformed, df_unit_vector_scaled):
    print(scaled_frame[['No. of Days since Philippines Joined the COVAX Facility (Jul 24, 2020)', 'Following']])


     No. of Days since Philippines Joined the COVAX Facility (Jul 24, 2020)  \
0                                             0.454082                        
1                                             0.198980                        
2                                             0.284694                        
3                                             0.363265                        
4                                             0.000000                        
..                                                 ...                        
151                                           0.631633                        
152                                           0.631633                        
153                                           0.619388                        
154                                           0.615306                        
155                                           0.057143                        

     Following  
0     0.194005  
1     0.092990  


## Topic Clustering

### Performing clustering

In [None]:
%%capture
# !pip install pyLDAvis
!pip install emoji --upgrade

# Initialize NLP components
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

from textblob import TextBlob

!pip install pyspellchecker
from spellchecker import SpellChecker

# Fetch the NLTK resources used by the tokenizer below.
nltk.download('punkt')
# Recent NLTK releases load the sentence tokenizer from 'punkt_tab' instead of
# 'punkt'; without it word_tokenize raises LookupError on a fresh environment.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
spell = SpellChecker()


# Topic modeling via LDA
# Source: https://www.kaggle.com/code/infamouscoder/lda-topic-modeling-features
import re
import emoji

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import decomposition

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Custom tokenizer
def tokenizer(text):
  """Turn a tweet into a list of cleaned, lemmatized tokens.

  Emojis, URLs and every character that is neither a word character nor
  whitespace (i.e. punctuation) are removed before tokenizing. Tokens of
  three characters or fewer are dropped, the rest are lemmatized, and
  lemmas whose lowercase form is an English stop word are discarded.
  """
  without_emoji = emoji.replace_emoji(text, replace='')
  without_urls = re.sub(r"http\S+", "", without_emoji)
  cleaned = re.sub(r"[^\w\s]", "", without_urls)

  kept_tokens = []
  for word in word_tokenize(cleaned):
    if len(word) <= 3:          # keep only words of 4+ characters
      continue
    lemma = lemmatizer.lemmatize(word)
    if lemma.lower() in stop_words:
      continue
    kept_tokens.append(lemma)
  return kept_tokens

# Generate TF-IDF features from the tweets with the custom tokenizer
tf_vectorizer = TfidfVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
    max_df=0.75,
    max_features=10000,
    use_idf=True,
    norm=None,
)
tf_vectors = tf_vectorizer.fit_transform(df['Tweet'])

# Fit an LDA model with n_topics topics
n_topics = 5
lda = decomposition.LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=10,
    learning_method='online',
    learning_offset=50,
    n_jobs=1,
    random_state=42,
)
W = lda.fit_transform(tf_vectors)   # one row of topic weights per tweet
H = lda.components_                 # one row of word weights per topic

# Show the num_words highest-weighted words of each topic
num_words = 15
vocab = np.array(tf_vectorizer.get_feature_names_out())

def top_words(t):
  """Return the num_words vocabulary entries with the largest weight in t."""
  strongest_first = np.argsort(t)[:-num_words-1:-1]
  return [vocab[i] for i in strongest_first]

topic_words = [top_words(t) for t in H]
topics = [' '.join(t) for t in topic_words]
df_topics = pd.DataFrame({'Keywords': topics,
                          'Topic ID': range(1, len(topics) + 1)})
print(df_topics)

### Visualizing the model

In [None]:
# Assign topic to each tweet
topicid = [f"Topic{number}" for number in range(1, lda.n_components + 1)]
tweetid = [f"Tweet{number}" for number in range(1, len(df['Tweet']) + 1)]

# Rounded topic weights: one row per tweet, one column per topic
df_topics_lda = pd.DataFrame(np.round(W,2), columns=topicid, index=tweetid)
significanttopic = df_topics_lda.values.argmax(axis=1) + 1

df_topics_lda['dominant_topic'] = significanttopic

def _topic_breakdown(row):
  """List the topics with a positive weight for one tweet, strongest first."""
  ranked = sorted(df_topics_lda.columns, key=lambda col: row[col], reverse=True)
  return '\n'.join(f'{col}: {row[col]}'
                   for col in ranked
                   if row[col] > 0 and col != 'dominant_topic')

df_topics_lda['breakdown'] = df_topics_lda.apply(_topic_breakdown, axis=1)
df_topics_lda.head(10)

Unnamed: 0,Topic1,Topic2,Topic3,Topic4,Topic5,dominant_topic,breakdown
Tweet1,0.96,0.0,0.0,0.03,0.0,1,Topic1: 0.96\nTopic4: 0.03
Tweet2,0.0,0.99,0.0,0.0,0.0,2,Topic2: 0.99
Tweet3,0.0,0.99,0.0,0.0,0.0,2,Topic2: 0.99
Tweet4,0.0,0.83,0.0,0.17,0.0,2,Topic2: 0.83\nTopic4: 0.17
Tweet5,0.0,0.0,0.0,0.0,0.99,5,Topic5: 0.99
Tweet6,0.0,0.0,0.0,0.99,0.0,4,Topic4: 0.99
Tweet7,0.0,0.99,0.0,0.0,0.0,2,Topic2: 0.99
Tweet8,0.0,0.0,0.0,0.99,0.0,4,Topic4: 0.99
Tweet9,0.0,0.99,0.0,0.0,0.0,2,Topic2: 0.99
Tweet10,0.0,0.0,0.0,0.0,0.99,5,Topic5: 0.99


In [None]:
# Visualize topics
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import plotly.express as px

# Select the topic-weight columns by name rather than by position, so the
# features stay correct if the number of LDA topics is changed.
topic_weights = df_topics_lda[topicid]

# Apply t-SNE for dimensionality reduction
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(topic_weights)

# Apply K-means clustering with one cluster per LDA topic
n_topics = len(topicid)
kmeans = KMeans(n_clusters=n_topics, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(topic_weights)

In [None]:
# Create a new dataframe with t-SNE coordinates and cluster labels
import textwrap

def split_text(text, max_length):
  """Wrap text to max_length columns and join the lines with '<br>'.

  Words longer than max_length are kept whole instead of being split.
  """
  wrapper = textwrap.TextWrapper(width=max_length, break_long_words=False)
  return "<br>".join(wrapper.wrap(text))

# One row per tweet: t-SNE coordinates, wrapped tweet text, dominant LDA topic
# and the per-topic breakdown. Line breaks use <br> for the plot hover text.
# (Use cluster_labels for 'Cluster' to colour by K-means clusters instead.)
df_topics_cluster = pd.DataFrame({
    'X': tsne_result[:, 0],
    'Y': tsne_result[:, 1],
    'Tweet': df['Tweet'].apply(lambda x: split_text(x, 40)),
    'Cluster': df_topics_lda.reset_index()['dominant_topic'].astype(str),
    'Breakdown': df_topics_lda.reset_index()['breakdown'].str.replace('\n','<br>'),
})

print(len(df_topics_cluster))
df_topics_cluster.head(10)


156


Unnamed: 0,X,Y,Tweet,Cluster,Breakdown
0,-82.792419,192.973587,#DuterteTraydor Patay na mga Pilipino<br>dahil...,1,Topic1: 0.96<br>Topic4: 0.03
1,179.84668,164.874649,"Shocking pare. Natrangkaso(covid) na<br>ako, b...",2,Topic2: 0.99
2,179.84668,164.874649,Tanong lang bakit lahat ng vaccines<br>dadaan ...,2,Topic2: 0.99
3,114.987198,35.548676,yung mga bansa na puro Pfizer at Moderna<br>at...,2,Topic2: 0.83<br>Topic4: 0.17
4,-274.021271,4.593821,"Oh yes, hintayin natin ang bakuna ng<br>China ...",5,Topic5: 0.99
5,250.000961,-161.356628,Bakit may pag-aalinlangan sa bakunang<br>galin...,4,Topic4: 0.99
6,179.84668,164.874649,Sinong niloko mo. Alam naman natin na sa<br>BF...,2,Topic2: 0.99
7,250.000961,-161.356628,Ganun na nga po! Ang kaso ni isang page<br>ng ...,4,Topic4: 0.99
8,179.84668,164.874649,ang problema dyan hindi pa proven and<br>baka ...,2,Topic2: 0.99
9,-274.021271,4.593821,We need to review efficacy and safety<br>data....,5,Topic5: 0.99


In [None]:
# Plot tweets as colored points, ordered by numeric topic id so the legend
# and the traces follow the topic order
df_topics_cluster.sort_values('Cluster', key=lambda x: pd.to_numeric(x, errors='coerce'), inplace=True)

fig = px.scatter(
    df_topics_cluster,
    x='X',
    y='Y',
    color='Cluster',
    symbol='Cluster',
    title='Topic Clustering using LDA and t-SNE',
    hover_name='Tweet',
    hover_data={'X': False, 'Y': False, 'Cluster': False, 'Tweet': False, 'Breakdown': True},
)

# Write each topic's keywords below the plot, in the colour of its trace.
# NOTE(review): fig.data is indexed by topic position, which assumes every
# topic is the dominant topic of at least one tweet - confirm.
for topic_no, keyword in enumerate(df_topics['Keywords'], start=1):
  trace = fig.data[topic_no - 1]
  fig.add_annotation(
    text=f"Topic {topic_no}: {', '.join(keyword.split(' '))}",
    x=0,
    y=-0.2*((topic_no - 1)/5)-0.1,
    xref='paper',
    yref='paper',
    align='left',
    showarrow=False,
    font={'color': trace.marker['color']},
  )

fig.update_traces(marker_size=15)

fig.update_layout(
    height=650,
    xaxis_title='',
    yaxis_title='',
    margin={'b': 150, 'r': 150},
    paper_bgcolor='#2c3e50',
    title={'font': {'color': 'white', 'size': 30}},
    legend={'title': "Topic", 'font': {'color': 'white'}},
    font={'size': 15},
)
fig.show()

## Statistical Analysis (Chi-square Test)

In [None]:
# Context categories compared by the test, in reporting order
context_labels = ['Socio-political', 'Economic', 'Racial', 'Religion', 'Culture-based', 'None']

# pandas >= 2.0 parses the text "None" as a missing value when reading the CSV;
# mapping missing contexts back to 'None' keeps that category countable.
# NOTE(review): assumes a missing Context always means 'None' - confirm.
tweet_contexts = df['Context'].fillna('None')

# Likes received by the tweets of each context
likes_by_context = {
    label: df.loc[tweet_contexts == label, 'Likes'].dropna()
    for label in context_labels
}
print("Total Likes: ", df['Likes'].sum())
for label, likes in likes_by_context.items():
    print(f"{label}: ", likes.sum())
(socio_pol_likes, econ_likes, racial_likes,
 religion_likes, culture_likes, none_likes) = likes_by_context.values()

print()

# Make new Dataframe for the number of tweets for each context
print("Total Tweets:")
context_counts = tweet_contexts.value_counts()
print(context_counts)
# reindex supplies a 0 for any context without tweets instead of raising KeyError
observed_counts = context_counts.reindex(context_labels, fill_value=0)
data = {
    'Observed': observed_counts.tolist(),
    # uniform expectation: the observed tweets spread evenly over the contexts,
    # so observed and expected totals always match
    'Expected': [observed_counts.sum() / len(context_labels)] * len(context_labels)
    }
tweets_df = pd.DataFrame(data, index=context_labels)
print(tweets_df)

Total Likes:  495.0
Socio-political:  58.0
Economic:  28.0
Racial:  142.0
Religion:  0.0
Culture-based:  2.0
None:  265.0

Total Tweets:
None               69
Racial             32
Socio-political    27
Economic           23
Culture-based       5
Name: Context, dtype: int64
                 Observed  Expected
Socio-political        27        26
Economic               23        26
Racial                 32        26
Religion                0        26
Culture-based           5        26
None                   69        26


In [None]:
import scipy.stats as stats

# Chi-square goodness-of-fit test: compare the observed number of tweets per
# context with the expected number stored next to it in tweets_df.
observed_counts = tweets_df['Observed']
expected_counts = tweets_df['Expected']
chi2, p_value = stats.chisquare(f_obs=observed_counts, f_exp=expected_counts)

# Report the test statistic and its p-value
print(f"Chi-square statistic: {chi2}")
print(f"P-value: {p_value}")

Chi-square statistic: 115.84615384615384
P-value: 2.377748360541489e-23


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Plot the distributions
# NOTE(review): contingency_table is not created in this section of the
# notebook -- confirm an earlier cell defines it before this cell runs.
# NOTE(review): id_vars='index' assumes reset_index() names the former index
# column 'index' (i.e. the table's index is unnamed) -- TODO confirm.
chi2_table = contingency_table.reset_index()

# Reshape to long form: one row per (vaccine brand, distrust category) pair.
# var_name must be a scalar for ordinary (non-MultiIndex) columns; recent
# pandas versions reject a list here.
chi2_table = pd.melt(chi2_table, id_vars='index', var_name='Distrust in Vaccine Development', value_name='Count')
chi2_table = chi2_table.rename(columns={'index': 'Vaccine Brand'})

# Encode the distrust categories as integers for the plot legend
chi2_table = chi2_table.replace({'NO': 0, 'YES; Vaccine Content': 1, 'YES; Vaccine Trials': 2,'YES; Vaccine Distribution': 3, 'YES; Other Conspiracy': 4})

sns.barplot(x='Vaccine Brand', y='Count', hue='Distrust in Vaccine Development', data=chi2_table, linewidth=4)

plt.xlabel("Vaccine Brand")
plt.ylabel("Count")
# Title describes the plotted variables (the previous title was left over
# from an unrelated Titanic example).
plt.title("Vaccine Brand vs. Distrust in Vaccine Development", fontsize=18, pad=20)
plt.show()

## Statistical Analysis (ANOVA)

In [None]:
from scipy.stats import f_oneway

print("Total Likes: ", df['Likes'].sum())

# Select the variables for the ANOVA test: the likes of the tweets in each
# context, with missing like counts dropped.
socio_pol_likes = df.loc[df['Context']=='Socio-political', 'Likes'].dropna()
print("Socio-political: ", socio_pol_likes.sum())

econ_likes = df.loc[df['Context']=='Economic', 'Likes'].dropna()
print("Economic: ", econ_likes.sum())

racial_likes = df.loc[df['Context']=='Racial', 'Likes'].dropna()
print("Racial: ", racial_likes.sum())

religion_likes = df.loc[df['Context']=='Religion', 'Likes'].dropna()
print("Religion: ", religion_likes.sum())

culture_likes = df.loc[df['Context']=='Culture-based', 'Likes'].dropna()
print("Culture-based: ", culture_likes.sum())

none_likes = df.loc[df['Context']=='None', 'Likes'].dropna()
print("None: ", none_likes.sum())


# Perform the ANOVA test
# Only contexts that actually contain tweets are compared: a context with no
# observations cannot contribute a group mean. Filtering here (rather than
# omitting a group by hand) keeps the test correct if the data changes.
anova_groups = [
    likes
    for likes in (socio_pol_likes, econ_likes, racial_likes, religion_likes, culture_likes, none_likes)
    if len(likes) > 0
]
f_statistic, p_value = f_oneway(*anova_groups)

# Print the results
print("F-Statistic:", f_statistic)
print("P-value:", p_value)

Total Likes:  495.0
Socio-political:  58.0
Economic:  28.0
Racial:  142.0
Religion:  0.0
Culture-based:  2.0
None:  265.0
F-Statistic: 0.2947666906784956
P-value: 0.8810148059444238


In [None]:
# Box plot of the like counts, one box per context
ax = sns.boxplot(data=df, x='Context', y='Likes')
ax.set_xlabel('Context')
ax.set_ylabel('Likes')
ax.set_title("Context by Likes", fontsize=18, pad=20)
plt.show()

In [None]:
# Normality test
from scipy.stats import shapiro

# Shapiro-Wilk test on the likes of each context; a small p-value indicates
# that the likes of that context are unlikely to be normally distributed.
# A 'Health' group is deliberately absent: health_likes is not assigned by the
# preceding analysis cells, so referencing it here raised a NameError.
_, c1_pvalue = shapiro(socio_pol_likes)
_, c2_pvalue = shapiro(econ_likes)
_, c3_pvalue = shapiro(racial_likes)
_, c5_pvalue = shapiro(culture_likes)
_, c6_pvalue = shapiro(none_likes)

print("Socio-political Likes p-value:", c1_pvalue)
print("Economic Likes p-value:", c2_pvalue)
print("Racial Likes p-value:", c3_pvalue)
print("Culture-based Likes p-value:", c5_pvalue)
print("No Context Likes p-value:", c6_pvalue)


# Plot distributions: overlaid, semi-transparent histograms of the likes
sns.histplot(socio_pol_likes, color='blue', label='Socio-political', bins=20, alpha=0.5)
sns.histplot(econ_likes, color='orange', label='Economic', bins=20, alpha=0.5)
sns.histplot(racial_likes, color='green', label='Racial', bins=20, alpha=0.5)
sns.histplot(culture_likes, color='yellow', label='Culture', bins=20, alpha=0.5)
sns.histplot(none_likes, color='black', label='None', bins=20, alpha=0.5)

# The x-axis carries the like counts and the y-axis the number of tweets in
# each bin, so label them accordingly.
plt.xlabel('Likes')
plt.ylabel('Number of Tweets')
plt.title('Histograms of Likes by Context', fontsize=18, pad=20)
plt.legend()
plt.show()

In [None]:
from scipy.stats import kruskal

# Perform the Kruskal-Wallis test
# Rank-based alternative to one-way ANOVA, comparing the likes of the same
# five contexts used in the ANOVA above. health_likes is left out because it
# is not assigned by the preceding analysis cells (using it raised NameError).
h_statistic, p_value = kruskal(socio_pol_likes, econ_likes, racial_likes, culture_likes, none_likes)

# Print the results
print("H-Statistic:", h_statistic)
print("P-value:", p_value)