## Appendix

In [None]:
"""
The two functions below were adapted from
https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/main/Recent-Search/recent_search.py:

1. bearer_oauth(r)
2. connect_to_endpoint(url, params)

All other code was written by Yi-ching, Hung

Last updated: 2022-Jan-24th

"""

#collect, process, store data
import requests
import contractions
import pandas as pd
import re
import datetime
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared VADER analyzer; to_sent_df() reads the 'compound' value of its scores.
sia = SentimentIntensityAnalyzer()

# Placeholder value: replace with a real Twitter API bearer token before running.
bearer_token = '(bearer token)'
# Twitter API v2 recent-search endpoint queried by get().
search_url = "https://api.twitter.com/2/tweets/search/recent"

# The target brands: get() sends one search per entry and the names are used
# as dictionary keys throughout the pipeline.
brand_ls = ["Moderna","AstraZeneca","Pfizer"]


#get data from Twitter API:
def bearer_oauth(r):
    """
    Attach the bearer-token authentication headers to an outgoing request.

    Used as the ``auth`` callable of ``requests.get``; sets the
    Authorization and User-Agent headers on ``r`` and returns ``r``.
    """
    auth_headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2RecentSearchPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r

def connect_to_endpoint(url, params, timeout=30):
    """
    Send an authenticated GET request and return the decoded JSON body.

    url: endpoint to query
    params: dict of query-string parameters
    timeout: seconds to wait for the server; requests applies no timeout by
             default, so without it a stalled connection would block forever

    Prints the HTTP status code. Raises Exception(status_code, text) for any
    non-200 response; connection errors and timeouts surface as the
    exceptions raised by requests itself.
    """
    response = requests.get(url, auth=bearer_oauth, params=params, timeout=timeout)
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)
    return response.json()



def get(start, end):
    """
    Fetch recent English, non-retweet tweets mentioning each brand.

    start/end: start/end time, example format: '2021-12-30T00:00:00Z'

    Returns a dict keyed by the brands in brand_ls; each value is a list
    holding the JSON response of that brand's search request.
    """
    # one (initially empty) response list per brand
    responses_by_brand = {name: [] for name in brand_ls}

    for name in brand_ls:
        lowered = name.lower()
        # match the brand as plain word or hashtag, in original or lower case
        search_terms = f'({name} OR #{name} OR #{lowered} OR {lowered}) lang:en -is:retweet'
        request_params = {
            'query': search_terms,
            'max_results': '100',
            'tweet.fields': 'created_at',
            'start_time': f'{start}',
            'end_time': f'{end}',
        }
        responses_by_brand[name].append(
            connect_to_endpoint(search_url, request_params)
        )

    return responses_by_brand


#data preprocessing
def data_prepross(j_d): #j_d = json dict gotten from get()
    """
    Clean, de-duplicate and filter the tweets returned by get().

    j_d: {brand: [json_response, ...]} as produced by get(). A response may
         lack the 'data' key (nothing matched the search); it then simply
         contributes no tweets.

    Returns {brand: [cleaned_texts, raw_texts, created_at, tweet_ids]}, four
    parallel lists per brand. A tweet is dropped when its cleaned text was
    already seen earlier (in any brand) or contains two words or fewer.
    Progress counts are printed for every brand.
    """
    # Every configured brand appears in the result; brands found only in j_d
    # are accepted as well instead of raising KeyError.
    brands = list(dict.fromkeys([*brand_ls, *j_d]))

    # per brand: [0] cleaned text, [1] raw text, [2] created_at, [3] tweet id
    text_d = {brand: [[], [], [], []] for brand in brands}
    for brand, responses in j_d.items():
        for response in responses:
            # .get(): the 'data' key is absent when the search matched nothing
            for tweet in response.get('data', []):
                text_d[brand][0].append(_clean_tweet_text(tweet['text']))
                text_d[brand][1].append(tweet['text'])
                text_d[brand][2].append(tweet['created_at'])
                text_d[brand][3].append(tweet['id'])

    # blank out duplicates; the length filter below then removes them
    seen = set()
    for brand in brands:
        cleaned = text_d[brand][0]
        print('total number of tweet for ',brand, ': ', len(cleaned),'\n')
        for i, line in enumerate(cleaned):
            if line in seen:
                cleaned[i] = ''
            else:
                seen.add(line)

    # keep only tweets with more than two words left after cleaning
    res_d = {brand: [[], [], [], []] for brand in brands}
    for brand in brands:
        dropped = 0
        for row in zip(*text_d[brand]):
            if len(row[0].split()) > 2:
                for column, value in zip(res_d[brand], row):
                    column.append(value)
            else:
                dropped += 1

        print('No enough content or duplicates: ', brand,' ', dropped)
        print('Total tweets left: ', len(res_d[brand][0]),'\n')

    return res_d


def _clean_tweet_text(text):
    """Strip https URLs, @mentions, #hashtags and '&amp;', then expand contractions word by word."""
    stripped = re.sub(r'https://\S+|@\S+|#\S+|&amp;', '', text)
    # split() treats newlines like any other whitespace, so the result is a
    # single line of space-separated words
    return ' '.join(contractions.fix(word) for word in stripped.split())
     


#sentiment analysis:
def to_sent_df(dic, date, sentiment_df):
    """
    Score new tweets with VADER and append them to the sentiment CSV.

    dic: output of data_prepross(),
         {brand: [cleaned_texts, raw_texts, created_at, tweet_ids]}
    date: value written to the 'date' column of every new row
    sentiment_df: path of the CSV file that is read and then overwritten

    A tweet is skipped when its lower-cased cleaned text is already stored
    for the same brand (in the file or earlier in this call). Labels follow
    the usual VADER cut-offs: compound >= 0.05 positive, <= -0.05 negative,
    otherwise neutral.

    Returns None after saving. If the file does not exist and the user
    declines to create it, nothing is saved and an explanatory string is
    returned.
    """
    columns = ['brand','date','senti','score','text',
               'raw_text','uid','created_datetime']

    try:
        df = pd.read_csv(sentiment_df) #sentiment analysis df
    except FileNotFoundError:
        answer = input('File not found, create a new one in current directory? Y or N')
        if answer.lower() != 'y':
            print('A file has to be created before running this function')
            return 'A file has to be created before running this function'
        df = pd.DataFrame(columns=columns)

    new_rows = []
    for brand in dic.keys():
        texts, raws, created, uids = dic[brand][:4]

        # Texts already stored for this brand; built once per brand instead of
        # rescanning the DataFrame for every tweet.
        known_texts = set(df.loc[df['brand'] == brand, 'text'])
        count_old = 0
        count_new = 0

        for s, raw, created_datetime, uid in zip(texts, raws, created, uids):
            lowered = s.lower()
            if lowered in known_texts:
                count_old += 1
                continue

            # remember it so a repeat later in this batch is skipped too
            known_texts.add(lowered)
            count_new += 1

            num_sum = sia.polarity_scores(s)['compound']
            # inclusive bounds: a score of exactly +/-0.05 must get a label
            if num_sum >= 0.05:
                senti = 'positive'
            elif num_sum <= -0.05:
                senti = 'negative'
            else:
                senti = 'neutral'

            # 'uid' holds the tweet id taken from the API response
            new_rows.append({'brand':brand,'date':date,'senti':senti,'score':num_sum,
                             'text':lowered,'raw_text':raw,'uid':uid,
                             'created_datetime':created_datetime})

        print(brand,'duplicated in senti data: ', count_old, ' new: ', count_new, '\n\n')

    if new_rows:
        # DataFrame.append no longer exists in pandas 2.x; add all rows at once
        new_df = pd.DataFrame(new_rows, columns=columns)
        df = new_df if df.empty else pd.concat([df, new_df], ignore_index=True)

    print('df_senti tail:\n', df.tail())
    df.to_csv(sentiment_df, index=False) # save to sentiment analysis df with scores
    print(f'\ndata saved to dataframe {sentiment_df}')
    


def main():
    """
    Run the whole pipeline once: fetch, preprocess, score and store tweets.

    The time window and the name of the sentiment file are configured in
    the marked section below.
    """

    ##### specify start/end time, and the dataframe name here: ####

    window_start = datetime.datetime.utcnow() - datetime.timedelta(hours=24)
    window_end = window_start + datetime.timedelta(hours=23)
    sentiment_df = 'sentiment_df'

    #############################

    # timestamps in the form '2021-12-30T00:00:00.000Z'
    start, end = (
        moment.isoformat(timespec='seconds') + '.000Z'
        for moment in (window_start, window_end)
    )
    print('StartTime: ',start)
    print('EndTime: ',end)

    # the date part of the start timestamp labels every stored row
    to_sent_df(data_prepross(get(start, end)), start.split('T')[0], sentiment_df)
    

    
 


In [None]:
# Run the pipeline once with the time window configured inside main().
main()

In [None]:
#visualisation

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.probability import FreqDist
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import nltk
import re

class Sentiment_df_viz():
    
    """

    Visualisation of sentiment_df

    """
    
    def __init__(self,df, startdate = None, enddate= None):
        """
        startdate: specify a date to get data on and after that date
        
        date example: '2022-01-04'
        
        """
        self.df =pd.read_csv(df)
        
        if (startdate != None) and (enddate != None):
            self.df = self.df[(self.df['date'] >= startdate) & (self.df['date'] <= enddate) ]
        elif startdate != None:
            self.df = self.df[self.df['date'] >= startdate]
        elif enddate != None:
            self.df = self.df[self.df['date'] <= enddate]
    
    def df_sum(self):
        """
        get stats for each brand
        
        """
        
        df1 = self.df

        #count numbers of senti and change colnams to 'number'
        df_count = df1.groupby(['brand','date','senti']).count().reset_index()
        df_count = df_count.rename(columns={'score':'number'})

        #group by again to get daily percentage 
        df_sum = df_count.groupby(['brand','date','senti']).sum()

        #get percentage
        df_sum['percentage']= df_sum.groupby(level=(0,1))['number'].apply(lambda x: x*100/x.sum())
        #print('\n\n Daily percentage:\n ',df_sum.tail(3))

        #reset to sort values
        df_sum = df_sum.reset_index()

        results =df_sum.groupby(['brand','date']).sum().reset_index()

        return df_sum        
    
    def print_stats(self):
        """
        Print, per brand, the summed 'text' counts for every date, followed
        by the overall number of tweets.

        """
        summary = self.df_sum()

        # collapse the sentiment rows so there is one row per brand and date
        per_day = summary.groupby(['brand','date']).sum().reset_index()

        print('\n\n Sum for brand and date:\n ')
        for label in ('AstraZeneca', 'Pfizer', 'Moderna'):
            brand_rows = per_day[per_day['brand'] == label]
            print(f'{label}: \n', brand_rows[['date','text']])

        print('\n\n Total tweets ', summary.number.sum())

    
    def line_plots(self):
        """
        Draw one line chart per brand with the daily sentiment percentages.

        The data comes from self.df_sum(); one subplot is created for each of
        AstraZeneca, Moderna and Pfizer, with the y axis fixed to 0-100.
        """
        df = self.df_sum()
        brandls = ['AstraZeneca','Moderna','Pfizer']
        # Explicit mapping: with a positional colour list a sentiment would
        # change colour whenever a brand lacks one of the three sentiments.
        senti_colors = {'negative': 'r', 'neutral': 'b', 'positive': 'g'}

        fig, axes = plt.subplots(len(brandls), 1, figsize=(15,18), dpi=100)
        fig.tight_layout(pad=12.0)

        for ax, brand in zip(axes, brandls):
            brand_df = df[df['brand'] == brand]
            ax.set_title(f'{brand}: Sentiment Change over Time', fontsize=20)
            sns.lineplot(ax=ax, x=brand_df.date, y=brand_df.percentage,
                         hue=brand_df.senti, palette=senti_colors)
            ax.set_ylabel('Percentage', fontsize=20)
            ax.set_xlabel('Date', fontsize=20)
            ax.set(ylim=(0,100))
            ax.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0., fontsize=20)
            ax.tick_params(axis = 'both', which = 'major', labelsize = 12, rotation=45)


    def brand_senti_percentage(self):
        """
        Bar chart of the share of each sentiment within every brand.

        The percentage is based on the number of non-null 'date' values per
        (brand, senti) relative to the brand's total.
        """
        counts = self.df.groupby(['brand','senti'])['date'].count()

        # transform() keeps the (brand, senti) index, so the division aligns
        # and reset_index() works on every pandas version
        brand_total = counts.groupby(level='brand').transform('sum')
        sen = (counts * 100 / brand_total).rename('percentage').reset_index()

        fig, ax=plt.subplots(figsize=(8,6))
        sns.barplot(x=sen.brand,y=sen.percentage, hue=sen.senti,ax=ax,
                    palette={'negative':'#E60965','positive':'#77D970',
                             'neutral':'#84DFFF'})
        ax.tick_params(labelsize=12)
        # the x axis lists the brands; sentiment is encoded by the bar colour
        ax.set_xlabel('Brand',fontsize=15)
        ax.set_ylabel('Percentage',fontsize=15)
        ax.legend(bbox_to_anchor=(1.01,1),fontsize=12)
        ax.set_title('Percentage of Sentiment Count for each Brand', fontsize=20)



    def brand_count(self):
        """
        Bar chart of the number of tweets (non-null 'text' values) per brand.
        """
        per_brand = self.df.groupby('brand').count()
        text_counts = per_brand.text
        brand_colors = {'AstraZeneca':'#77ACF1','Moderna':'#F29191',
                        'Pfizer':'#F6D860'}

        # brands on the x axis, their tweet counts on the y axis
        chart = sns.barplot(x=text_counts.index, y=text_counts.values,
                            data=per_brand, palette=brand_colors)
        chart.set_title('Total Tweets Number for each Brand', fontsize=20)
        chart.set_xlabel('Brands', fontsize=15)
        chart.set_ylabel('Count', fontsize=15)



    def each_brand_count_side_by_side(self):
        """
        Grouped bar chart of each brand's share of the tweets per date.

        The share is the brand's non-null 'senti' count divided by the total
        for that date, in percent.
        """
        #group by date and brand, count number of tweets
        grouped = self.df.groupby(['date','brand']).count()

        # transform() keeps the (date, brand) index so the new column aligns
        # on every pandas version
        daily_total = grouped.groupby(level='date')['senti'].transform('sum')
        grouped['percentage'] = grouped['senti'] * 100 / daily_total
        sent_df = grouped.reset_index()

        fig, ax = plt.subplots(1,1,figsize=(12,8))

        sns.barplot(x=sent_df.date, y=sent_df.percentage, hue=sent_df.brand, ax=ax,
                    palette={'AstraZeneca':'#77ACF1','Moderna':'#F29191',
                             'Pfizer':'#F6D860'})
        ax.tick_params(axis='x', rotation=45)
        ax.set_ylabel('Percentage', fontsize=15)
        ax.set_xlabel('Date', fontsize=15)
        ax.legend(bbox_to_anchor=(1.005,1), fontsize=12)
        ax.set_title('Percentage of Total Tweet Count for each Brand', fontsize=20)


    def ngram_bar(self, brand,num, senti=None, date=None, words_to_remove=None): #strings
        """
        Print the most common words, bigrams and trigrams of a brand's tweets
        and show the bigram/trigram frequencies as two bar charts.

        brand: brand whose tweets are analysed
        num: how many of the most common items to report
        senti: optional sentiment to restrict to
        date: optional date to restrict to
        words_to_remove: optional iterable of (lower-case) words to exclude

        Does nothing when no tweet matches the filters.
        """
        sent_df = self.df
        # a set gives constant-time stop-word lookups
        rm = set(words_to_remove) if words_to_remove is not None else set()

        # Build the filter step by step so every combination of date/senti
        # works (filtering on senti alone must not compare 'date' with None).
        mask = (sent_df['brand'] == f'{brand}') & sent_df['text'].notnull()
        if date is not None:
            mask &= sent_df['date'] == date
        if senti is not None:
            mask &= sent_df['senti'] == senti
        tweet_N = sent_df[mask].text.values

        if len(tweet_N) == 0:
            return
        print(len(tweet_N))

        t_ls = []
        for word in ' '.join(map(str, tweet_N)).split():
            # keep letters and digits only ('A-Z', not 'A-z', which would
            # also let through [ \ ] ^ _ and the backtick)
            word = re.sub(r'[^a-zA-Z0-9]', '', word).lower()
            if word and word not in rm:
                t_ls.append(word)

        tup = FreqDist(t_ls).most_common(num)
        print(tup)

        tup2 = nltk.FreqDist(nltk.bigrams(t_ls)).most_common(int(num))
        tup3 = nltk.FreqDist(nltk.trigrams(t_ls)).most_common(int(num))
        print('\nbigrams',tup2)
        print('\ntrigrams',tup3)

        # n-gram labels and their counts for the two charts
        x2 = [str(gram) for gram, _ in tup2]
        y2 = [count for _, count in tup2]
        x3 = [str(gram) for gram, _ in tup3]
        y3 = [count for _, count in tup3]

        thedate = f'on {date}' if date is not None else ''

        fig,axes = plt.subplots(2,1,figsize = (15,15))
        sns.barplot(ax=axes[0],x=y2,y=x2, palette="colorblind")
        axes[0].set_title(f'Top {num} Bigrams for Tweets for Brand {brand} {thedate}', fontsize=20)
        axes[0].tick_params(axis = 'both', which = 'major', labelsize = 15)

        sns.barplot(ax=axes[1],x=y3,y=x3, palette="colorblind")
        axes[1].set_title(f'Top {num} Trigrams for Tweets for Brand {brand} {thedate}', fontsize=20)
        axes[1].tick_params(axis = 'both', which = 'major', labelsize = 15)
        plt.show()
            
    def wordcloud(self, brand, words_to_remove=None):
        """
        Show one word cloud per sentiment for the tweets of a brand.

        brand: "Moderna", "AstraZeneca","Pfizer"
        words_to_remove: optional iterable of words to leave out of the clouds

        A sentiment without any tweets is skipped; nothing is drawn when the
        brand has no tweets at all.
        """
        sent_df = self.df
        stopwordset = set(words_to_remove) if words_to_remove is not None else set()

        # colormap used for each sentiment's cloud
        colormaps = {'positive': 'summer', 'negative': 'gist_heat', 'neutral': 'winter'}

        # Keep the sentiment next to its cloud so titles stay correct even
        # when a sentiment has no tweets and therefore no cloud.
        clouds = []
        for sentiment, colormap in colormaps.items():
            tweet_N=sent_df[(sent_df['brand'] == f'{brand}')&(sent_df['text'].notnull()) &
                            (sent_df['senti']==sentiment)].text.values
            if len(tweet_N) == 0:
                continue

            w = WordCloud(width = 800, height =800,
                          background_color ='white',
                          colormap=colormap,
                          stopwords=stopwordset,
                          margin=20,
                          min_font_size = 10,
                          max_words=150).generate(' '.join(map(str, tweet_N)))
            clouds.append((sentiment, w))

        if not clouds:
            return

        # squeeze=False: always get a 2-D array of axes, even for a single cloud
        fig, axes = plt.subplots(nrows=len(clouds), figsize=(24,24), facecolor = None,
                                 squeeze=False)

        for ax, (sentiment, cloud) in zip(axes[:, 0], clouds):
            ax.imshow(cloud)
            ax.axis("off")
            ax.set_title(f'{brand}: {sentiment} tweets', fontsize=30)
        plt.tight_layout(pad = 5)



In [None]:
# English stop words from NLTK, used as the words to leave out of the clouds.
rm = nltk.corpus.stopwords.words("english") 
# NOTE(review): `viz` is not defined in this section — presumably a
# Sentiment_df_viz instance created in another cell; confirm before running.
viz.wordcloud('Moderna', words_to_remove=rm)