# Twitter Sentiment Analysis

## Imports

In [1]:
import pandas as pd
import numpy as np
import re
import tweepy
from tweepy import OAuthHandler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the NLTK resources used in this notebook: the stop-word corpus,
# WordNet (for lemmatisation) and the VADER lexicon (for sentiment scoring).
for resource in ('stopwords', "wordnet", 'vader_lexicon'):
    nltk.download(resource)

# English stop words extended with Twitter-specific noise tokens.
extra_stopwords = ("rt", "url", "ul")
stopwords_set = set(stopwords.words("english"))
stopwords_set.update(extra_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avanoostveen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\avanoostveen\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\avanoostveen\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Main Class: Scrape, Clean, Tokenize and Lemmatize Tweets

In [2]:
import tweepy
import pandas as pd
from tweepy import OAuthHandler
class TwitterMain(object):
    """Fetch tweets through the Twitter search API and prepare them for analysis.

    An instance wraps an authenticated ``tweepy.API`` client (``self.api``).
    The text helpers (``remove_pattern``, ``remove_emoji``, ``remove_URL``,
    ``clean_tweets``, ``tokenize``, ``sentiment``, ``frequencies``) are static
    methods and are normally called on the class, e.g.
    ``TwitterMain.clean_tweets(df)``.
    """

    # Environment variables consulted when a credential is not passed explicitly,
    # in the same order as the ``__init__`` parameters.
    _CREDENTIAL_ENV_VARS = (
        "TWITTER_CONSUMER_KEY",
        "TWITTER_CONSUMER_SECRET",
        "TWITTER_ACCESS_TOKEN",
        "TWITTER_ACCESS_TOKEN_SECRET",
    )

    # Number of tweets requested per search call.
    _TWEETS_PER_QUERY = 100

    # Compiled once instead of on every call.
    _EMOJI_PATTERN = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)
    _URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

    # Noise stripped by clean_tweets: a ")(…: " fragment or a stand-alone "RT"
    # retweet marker. The previous second alternative (" *RT*") matched every
    # capital "R" and therefore damaged ordinary words.
    _NOISE_PATTERN = r"\)\(.[\w]*: |\bRT\b"

    # Lazily created VADER analyser shared by all sentiment() calls.
    _analyzer = None

    def __init__(self, consumer_key=None, consumer_secret=None,
                 access_token=None, access_token_secret=None):
        """Create the tweepy API client.

        Each credential may be passed explicitly; any that is omitted is read
        from the matching environment variable in ``_CREDENTIAL_ENV_VARS``.
        Credentials are deliberately not stored in the source.

        Raises:
            ValueError: if a credential is neither passed nor set in the
                environment.
        """
        import os  # local import: only needed to resolve credentials

        supplied = (consumer_key, consumer_secret, access_token, access_token_secret)
        credentials = [
            value or os.environ.get(env_name)
            for value, env_name in zip(supplied, self._CREDENTIAL_ENV_VARS)
        ]
        missing = [
            env_name
            for env_name, value in zip(self._CREDENTIAL_ENV_VARS, credentials)
            if not value
        ]
        if missing:
            raise ValueError(
                "Missing Twitter credentials; pass them to TwitterMain() or set: "
                + ", ".join(missing)
            )
        consumer_key, consumer_secret, access_token, access_token_secret = credentials

        # Always define the attribute so a failed setup is detectable later.
        self.api = None
        try:
            # OAuthHandler object
            auth = OAuthHandler(consumer_key, consumer_secret)
            # set access token and secret
            auth.set_access_token(access_token, access_token_secret)
            # create tweepy API object to fetch tweets
            self.api = tweepy.API(auth, wait_on_rate_limit=False, wait_on_rate_limit_notify=True)

        except tweepy.TweepError as e:
            print(f"Error: Twitter Authentication Failed - \n{str(e)}")

    # Function to fetch tweets
    def get_tweets(self, query, maxTweets = 1000):
        """Search Twitter for ``query`` and return the English tweets found.

        Results are paged backwards in time with ``max_id``. For a retweet the
        full text of the original tweet is stored, and a retweet is skipped
        when its text has already been collected. The lower-cased words of
        ``query`` are added to the module-level ``stopwords_set``.

        Args:
            query: search string passed to ``self.api.search``.
            maxTweets: maximum number of tweets to return.

        Returns:
            pandas.DataFrame with a single ``tweets`` column (present even
            when no tweet was found) and at most ``maxTweets`` rows.
        """
        # The search terms themselves would otherwise dominate word frequencies.
        stopwords_set.update(word.lower() for word in query.split())

        tweets = []
        seen_texts = set()   # texts already collected, for O(1) duplicate checks
        max_id = None        # id of the oldest tweet seen so far

        api = getattr(self, "api", None)
        if api is None:
            print("Tweepy error : API client is not initialised")
            return pd.DataFrame(tweets, columns=['tweets'])

        while len(tweets) < maxTweets:
            search_args = {
                "q": query,
                "count": self._TWEETS_PER_QUERY,
                "tweet_mode": 'extended',
            }
            if max_id is not None:
                # Only ask for tweets older than the ones already processed.
                search_args["max_id"] = str(max_id - 1)

            try:
                new_tweets = api.search(**search_args)
            except tweepy.TweepError as e:
                print("Tweepy error : " + str(e))
                break

            if not new_tweets:
                print("No more tweets found")
                break

            for tweet in new_tweets:
                if tweet.lang != 'en':
                    continue

                text = tweet.full_text
                retweeted = getattr(tweet, "retweeted_status", None)
                # A manually typed "RT @..." has no retweeted_status; keep it as is.
                if text.startswith("RT @") and retweeted is not None:
                    text = retweeted.full_text
                    # ensure that a retweeted text is appended only once
                    if text in seen_texts:
                        continue

                seen_texts.add(text)
                tweets.append({'tweets': text})

            # Advance the paging cursor once per page, regardless of language;
            # otherwise a page without English tweets is requested forever.
            max_id = new_tweets[-1].id

        return pd.DataFrame(tweets[:maxTweets], columns=['tweets'])

    @staticmethod
    def remove_pattern(text, pattern_regex):
        """Return ``text`` with every match of ``pattern_regex`` removed.

        The pattern is applied directly; matched text is never re-interpreted
        as a regular expression, so regex metacharacters in the text are safe.
        """
        return re.sub(pattern_regex, '', text)

    @staticmethod
    def remove_emoji(text):
        """Return ``text`` with characters in the emoji/symbol ranges removed."""
        return TwitterMain._EMOJI_PATTERN.sub(r'', text)

    @staticmethod
    def remove_URL(text):
        """Return ``text`` with every http(s)/www URL replaced by ``URL``."""
        return TwitterMain._URL_PATTERN.sub(r'URL', text)

    @staticmethod
    def clean_tweets(tweets_df, stop_words=None):
        """Strip noise tokens from the ``tweets`` column.

        Removes the noise pattern, emojis, URLs (via the ``URL`` placeholder,
        which the default stop words drop as ``url``), words containing ``@``
        and stop words. Stop words are compared case-insensitively.

        Args:
            tweets_df: frame with a ``tweets`` column; it is not modified.
            stop_words: optional collection of lower-case stop words; defaults
                to the module-level ``stopwords_set``.

        Returns:
            list of cleaned tweet strings, one per row, in row order.
        """
        if stop_words is None:
            stop_words = stopwords_set

        texts = tweets_df['tweets'].astype(str)
        # Series.map also works on an empty frame, unlike np.vectorize.
        texts = texts.map(
            lambda text: TwitterMain.remove_pattern(text, TwitterMain._NOISE_PATTERN)
        )
        texts = texts.map(TwitterMain.remove_emoji)
        texts = texts.map(TwitterMain.remove_URL)

        return [
            ' '.join(
                word for word in text.split()
                if "@" not in word and word.lower() not in stop_words
            )
            for text in texts
        ]

    @staticmethod
    def tokenize(tweets_df):
        """Lemmatize every whitespace-separated token of each tweet.

        Returns:
            pandas.Series (same index as ``tweets_df``) in which each tweet is
            the space-joined sequence of its lemmatized tokens.
        """
        word_lemmatizer = WordNetLemmatizer()
        return tweets_df['tweets'].apply(
            lambda text: ' '.join(word_lemmatizer.lemmatize(token) for token in text.split())
        )

    @staticmethod
    def sentiment(text):
        """Classify ``text`` with VADER.

        Returns:
            tuple ``(label, compound)`` where ``label`` is ``'Neutral'`` when
            the compound score is exactly 0, ``'Negative'`` when the negative
            share exceeds the positive share, and ``'Positive'`` otherwise.
        """
        # Building the analyser loads its lexicon, so do it once and reuse it.
        if TwitterMain._analyzer is None:
            TwitterMain._analyzer = SentimentIntensityAnalyzer()
        polarity_scores = TwitterMain._analyzer.polarity_scores(text)
        compound = polarity_scores['compound']

        if compound == 0:
            return 'Neutral', compound
        if polarity_scores['neg'] > polarity_scores['pos']:
            return 'Negative', compound
        return 'Positive', compound

    @staticmethod
    def frequencies(df):
        """Count lower-cased, whitespace-separated words over ``df['tweets']``.

        Returns:
            collections.Counter mapping each word to its number of occurrences.
        """
        return Counter(
            word.lower()
            for line in df['tweets']
            for word in line.split()
        )

# Dashboard

## Create App

In [9]:
###############################################----IMPORTS & SETUP---###############################################


import dash
import dash_table
import dash_core_components as dcc
import dash_html_components as html
import plotly.express as px
import pandas as pd

# External CSS stylesheet (hosted on CodePen) applied to the dashboard layout
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# Create the Dash application; the layout and the callback below are attached to it
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)


###################################################----METHODS---###################################################


# Scrapes Twitter and performs sentiment analysis
def fetch_Tweets(value):
    """Fetch tweets for a search query and derive the dashboard data sets.

    Args:
        value: search query; converted to ``str`` before searching.

    Returns:
        tuple ``(dash_tweets, df_wf, top_tweets_df)``:
          * ``dash_tweets`` - one row per tweet with the cleaned and lemmatized
            text (``tweets``), the original text (``Full Tweet``), the
            sentiment label (``Sentiment``) and the compound score
            (``Sentiment Score``).
          * ``df_wf`` - ``Word`` / ``Frequency`` table, most frequent first.
          * ``top_tweets_df`` - up to 25 highest-scoring (``Positive``) and
            25 lowest-scoring (``Negative``) original tweets.
        When no tweets are found, three empty frames with these columns are
        returned.
    """
    # Fetch tweets based on search query (value)
    dash_tweets = TwitterMain().get_tweets(str(value))

    # Nothing found: return empty, correctly shaped frames instead of failing
    # further down on a missing 'tweets' column.
    if dash_tweets.empty:
        return (
            pd.DataFrame(columns=['tweets', 'Full Tweet', 'Sentiment', 'Sentiment Score']),
            pd.DataFrame(columns=['Word', 'Frequency']),
            pd.DataFrame(columns=['Positive', 'Negative']),
        )

    # Store full Tweets in a separate column
    dash_tweets['Full Tweet'] = dash_tweets['tweets']

    # Make sure all Tweets are string types
    text = dash_tweets['tweets'].astype(str)

    # URLs and @usernames have to go BEFORE punctuation is stripped: once
    # "://" and "@" are gone they can no longer be recognised and would
    # survive as tokens such as "httpstco...".
    text = text.str.replace(r'https?://\S+|www\.\S+', ' ', regex=True)
    text = text.str.replace(r'@\w+', ' ', regex=True)

    # Keep letters and whitespace only. regex=True is explicit because the
    # pandas default for Series.str.replace changed to a literal match.
    dash_tweets['tweets'] = text.str.replace(r"[^a-zA-Z\s]", "", regex=True)

    # Clean Tweets (remove stopwords and remaining noise tokens)
    dash_tweets['tweets'] = TwitterMain.clean_tweets(dash_tweets)

    # Tokenize and Lemmatize Tweets
    dash_tweets['tweets'] = TwitterMain.tokenize(dash_tweets)

    # Sentiment label (pos-neg-neutral) and sentiment score (-1 to 1) per Tweet.
    # Whole columns are assigned at once; element-wise chained assignment
    # (df[col][i] = ...) is not guaranteed to write into the frame.
    sentiments = [TwitterMain.sentiment(tweet_text) for tweet_text in dash_tweets["tweets"]]
    dash_tweets["Sentiment"] = [label for label, _ in sentiments]
    dash_tweets["Sentiment Score"] = [score for _, score in sentiments]

    # Derive frequency of each word as a dataframe sorted by descending count
    wordfreq = TwitterMain.frequencies(dash_tweets)
    df_wf = pd.DataFrame(wordfreq.most_common(), columns=['Word', 'Frequency'])

    # Create separate dataframe containing top 25 most positive and most negative tweets
    top_tweets_df = pd.DataFrame()
    top_tweets_df["Positive"] = dash_tweets.sort_values(by = "Sentiment Score", ascending = False)["Full Tweet"][:25].reset_index(drop = True)
    top_tweets_df["Negative"] = dash_tweets.sort_values(by = "Sentiment Score")["Full Tweet"][:25].reset_index(drop=True)

    return dash_tweets, df_wf, top_tweets_df

# List of {'label', 'value'} dictionaries shown in the dashboard dropdown menu.
# 'label' is the text shown to the user; 'value' is the Twitter search query.
# NOTE: several commented-out entries below have no trailing comma; add one
# when re-enabling an entry that is not the last item of the list.
options_drop=[
#             {'label': 'KPMG', 'value': 'KPMG_NL'},
            {'label': 'ING', 'value': 'ING_BANK'},
            {'label': 'Shell', 'value': 'SHELL_Nederland'},
#             {'label': 'Aegon', 'value': 'AEGON'}
#             {'label': 'Akzo Nobel', 'value': '#AKZO'},
#             {'label': 'ASML', 'value': '#ASML'}
            {'label': 'Adyen', 'value': 'ADYEN'}
#             {'label': 'ASR', 'value': '#ASRNL'},
#             {'label': 'Heineken', 'value': '#HEIA'}
#             {'label': 'Ahold', 'value': '#Ahold'},
#             {'label': 'Philips', 'value': 'Philips'}
#             {'label': 'Nationale Nederlanden', 'value': '#NN'},
#             {'label': 'Unilever', 'value': 'Unilever'}
        ]

# Per-query caches, keyed by the dropdown value (the search query):
# analysed Tweets, word frequencies and the top 25 positive/negative Tweets.
tweets, tweets_wf, top_tweets = {}, {}, {}

# Populate the caches once, up front, for every company in the dropdown.
for entry in options_drop:
    query = entry["value"]
    tweets[query], tweets_wf[query], top_tweets[query] = fetch_Tweets(query)

    
#############################################----DASHBOARD LAYOUT---#############################################


# Dashboard page: title, introduction, company dropdown, pie chart, histogram
# and the table of top Tweets. The graphs and the table are filled by the
# callback that listens to the dropdown.
app.layout = html.Div(children=[
    # Title
    html.H1(children='Twitter Sentiment Analysis'),

    # Introduction text
    dcc.Markdown('''
        In this questionnaire, the usability of a tool will be put to the test.

        The functionalities of this tool are the following:
         - It scrapes Twitter for Tweets containing a specific search query.
         - It performs a Sentiment Analysis over these Tweets.
         - It returns a pie chart containing the number of positive, negative and neutral Tweets that were returned.
         - It returns a histogram containing the word frequency of the most common words.
         - It returns the top positive and negative Tweets per search query.

        The goal of this tool is to support auditors in assessing the risk profile around a company. Specifically, this
        tool is designed to help identify impairment triggers, based on the sentiment found in Tweets about a certain
        company.

        To test whether this tool could be useful for auditors in the process of identifying impairment triggers, it is
        asked of you, as an expert in this field, to assess the applicability of the information returned by this tool in
        the identification process of impairment triggers. This will be done through means of a questionnaire, that can help
        determine how well this tool aids in identifying impairment triggers. Each of the questions contains an aspect of
        the tool.

        In order to validate the usability of this tool with regards to certain functionalities it is asked of you to rate
        the following applications on a scale of 1 to 10, with 1 indicating not usable and 10 indicating extremely useful.
        Please repeat this process for all companies in the dropdown list.

        Furthermore, for each of the applications that you are asked to rate, it is asked of you to leave a comment,
        indicating what you think this tool is missing and what aspect of this tool is most useful.
    '''),

    # Subtitle for dropdown menu
    html.H2(children='Select a company:'),

    # Dropdown menu; not clearable, so the callback always receives a company
    dcc.Dropdown(
        id='fig_dropdown',
        options=options_drop,
        value = "ING_BANK",
        clearable=False
    ),

    # Subtitle for pie chart
    html.H2(children='Pie chart: positive, negative and neutral Tweets'),

    # Pie chart
    dcc.Graph(id="pie-chart"),

    # Subtitle for histogram
    html.H2(children='Histogram of most common words'),

    # Histogram
    dcc.Graph(id='histogram'),

    # Subtitle for "Top Tweets"
    html.H2(children='Most positive and most negative Tweets'),

    # Data table containing top 25 most positive and most negative Tweets
    dash_table.DataTable(
        id='table',
        style_cell=
            {
                'textAlign': 'center',
                'whiteSpace': 'normal',
                'height': 'auto',
            },
        style_data_conditional=[
            {
                'if': {'row_index': 'odd'},
                'backgroundColor': 'rgb(248, 248, 248)'
            }
        ],
        style_header={
            'backgroundColor': 'rgb(230, 230, 230)',
            'fontWeight': 'bold'
        },
        data = []
    )

],
        # Styling of dashboard
        style={'marginLeft': 25, 'marginRight': 25, 'marginTop': 25, 'marginBottom': 25,
           'backgroundColor':'#F7FBFE',
           # a CSS border needs a style ("solid"); without one nothing is drawn
           'border': 'thin lightgrey solid',
           'padding': '10px 5px 10px 5px'
})


#########################################----DASHBOARD INPUT & OUTPUT---#########################################


@app.callback([
    # Returns a pie chart using user input from the dashboard
    dash.dependencies.Output(component_id = 'pie-chart', component_property = 'figure'),
    
    # Returns a histogram using user input from the dashboard
    dash.dependencies.Output(component_id = 'histogram', component_property = 'figure'),
    
    # Returns data for the data table using user input from the dashboard
    dash.dependencies.Output(component_id = 'table', component_property = 'data'),
    
    # Returns columns for the data table using user input from the dashboard
    dash.dependencies.Output(component_id = 'table', component_property = 'columns')],
    
    # Fetches user input from the dropdown component from the dashboard to use in the function below
    dash.dependencies.Input(component_id = 'fig_dropdown',  component_property = 'value'))

# Uses the user input from the dashboard and returns the output as described above
def update_dashboard(value):
    """Rebuild the dashboard outputs for the company selected in the dropdown.

    Args:
        value: dropdown value; a key of ``tweets``, ``tweets_wf`` and
            ``top_tweets``.

    Returns:
        tuple ``(pie, hist, data, columns)``: the sentiment pie chart, the
        word-frequency histogram, and the records and column definitions for
        the top-Tweets table.

    Raises:
        dash.exceptions.PreventUpdate: when ``value`` is not a known company
            (for example a cleared dropdown), so the current outputs stay as
            they are.
    """
    if value not in tweets:
        raise dash.exceptions.PreventUpdate

    # One row per sentiment label with its count. Counts and labels come from
    # the same table so that every slice carries the correct label;
    # value_counts() is ordered by frequency while unique() is ordered by
    # first appearance, so combining the two can mislabel slices.
    sentiment_counts = (
        tweets[value]["Sentiment"]
        .value_counts()
        .rename_axis("Sentiment")
        .reset_index(name="Count")
    )

    # Creates pie chart based on user input
    pie = px.pie(sentiment_counts, values = "Count",
                        names = "Sentiment",
                        color = "Sentiment",
                        color_discrete_map={'Positive':'royalblue',
                                            'Negative':'#ff0000',
                                            'Neutral':'#bbc4cc'})

    # Creates histogram based on user input.
    # NOTE(review): the slice starts at 1, so the single most frequent word is
    # left out - presumably to hide the search term itself; confirm.
    hist = px.histogram(tweets_wf[value][1:26], x="Word", y = "Frequency", title = "Word Frequency", nbins = 20)

    # Creates a list of records for the data table based on user input
    data = top_tweets[value].to_dict("records")

    # Creates a list of columns based on user input
    columns = [{"name": i, "id": i} for i in top_tweets[value].columns]

    # Return the pie chart, histogram, data and columns
    return pie, hist, data, columns


#########################################----RUN APPLICATION---#########################################


# Start the Dash server (debug mode off) only when this code runs as the main
# module, which is the case when the cell is executed in the notebook kernel.
if __name__ == '__main__':
    app.run_server(debug=False)

No more tweets found
No more tweets found
No more tweets found
Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
127.0.0.1 - - [28/Jan/2021 10:47:31] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Jan/2021 10:47:33] "[37mGET /_dash-layout HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Jan/2021 10:47:33] "[37mGET /_dash-dependencies HTTP/1.1[0m" 200 -
127.0.0.1 - - [28/Jan/2021 10:47:33] "[37mPOST /_dash-update-component HTTP/1.1[0m" 200 -
