# Tweets analysis

This notebook is used to analyse tweets.

Note that it requires tweets that have already been downloaded (and pre-processed). This can be done using twitterDownloader.ipynb (and twitterFilterer.ipynb).

In [None]:
import codecs
import json
from collections import Counter

import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import TwitterProcessing

# nltk.download('vader_lexicon')
# nltk.download('stopwords')

from wordcloud import WordCloud
import string
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import warnings
import ssl

import pyLDAvis
from pyLDAvis import lda_model

from time import strptime
from datetime import datetime
from time import mktime

# Replace the ssl module's default HTTPS context factory with the unverified
# one, when this Python build exposes it.
# NOTE(review): this turns off certificate verification for everything that
# relies on the default HTTPS context for the rest of the kernel session.
# Presumably it is only here so that the nltk.download() calls below succeed
# on machines with broken certificate stores - confirm it is still needed.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # No unverified context factory available: keep the default behaviour.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [None]:
# Fetch the NLTK resources used further down: the VADER lexicon (needed by
# SentimentIntensityAnalyzer) and the stopword corpus (needed by
# nltk.corpus.stopwords.words('english')).
nltk.download('vader_lexicon')
nltk.download('stopwords')

In [None]:
# There are some warnings with LDA which we want to ignore.
# NOTE(review): the filter is installed without a category or module
# restriction, so it hides every warning raised after this cell, not only the
# LDA ones.
warnings.filterwarnings('ignore')

In [None]:
def process_tweet(text, tokenizer=None, stemmer=None, stopwords=None):
    """
    Perform tokenization, normalisation (lower case and stemming) and stopword and digit removal.

    Every distinct stem is returned at most once, in order of first appearance,
    so the result is deterministic between runs.

    @param text: tweet text (None or an empty string yields an empty list)
    @param tokenizer: tokenizer used; any object with a ``tokenize(text)`` method.
                      Defaults to a new nltk TweetTokenizer.
    @param stemmer: stemmer used; any object with a ``stem(token)`` method.
                    Defaults to a new nltk PorterStemmer.
    @param stopwords: iterable of (lower case) stopwords. Defaults to no stopwords.

    @returns: a list of processed tokens
    """
    if not text:
        return []

    # Build the defaults lazily instead of at function-definition time, so no
    # object (and no mutable list) is shared between calls.
    if tokenizer is None:
        tokenizer = nltk.tokenize.TweetTokenizer()
    if stemmer is None:
        stemmer = nltk.stem.PorterStemmer()

    # Set membership is O(1); reuse the caller's set when one was passed.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords or ())

    # A dict keeps insertion order, which gives de-duplication with a stable order.
    unique_stems = {}
    for token in tokenizer.tokenize(text.lower()):
        token = token.strip()
        if not token:
            continue

        # Check the surface form first: stopword lists hold unstemmed words, and
        # stemming can turn a stopword into a form that is no longer in the list.
        if token in stopword_set:
            continue

        stem = stemmer.stem(token)

        # Also check the stem, so that everything the stem-only check used to
        # remove is still removed; pure digit tokens are dropped as well.
        if stem in stopword_set or stem.isdigit():
            continue

        unique_stems[stem] = None

    return list(unique_stems)

In [None]:
def load_words(filename):
    """
    Reads a word list from a text file, one entry per line.

    Each line is stripped of surrounding whitespace before it is stored. The
    file is decoded as UTF-8 and undecodable bytes are ignored.

    :param filename: The name of the file to load the words from.
    :return: A set of the (stripped) lines found in the file.
    """
    with codecs.open(filename, 'r', encoding='utf-8', errors='ignore') as word_file:
        return {entry.strip() for entry in word_file}

In [None]:
def load_tweets(filename):
    """
    Loads the tweets from the file with the given name into an array of tweets.

    The file is expected to hold one JSON document per line. Lines that are
    empty or contain only whitespace are skipped.

    @param filename: The filename of the file to load the tweets from.

    @returns: An array of tweets (one decoded JSON value per non-blank line).

    @raises json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    tweets = []
    # Read as UTF-8 explicitly: the platform default encoding is not
    # guaranteed to handle non-ASCII tweet text.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # e.g. a trailing newline at the end of the dump
                continue
            tweets.append(json.loads(line))
    return tweets

In [None]:
def get_hashtags(tweet):
    """
    Extracts the associated hashtags of tweet.

    The hashtag label is read from the 'tag' key of each entry in
    tweet['entities']['hashtags']; when that key is missing the 'text' key
    (used by the older v1.1 tweet format) is tried instead. Entries with
    neither key are skipped.

    @param tweet: The tweet (a dict decoded from JSON) whose hashtags we wish to extract.

    @returns: list of hashtags (in lower case); empty when the tweet has none
    """
    # 'entities' / 'hashtags' may be absent or explicitly null; treat both as "no hashtags".
    entities = tweet.get('entities') or {}
    hashtags = entities.get('hashtags') or []

    labels = []
    for hashtag in hashtags:
        label = hashtag.get('tag') or hashtag.get('text')
        if label:
            labels.append(label.lower())
    return labels

In [None]:
# Name of the json file with the tweets to process
# (load_tweets reads it line by line, decoding one JSON document per line)
tweets_filename = '../data/coronation_2023_05_05_filtered.json'

# number of hashtags to display
# (used as the limit for hashtag_counter.most_common() further down)
hashtag_trash = 50

# number of most used words to display
# (used as the limit for word_counter.most_common() further down)
words_trash = 50

# Load the tweets and show how many were read
tweets = load_tweets(tweets_filename)
print(len(tweets))

In [None]:
# Get the total number of tweets.
print("Total number of tweets: ", len(tweets))

# Only tweets that actually carry a timestamp can take part in the time-span
# computation; a missing or empty 'created_at' would break the parsing below.
created_at_values = [tweet.get('created_at') for tweet in tweets]
created_at_values = [value for value in created_at_values if value]
if not created_at_values:
    raise ValueError("No tweet with a 'created_at' timestamp was loaded.")

# Format of the 'created_at' strings. They are fixed-width and zero-padded, so
# comparing the raw strings orders them chronologically.
created_at_format = "%Y-%m-%dT%H:%M:%S.000Z"

# NOTE: in this cell "first" is the most recent tweet and "last" is the oldest
# one, which is why the time difference below is first - last.
first_created_at = max(created_at_values)
print("First tweets date: ", first_created_at)

last_created_at = min(created_at_values)
print("Last tweets date: ", last_created_at)

# Parse straight into naive datetimes; no detour through local-time epoch
# seconds, so the difference does not depend on the machine's timezone.
first_datetime = datetime.strptime(first_created_at, created_at_format)
last_datetime = datetime.strptime(last_created_at, created_at_format)

timediff = first_datetime - last_datetime
print("Time difference: " + str(timediff))

elapsed_seconds = timediff.total_seconds()
if elapsed_seconds > 0:
    tweets_per_second = len(tweets) / elapsed_seconds
else:
    # All timestamps are identical (e.g. a single tweet): a rate cannot be
    # derived, so report 0 instead of failing with a ZeroDivisionError.
    tweets_per_second = 0.0
print("Tweets per second: " + str(tweets_per_second))

seconds_per_week = 604800
estimated_max_timeframe_tweets = int(tweets_per_second * seconds_per_week)
print("Tweet estimation for one week: " + str(estimated_max_timeframe_tweets))

seconds_per_day = 86400
estimated_tweets_per_day = int(tweets_per_second * seconds_per_day)
print("Tweet estimation for one day " + str(estimated_tweets_per_day))

## Keyword and hashtag analysis

In [None]:
# Tally every hashtag occurrence across all loaded tweets
hashtag_counter = Counter(
    hashtag
    for tweet in tweets
    for hashtag in get_hashtags(tweet)
)

# Show the most frequent hashtags, one "tag: count" pair per line
for tag, count in hashtag_counter.most_common(hashtag_trash):
    print(f"{tag}: {count}")

In [None]:
# Tokenizer and stemmer shared by the tweet-processing cells
tweet_tokenizer = nltk.tokenize.TweetTokenizer()
tweet_stemmer = nltk.stem.PorterStemmer()

# Counter that will hold the term frequencies
word_counter = Counter()

# Every character of string.punctuation is treated as a stopword
puncts = list(string.punctuation)

# Extra stopwords on top of the NLTK English list: quote/ellipsis symbols and
# a few twitter- and web-specific terms
twitter_stopwords = ['...', '…', '"', "'", '`', '‘', '“', '”',' ','re']
web_stopwords = ['uk','🇬','🇧','https','co']
all_stopwords = [
    *nltk.corpus.stopwords.words('english'),
    *twitter_stopwords,
    *web_stopwords,
    *puncts,
]

# Drop duplicates while keeping the first-seen order
stopwords = list(dict.fromkeys(all_stopwords))

In [None]:
# Start from fresh containers so that re-running this cell on its own does not
# add the same tweets to the counter a second time.
word_counter = Counter()
tweet_text_processed = []

for tweet in tweets:
    # A tweet without text (missing or null) is processed as an empty string
    tweet_text = tweet.get('text', '') or ''

    # Lower-case, tokenize, stem and filter stopwords
    tokens = process_tweet(text=tweet_text, tokenizer=tweet_tokenizer, stemmer=tweet_stemmer, stopwords=stopwords)
    tweet_text_processed.append(' '.join(tokens))

    # process_tweet returns each token once per tweet, so these counts are the
    # number of tweets a term occurs in.
    word_counter.update(tokens)

# Print out most common terms
for term, count in word_counter.most_common(words_trash):
    print(term + ': ' + str(count))

## Sentiment Analysis

In [None]:
def vader_sentiment_analysis(tweet_filename, tweet_processor, preprocess_words=True):
    """
    Use Vader lexicons instead of a raw positive and negative word count.

    The input file is read line by line, one JSON formatted tweet per line;
    blank lines are skipped. For every tweet the VADER compound score of its
    text is stored together with its 'created_at' value.

    @param tweet_filename: name of input file containing a json formatted tweet dump
    @param tweet_processor: object used to pre-process each tweet; its ``process(text)``
                            method must return an iterable of string tokens. Only used
                            when preprocess_words is True.
    @param preprocess_words: Whether the words should be preprocessed before analysis or not.

    @returns: list of tweets, in the format of [date, sentiment], where date is
              ``pd.to_datetime`` applied to the tweet's 'created_at' value and
              sentiment is the VADER compound score
    """
    # this is the vader sentiment analyser, part of nltk
    sent_analyser = SentimentIntensityAnalyzer()

    sentiments = []
    skipped_tweets = 0

    # Read as UTF-8 explicitly so that the result does not depend on the
    # platform's default encoding.
    with open(tweet_filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            # each line is loaded according to json format, into tweet, which is a dictionary
            tweet = json.loads(line)

            try:
                tweet_text = tweet.get('text', '') or ''
                tweet_date = tweet.get('created_at')

                # pre-process the tweet text if requested; the analyser always gets a single string
                if preprocess_words:
                    text_to_score = " ".join(tweet_processor.process(tweet_text))
                else:
                    text_to_score = tweet_text

                # this computes the sentiment scores (called polarity scores in nltk)
                sentiment_scores = sent_analyser.polarity_scores(text_to_score)

                # save the date and sentiment of each tweet (used for time series)
                sentiments.append([pd.to_datetime(tweet_date), sentiment_scores['compound']])

            except KeyError:
                # Best effort: a tweet that cannot be scored is left out, but it
                # is counted so that the loss is reported below.
                skipped_tweets += 1

    if skipped_tweets:
        print("Skipped %d tweet(s) that raised a KeyError during sentiment analysis." % skipped_tweets)

    return sentiments

In [None]:
# Create the tweet pre-processor from the project-local TwitterProcessing
# module, handing it the tokenizer and the stopword list configured above.
# It is the object passed to vader_sentiment_analysis(), which calls its
# process() method when word pre-processing is enabled.
tweet_processor = TwitterProcessing.TwitterProcessing(tweet_tokenizer, stopwords)

In [None]:
# Score every tweet of the input file; word pre-processing is switched off,
# so the analyser sees the raw tweet text
sentiments = vader_sentiment_analysis(
    tweet_filename=tweets_filename,
    tweet_processor=tweet_processor,
    preprocess_words=False,
)

In [None]:
# Hourly time series: index by tweet date, make sure the scores are numeric
# and add up the compound scores that fall into the same hour
series = (
    pd.DataFrame(sentiments, columns=['date', 'sentiment'])
    .set_index('date')
    .assign(sentiment=lambda frame: pd.to_numeric(frame['sentiment']))
    .resample('1h')
    .sum()
)

series.plot()
plt.title("Sentiment over time [per hour]")
plt.show()

In [None]:
# Same series at a finer resolution: make sure the scores are numeric, index
# by tweet date and add up the compound scores of every 15 minute window
series = (
    pd.DataFrame(sentiments, columns=['date', 'sentiment'])
    .assign(sentiment=lambda frame: pd.to_numeric(frame['sentiment']))
    .set_index('date')
    .resample('15min')
    .sum()
)

series.plot()
plt.title("Sentiment over time [per 15min]")
plt.show()

## Topic analysis

In [None]:
def display_topics(model, feature_names, words_to_print_per_topic):
    """
    Prints, for every topic of the model, its highest weighted words.

    Topics are numbered from 1. For each topic a "Topic N:" line is printed,
    followed by a line with the words in order of decreasing weight.

    @param model: lda model; its ``components_`` rows hold the word weights per topic.
    @param feature_names: list of strings, representing the list of features/words.
    @param words_to_print_per_topic: number of words to print per topic.
    """
    for topic_number, word_weights in enumerate(model.components_, start=1):
        # indices sorted by descending weight, cut down to the requested amount
        strongest_first = word_weights.argsort()[::-1]
        top_indices = strongest_first[:words_to_print_per_topic]
        top_words = [feature_names[index] for index in top_indices]

        print("Topic %d:" % topic_number)
        print(" ".join(top_words))

def display_word_crowd(model, feature_names):
    """
    Shows one word cloud per topic of the model.

    The topic rows of ``model.components_`` are normalised to sum to one and
    the resulting values are used as word frequencies for the cloud.

    @param model: The LDA model.
    @param feature_names: list of strings, representing the list of features/words.
    """
    topic_word_weights = model.components_

    # divide every row by its own total so that each topic sums to one
    row_totals = topic_word_weights.sum(axis=1)[:, np.newaxis]
    topic_word_probabilities = topic_word_weights / row_totals

    # one figure per topic, numbered from 1
    for topic_number, probabilities in enumerate(topic_word_probabilities, start=1):
        frequencies = {
            feature_names[index]: probability
            for index, probability in enumerate(probabilities)
        }

        cloud = WordCloud(background_color='black')
        cloud.fit_words(frequencies=frequencies)

        plt.title('Topic %d:' % topic_number)
        plt.imshow(cloud, interpolation='bilinear')
        plt.axis("off")
        plt.show(block=True)

In [None]:
# --- topic model settings ---
# number of topics to discover (default = 10)
number_of_topics = 5
# maximum number of words to display per topic (default = 10)
words_to_display_per_topic = 10
# number of features/words to describe our documents
number_of_features = 1500

# Count the terms of the processed tweets to get the document-term matrix
vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=number_of_features,
    stop_words=stopwords,
)
document_term_matrix = vectorizer.fit_transform(tweet_text_processed)

# The feature names are the words behind the columns of the matrix
feature_names = vectorizer.get_feature_names_out()

In [None]:
# train LDA model with the data
# The fixed random_state makes the fitted topics reproducible between runs
# (Restart Kernel -> Run All gives the same model).
model = LatentDirichletAllocation(
    n_components=number_of_topics,
    max_iter=10,
    learning_method='online',
    random_state=42,
).fit(document_term_matrix)

In [None]:
# Print the most common words per topic
# (words_to_display_per_topic words for each topic of the fitted model).
display_topics(model, feature_names, words_to_display_per_topic)

In [None]:
# Interactive topic overview with pyLDAvis (t-SNE is used for the topic layout)
panel = pyLDAvis.lda_model.prepare(
    model,
    document_term_matrix,
    vectorizer,
    mds='tsne',
)
pyLDAvis.display(panel)

In [None]:
# Display the wordclouds (one figure per topic of the fitted model)
display_word_crowd(model, feature_names)