In [None]:
''' LIBRARY IMPORT '''
from datasets import load_dataset
import pandas as pd
import nltk
import regex as re
import plotly.express as px
import seaborn as sb

In [None]:
''' DATASET IMPORT '''

## Placeholders - replace both values with your own before running this cell.
DATA_PATH = " YOUR DATA HERE , READ FROM FILE PATH"
TEXT_COLUMN = ' TEXT COLUMN '

## Read the file into a DataFrame. read_csv assumes the data is stored as CSV; swap in another
## pandas reader (read_json, read_excel, read_parquet, ...) if the file has a different format.
data = pd.read_csv(DATA_PATH)

## The texts as a plain Python list. Missing entries are dropped and the rest are coerced to str,
## because the counting loop further down calls .split() on every element.
data_list = data[TEXT_COLUMN].dropna().astype(str).to_list()

In [None]:
''' DATASET CLEANING '''
## Fetch the NLTK stopword corpus up front; both the stopword set and the stemmer below are built after it.
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

## The English stopwords as a set.
## Note: this rebinds the name `stopwords` from the imported corpus reader to the resulting set.
stopwords = set(stopwords.words('english'))

## The English snowball stemmer ☃️, created with ignore_stopwords=True
stemmer = SnowballStemmer('english',ignore_stopwords=True)

In [None]:
''' WORD FREQUENCIES COUNTING '''

## Dictionary mapping each stem to the number of times it occurs in the corpus
word_counter = {}

## Matches the leading run of non-digit characters of a token (so "covid19" -> "covid").
## Punctuation is NOT removed by this pattern. It is a raw string so that \D reaches the
## regex engine as a character class instead of being read as a string escape.
leading_non_digits = re.compile(r'\D*')

## Iterating through every tweet
for tweet in data_list:

    ## split() with no argument splits on any run of whitespace, so repeated spaces, tabs and
    ## newlines neither produce empty tokens nor glue two words together.
    for token in tweet.split():
        candidate = leading_non_digits.match(token).group()

        ## Tokens that start with a digit leave nothing to count
        if not candidate:
            continue

        ## Stemming the word with the snowball stemmer instantiated earlier
        stem = stemmer.stem(candidate)
        if not stem:
            continue

        ## Counting: start from 0 the first time a stem is seen, then add one for every occurrence
        word_counter[stem] = word_counter.get(stem, 0) + 1

In [None]:
''' EXAMINING THE CREATED DICTIONARY '''

## Every key of the counter is one distinct entry of the vocabulary,
## so the size of the dictionary is the number of unique words.
unique_words = len(word_counter)
print("The number of the unique words in the vocabulary is:", unique_words)

In [None]:
''' CREATING A FILTERED WORD COUNTER '''

## A word is kept only when all three conditions hold:
##   - it was counted more than 10 times (10 is an arbitrary threshold and can be changed),
##   - it is not a stopword,
##   - it is longer than two characters.
filtered_word_counter = {
    stem: count
    for stem, count in word_counter.items()
    if count > 10 and stem not in stopwords and len(stem) > 2
}

''' CREATING A FILTERED DATA FRAME'''
## The dictionary is first loaded as a single row labelled 'value' (one column per word) and then
## transposed, which gives one row per word. reset_index moves the words into a column named 'index'.
word_rows = pd.DataFrame(filtered_word_counter, index=['value']).transpose()

## Ordering the rows from the most to the least frequent word
filtered_words = word_rows.reset_index().sort_values(by='value', ascending=False)


''' CREATING A COLUMN FOR PERCENTAGE '''

## Total number of occurrences over all of the kept words
filtered_corpus_size = filtered_words['value'].sum()

## Share of the filtered corpus that each word accounts for.
## The result is a fraction of the total; it is not multiplied by 100.
filtered_words['percentage'] = filtered_words['value'].apply(lambda count: count / filtered_corpus_size)

## The kept words as a list, in the same greatest-to-least order as the data frame
words = filtered_words['index'].to_list()

In [None]:
''' READING SOME OF THE WORDS '''

## Display the words that passed the frequency, stopword and length filters.
## Dictionary keys come back in the order they were added to the dictionary,
## so this listing is NOT sorted by frequency.
filtered_word_counter.keys()

In [None]:
## Looking at the first ten rows using the pandas head method.
## filtered_words was sorted by 'value' in descending order when it was created,
## so these rows are the ten most frequent words.
filtered_words.head(10)

In [None]:
''' DOWNLOADING THE VADER LEXICON AND SENTIMENT ANALYZER '''

nltk.download('vader_lexicon')
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

''' ANALYZING FOR SENTIMENT '''

## One dictionary of scores per word, in the same order as `words`.
## Each dictionary is read below through its 'neg', 'neu', 'pos' and 'compound' keys.
word_scores = [sia.polarity_scores(word) for word in words]

## Splitting the scores into one list per key
neg = [scores['neg'] for scores in word_scores]
neu = [scores['neu'] for scores in word_scores]
pos = [scores['pos'] for scores in word_scores]
compound = [scores['compound'] for scores in word_scores]


''' EXPANDING THE DATAFRAME '''

## Each list becomes a new column of the data frame. The lists were built from `words`,
## which follows the row order of filtered_words, so the values line up row by row.
for column, values in (('neg', neg), ('neu', neu), ('pos', pos), ('compound', compound)):
    filtered_words[column] = values


''' SORTING THE DATAFRAME '''

## A copy of filtered_words ordered by compound score, highest first, so the most positive words lead.
compound_sorted = filtered_words.sort_values(by='compound', ascending=False)
compound_sorted.head()

In [None]:
''' AGGREGATING UPON SENTIMENT VALUES '''
## Column totals of each score over all of the filtered words
neg_sum, pos_sum, neu_sum, compound_sum = (
    filtered_words[column].sum() for column in ('neg', 'pos', 'neu', 'compound')
)

## For each of the three scores: the rows where that score is non-zero, the total frequency
## ('value') of those rows, and that total as a fraction of the filtered corpus size.
neg_query = filtered_words[filtered_words['neg'] != 0]
neg_value = neg_query['value'].sum()
neg_perc = neg_value / filtered_corpus_size

pos_query = filtered_words[filtered_words['pos'] != 0]
pos_value = pos_query['value'].sum()
pos_perc = pos_value / filtered_corpus_size

neu_query = filtered_words[filtered_words['neu'] != 0]
neu_value = neu_query['value'].sum()
neu_perc = neu_value / filtered_corpus_size

''' DATAFRAME CREATION '''
## Frequency, summed score and corpus share for every sentiment, one entry per figure
calculations = {
    'neg freq': neg_value,
    'neg sia': neg_sum,
    'neg perc': neg_perc,
    'neu freq': neu_value,
    'neu sia': neu_sum,
    'neu perc': neu_perc,
    'pos freq': pos_value,
    'pos sia': pos_sum,
    'pos perc': pos_perc,
}
sentiment_calculations = pd.DataFrame.from_dict(calculations, orient='index')

## Only the corpus shares, in the order neutral, positive, negative; the labels become the row index
percentage_by_sentiment = {'neu perc': neu_perc, 'pos perc': pos_perc, 'neg perc': neg_perc}
percentages = pd.DataFrame.from_dict(percentage_by_sentiment, orient='index')

In [None]:
''' VISUALIZATION '''
## Section banner only: this cell holds no logic. The charts are drawn in the cells that follow.

In [None]:
''' 1. WORD FREQUENCY DISTIRBUTION '''

## One bar per row of filtered_words: the word on the x axis, its count on the y axis
ax = sb.barplot(data=filtered_words, x='index', y='value')

## Title the chart and blank out the x tick labels, then hide the bottom tick marks as well
ax.set(title='Filtered Word Frequency Distribution', xticklabels=[])
ax.tick_params(bottom=False)

## The trailing semicolon keeps the notebook from printing the repr of the axes object
ax;

In [None]:
''' 2. WHAT ARE THE MOST POSITVE WORDS? '''
## Words whose compound score is exactly 0 are left out of the chart.
## compound_sorted is ordered by compound score, highest first.
scored_words = compound_sorted.query('compound != 0')

fig = px.bar(
    scored_words,
    x='index',
    y="compound",
    title='Highest Frequency COVID-19 Tweets: Compound Word Scores',
)
fig.show()

In [None]:
''' 3. WHAT IS THE PERCENTAGE OF EMOTION IN THE HIGHEST FREQUENCY WORDS? '''
## The labels are listed in the same order as the rows of `percentages` (neutral, positive, negative).
## NOTE(review): these labels are row labels of `percentages`, not column names - they appear to be
## passed to plotly as the stage names themselves; confirm they stay in sync with the row order.
stage_labels = ['neu perc','pos perc','neg perc']

fig = px.funnel(
    percentages,
    x=0,
    y=stage_labels,
    title='Highest Frequency COVID-19 Tweets: Percentage of Sentiment',
)
fig