In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd
from textblob import TextBlob
from wordcloud import WordCloud
import re
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors

import sys
import os
import nltk

from nltk.corpus import stopwords
import string
from PIL import Image
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [None]:
# Show up to 1000 characters per cell so long tweets are not truncated in output.
pd.options.display.max_colwidth = 1000

**EXTRACTING TWEETS**

In [None]:
# Collect the raw text of the top English-language tweets matching the query.
MAX_TWEETS = 1000  # upper bound on the number of tweets stored

attributes_container = []
search_query = "Zelensky since:2023-03-25 until:2023-03-27 lang:en"
mode_param = sntwitter.TwitterSearchScraperMode.TOP

# Stream results from the scraper and keep one single-element row per tweet.
scraper = sntwitter.TwitterSearchScraper(search_query, mode=mode_param)
for i, tweet in enumerate(scraper.get_items()):
    # `i` is zero-based: stopping at i == MAX_TWEETS stores exactly MAX_TWEETS rows
    # (the previous `i > 1000` check let a 1001st tweet through).
    if i >= MAX_TWEETS:
        break
    attributes_container.append([tweet.rawContent])

df = pd.DataFrame(attributes_container, columns=["Tweet"])

In [None]:
# Preview the first rows as a rich table. Passing `df.to_string()` to display()
# rendered the whole frame as one quoted string with escaped newlines.
display(df.head(20))

**CLEANING TEXT FOR ANALYSIS**


In [None]:
def cleantext(text):
    """Normalise a raw tweet for text analysis.

    Removes URLs, the retweet marker ``RT``, hashtags, and @-mentions, then
    drops every character that is not an ASCII letter, digit, space or tab,
    and finally lower-cases the result. Whitespace left behind by removed
    pieces is not collapsed.

    Args:
        text: the raw tweet text.

    Returns:
        The cleaned, lower-cased string.
    """
    text = re.sub(r'https?:\/\/\S+', '', text)    # links
    # \b keeps the match to a standalone "RT"; without it "SMART people"
    # lost its "RT " and became "SMApeople".
    text = re.sub(r'\bRT\s+', '', text)           # retweet marker
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)    # whole hashtags, not just '#'
    # Underscore is allowed in the pattern so "@user_name" is removed in full
    # instead of leaving "name" behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)    # mentions
    text = re.sub(r'[^0-9A-Za-z \t]+', '', text)  # anything not alphanumeric/space/tab
    return text.lower()

# Compiled once at definition time instead of on every call.
# NOTE(review): the U+24C2-U+1F251 range is kept from the original; it also
# spans non-emoji blocks (e.g. CJK ideographs at U+4E00-U+9FFF), so such text
# is stripped as well - confirm that this is acceptable for the data.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # other miscellaneous symbols
    "\U000024C2-\U0001F251"
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs (previously missed)
    "\U0001FA00-\U0001FAFF"  # chess symbols / pictographs extended-A (previously missed)
    "\u200D"                 # zero-width joiner used inside emoji sequences
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Return `text` with every character matched by the emoji pattern removed.

    Args:
        text: input string.

    Returns:
        The string with all matching runs deleted; other characters are
        left untouched and no whitespace is collapsed.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
# Run the regex clean-up first, then the emoji filter, on every tweet.
df['Tweet'] = df['Tweet'].apply(cleantext).apply(remove_emojis)

In [None]:
#display(df.to_string())

**SENTIMENT ANALYSIS**

In [None]:
#using VADER
_VADER_ANALYZER = None  # shared analyzer, created on first use


def getSentiment(text, analyzer=None, neutral_band=0.0):
    """Classify `text` as 'Negative', 'Neutral' or 'Positive' from its compound score.

    Args:
        text: the text to score.
        analyzer: optional object exposing ``polarity_scores(text)`` that returns
            a mapping with a ``'compound'`` entry. When omitted, one shared
            ``SentimentIntensityAnalyzer`` is created on first use and reused,
            instead of building a new analyzer for every call.
        neutral_band: half-width of the interval around zero treated as neutral.
            The default of 0.0 keeps the original rule (only a score of exactly
            zero is neutral). The VADER documentation suggests 0.05.

    Returns:
        'Negative' if the compound score is below ``-neutral_band``,
        'Positive' if it is above ``neutral_band``, otherwise 'Neutral'.
    """
    global _VADER_ANALYZER
    if analyzer is None:
        if _VADER_ANALYZER is None:
            _VADER_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _VADER_ANALYZER

    compound_score = analyzer.polarity_scores(text)['compound']
    if compound_score < -neutral_band:
        return 'Negative'
    if compound_score > neutral_band:
        return 'Positive'
    return 'Neutral'

tweets = df['Tweet']

# Score each tweet once and build the frame in a single step. Growing a
# DataFrame with pd.concat inside the loop re-copied all earlier rows on every
# iteration, and an empty input left a frame with no columns at all.
sentiment_df = pd.DataFrame(
    {
        'Tweet_Sentiment': [getSentiment(post) for post in tqdm(tweets)],
        'Tweet': tweets.tolist(),
    }
).rename_axis('Tweet_No')  # label the 0..n-1 index as the tweet number

sentiment_df.head()

In [None]:
#USING TEXTBLOB
# def getPolarityScore(text):
#     return TextBlob(text).sentiment.polarity 

# def getSentiment(polarity_score):
#     if polarity_score < 0:
#          return 'Negative'
#     elif polarity_score == 0:
#          return 'Neutral'
#     else:
#          return 'Positive'

# tweets = df['Tweet']

# sentiment_df = pd.DataFrame()

# for post in tqdm(tweets):
#     polarity = getPolarityScore(post)
#     sentiment = getSentiment(polarity)
#     row = pd.Series([round(polarity, 2), sentiment, post], index=['Tweet_Polarity', 'Tweet_Sentiment', 'Tweet'])
#     sentiment_df = pd.concat([sentiment_df, row.to_frame().T])

# sentiment_df.reset_index(drop=True, inplace=True)
# sentiment_df = sentiment_df.rename_axis('Tweet_No')
# print(sentiment_df.head())

**VISUALIZATION**

Bar graph

In [None]:

# plt.figure(figsize=(8, 6))

# sns.countplot(x="Tweet_Sentiment", data=sentiment_df)
# plt.xlabel("Count per Sentiment")
# plt.title("Count of sentiment in Dataset")
# plt.show()

Pie Chart

In [None]:
counts = sentiment_df['Tweet_Sentiment'].value_counts()

# value_counts() orders labels by frequency, so a positional colour list would
# paint whichever sentiment is most common red. Look the colour up by label
# instead. The separate name also avoids rebinding `colors`, which the import
# cell binds to the matplotlib.colors module.
sentiment_colors = {'Negative': 'red', 'Positive': 'green', 'Neutral': 'grey'}
slice_colors = [sentiment_colors[label] for label in counts.index]

counts.plot.pie(autopct='%.0f%%', colors=slice_colors)
plt.axis('equal')  # keep the pie circular
plt.show()

**Word Popularity Using N-grams**

Tokenize the previously cleaned tweets, remove the stop words, and stem the remaining tokens.

In [None]:
# Work on an independent copy so the derived columns added below do not touch `df`.
pop_list = df.copy(deep=True)
def remove_punct(text):
    """Strip ASCII punctuation and digits from `text`, then trim the ends.

    Args:
        text: input string.

    Returns:
        The string without any character from ``string.punctuation``, without
        the digits 0-9, and without leading or trailing whitespace.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punct = text.translate(punctuation_table)
    without_digits = re.sub('[0-9]+', '', without_punct)
    return without_digits.strip()
 
 
# Punctuation- and digit-free version of every tweet.
pop_list['punct'] = pop_list['Tweet'].apply(remove_punct)
 
# Applying tokenization
def tokenization(text):
    """Split `text` into its word tokens.

    A token is a maximal run of word characters (``\\w``). Unlike splitting on
    ``\\W+``, this never yields empty strings, so empty input or text with
    leading or trailing separators does not add a bogus ``''`` token that would
    later be counted as a word.

    Args:
        text: input string.

    Returns:
        A list of tokens in order of appearance; ``[]`` when there are none.
    """
    return re.findall(r'\w+', text)
 
 
# Lower-case each cleaned tweet, then split it into tokens.
pop_list['tokenized'] = pop_list['punct'].map(str.lower).map(tokenization)
 
# Removing stopwords
# Query-specific words are filtered together with NLTK's English stop words.
# Note: this assignment rebinds the name `stopwords`, which the import cell
# binds to the nltk.corpus reader; the fully qualified path is used on purpose.
my_stopwords = ['ukrainian', 'go','president','volodymyr']
stopwords = nltk.corpus.stopwords.words('english')
final_sw = [*my_stopwords, *stopwords]


def remove_stopwords(text):
    """Return the tokens of `text` that are not listed in `final_sw`.

    Args:
        text: an iterable of word tokens.

    Returns:
        A list with the remaining tokens in their original order.
    """
    return list(filter(lambda token: token not in final_sw, text))
 
# Token lists with the stop words filtered out.
pop_list['nonstop'] = pop_list['tokenized'].apply(remove_stopwords)
 
# Applying Stemmer
# One Porter stemmer instance, shared by every call to stemming().
ps = nltk.PorterStemmer()


def stemming(text):
    """Return the stem of every token in `text`, computed with `ps.stem`.

    Args:
        text: an iterable of word tokens.

    Returns:
        A list of stems in the same order as the input tokens.
    """
    return list(map(ps.stem, text))
 
# Stemmed form of the stop-word-free tokens.
pop_list['stemmed'] = pop_list['nonstop'].apply(stemming)

# Preview all intermediate columns side by side.
pop_list.head()

**Most used words**

In [None]:
# NOTE: this definition rebinds `cleantext`, which an earlier cell defines as the
# regex-based tweet cleaner. After this cell runs, re-running that cleaning step
# would call this token-producing version instead.
def cleantext(text):
    """Turn a string into a list of stemmed, stop-word-free tokens.

    Applies, in order: remove_punct, lower-casing, tokenization,
    remove_stopwords and stemming.

    Args:
        text: input string.

    Returns:
        The list produced by `stemming` for the remaining tokens.
    """
    stripped = remove_punct(text).lower()
    kept_tokens = remove_stopwords(tokenization(stripped))
    return stemming(kept_tokens)

In [None]:
# Applying Countvectorizer
# `cleantext` is used as the analyzer, so each tweet is reduced to its stemmed tokens.
countVectorizer = CountVectorizer(analyzer=cleantext)
countVector = countVectorizer.fit_transform(pop_list['Tweet'])

# Dense document-term table: one row per tweet, one column per token.
feature_names = countVectorizer.get_feature_names_out()
count_vect_df = pd.DataFrame(countVector.toarray(), columns=feature_names)
count_vect_df.head()

# Most Used Words
# Column totals give the corpus-wide frequency of each token; keep the top 20.
count = pd.DataFrame(count_vect_df.sum())
top_twenty = count.sort_values(0, ascending=False).head(20)
countdf = top_twenty.rename(columns={0: 'Word Count'})

# Display the first 16 of those 20 rows.
countdf.iloc[:16]



In [None]:
# Create the figure before drawing. Calling plt.figure() after plt.bar() opened
# a second, empty figure and left the chart at the default size.
plt.figure(figsize=(8, 6))

plt.bar(countdf.index, countdf['Word Count'])

plt.title("Most Used Words")
plt.xlabel("Words")
plt.ylabel("Frequency")
plt.xticks(rotation=45, ha='right', fontsize=8)  # slanted labels to avoid overlap

plt.show()

**Bigram and Trigram**

In [None]:
def get_top_n_gram(corpus, ngram_range, n=None):
    """Return the most frequent n-grams in `corpus`.

    Args:
        corpus: iterable of text documents.
        ngram_range: ``(min_n, max_n)`` tuple passed to ``CountVectorizer``.
        n: how many entries to return; ``None`` returns all of them.

    Returns:
        A list of ``(ngram, count)`` tuples sorted by descending count.
        N-grams are built after removing the words in `final_sw`.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words=final_sw)
    # fit_transform learns the vocabulary and counts in one pass; calling fit()
    # and then transform() tokenised the whole corpus twice.
    bag_of_words = vec.fit_transform(corpus)
    sum_words = bag_of_words.sum(axis=0)  # 1 x vocabulary matrix of totals
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
 
def _plot_top_ngrams(ngrams, out_path, figsize):
    """Draw a horizontal bar chart of (n-gram, count) pairs and save it.

    Args:
        ngrams: list of ``(ngram, count)`` tuples.
        out_path: file name the figure is saved to.
        figsize: ``(width, height)`` of the new figure in inches.

    Returns:
        The Axes returned by ``sns.barplot``.
    """
    # Named columns give the axes readable labels instead of "1" and "0".
    ngram_df = pd.DataFrame(ngrams, columns=['N-gram', 'Count'])
    plt.figure(figsize=figsize, dpi=600)  # fresh figure so the charts do not overlap
    ax = sns.barplot(x='Count', y='N-gram', data=ngram_df)
    plt.savefig(out_path)
    return ax


# n2_bigram
n2_bigrams = get_top_n_gram(pop_list['Tweet'], (2, 2), 20)
sns_plot = _plot_top_ngrams(n2_bigrams, 'bigram.jpg', figsize=(10, 6))

# n3_trigram
n3_trigrams = get_top_n_gram(pop_list['Tweet'], (3, 3), 20)
sns_plot = _plot_top_ngrams(n3_trigrams, 'trigram.jpg', figsize=(8, 6))