In [None]:
# Load datasets
import pandas as pd
import csv

# Location of the dataset CSV, relative to the notebook's working directory.
data = 'Full Dataset - clean_data.csv'
# The file is comma-delimited; the delimiter is spelled out to document the format.
df = pd.read_csv(filepath_or_buffer=data, delimiter=',')
# Optional outlier filter (disabled): drop the account w/ 3mil followers
# df = df[df.followers < 3000000]

In [None]:
# Correlation (Heatmap)
import plotly.express as px

# Numeric columns whose pairwise correlations are plotted.
num_features = [
    'followers',
    'likes',
    'replies',
    'retweets',
    'quoteTweets',
]
# Spearman rank correlation is used instead of the default Pearson coefficient.
corr = df.loc[:, num_features].corr(method='spearman')

# Diverging colour scale pinned to [-1, 1] so that zero correlation sits at the midpoint.
fig = px.imshow(
    corr,
    x=corr.columns,
    y=corr.columns,
    labels={'x': 'Features', 'y': 'Features', 'color': 'Correlation'},
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu',
)
fig.show()

Figure 10: This heatmap displays the correlations between the number of followers and the numbers of likes, replies, retweets, and quote tweets that a tweet received from other users. Spearman's rank correlation was used because the data was not normally distributed. The heatmap was plotted with the plotly library.

# Lower Casing and Stop Word Removal
For lower casing, we simply applied `.apply(str.lower)` to the data frame's `translatedTweet` column and stored the result in a new `tweetNLP` column.

Before tokenizing, we decided to remove the stop words first. For this, we imported the English stop words from the NLTK library. We then created the function `remove_stopw`, which strips the punctuation marks from each tweet in the data frame and then removes the stop words.

In [None]:
# Lower casing and removal of stop words
import string                                               # for removal of punctuation marks
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')

# Lower casing: store a lower-cased copy of every translated tweet
df['tweetNLP'] = df['translatedTweet'].map(str.lower)

# Removal of stop words
stop_words = set(stopwords.words('english'))                # English stop words from the NLTK corpus
punct_str = ''.join((string.punctuation, '“”—'))            # ASCII punctuation plus curly quotes and the em dash
def remove_stopw(x, punctuation, stop_word_set=None):
  """Strip punctuation from a string and drop its stop words.

  Each whitespace-separated token is checked against the stop words twice:
  once with only its leading/trailing punctuation trimmed and once with all
  punctuation removed. The first check is what catches contractions such as
  "isn't": once the apostrophe is deleted the token reads "isnt", which no
  longer matches the stop-word entry.

  Args:
    x: Text to clean (a ``str``).
    punctuation: String whose characters are all deleted from the kept tokens.
    stop_word_set: Optional collection of words to drop. When omitted, the
      notebook-level ``stop_words`` collection is used.

  Returns:
    The surviving tokens, punctuation-free, joined by single spaces.
  """
  if stop_word_set is None:
    stop_word_set = stop_words                              # notebook-level stop-word set
  strip_table = str.maketrans('', '', punctuation)          # deletes every punctuation character
  kept = []
  for token in x.split():
    # Keep inner apostrophes for the lookup; a typographic apostrophe (U+2019)
    # is mapped to the straight one so "isn’t" is recognised like "isn't".
    core = token.strip(punctuation).replace('\u2019', "'")
    cleaned = token.translate(strip_table)
    if not cleaned:                                         # token consisted of punctuation only
      continue
    if core in stop_word_set or cleaned in stop_word_set:
      continue
    kept.append(cleaned)
  return " ".join(kept)
# Clean every tweet with remove_stopw (punct_str is forwarded as its second argument),
# then preview the first 20 results.
df['tweetNLP'] = df['tweetNLP'].apply(remove_stopw, args=(punct_str,))
df['tweetNLP'].head(20)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0     600 per kilo edi wag bumili ng 1 kilo solution...
1     ang tawag sa uri ng statement na ito ay sandro...
2     peso weak dollar really strong uh 😥 httpstco42...
3                         peso strong dollar weak na ba
4     peso weak peso weak peso weak dollar strong sa...
5     philippines economic crisis concerts peso isnt...
6     peso rises vs dollar php got stronger usd fall...
7                               peso weak dollar strong
8                          peso weak weak dollar strong
9                             peso weak dollar strong 🥲
10    giving peso weak peso weak peso weak dollar st...
11                            peso weak dollar strong f
12          peso weak weak peso weak dollar strong 💪💪🥴🥴
13    hindi nga daw weak ung peso strong nga lang da...
14              peso weak peso weak dollar strong 🥴🫢🫣🤔🥱
15    baguhin dapat nila wording due weakening peso ...
16          peso weak peso weak peso weak dollar strong
17    grabe nanaman kayo ang sasama niyo alam ni

# Tokenizing

For the tokenization of our dataset, we used the TweetTokenizer from the NLTK library on our `tweetNLP` column to better preserve the formats of the tweets.

In [None]:
# Tokenizing using TweetTokenizer: each cleaned tweet string becomes a list of tokens
nltk.download('punkt')
twtternizer = TweetTokenizer()
df['tweetNLP'] = df['tweetNLP'].map(twtternizer.tokenize)
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,ID,keywords,acountType,joinedDate,following,followers,location,tweet,tweetType,datePosted,contentType,likes,replies,retweets,quoteTweets,quotedTweet,tweetNLP
0,1,"peso, weak, dollar, strong",Identified,04/10,360,558,Philippines,600 per kilo? Edi wag bumili ng 1 kilo.\nSolut...,Text,12/29/22 17:46,"Rational, Emotional",10,0,5,0,,"[600, per, kilo, edi, wag, bumili, ng, 1, kilo..."
1,2,"peso, weak, dollar, strong",Anonymous,06/10,573,1170,Unidentified,"Ang tawag sa uri ng statement na ito ay ""Sandr...","Text,Image",12/24/22 21:57,Rational,14,1,0,0,,"[ang, tawag, sa, uri, ng, statement, na, ito, ..."
2,3,"peso, weak, dollar, strong",Anonymous,05/20,628,535,Unidentified,"The Peso is not weak, but the dollar is reall...","Text,Image",12/12/22 13:03,Emotional,1,1,0,0,,"[peso, weak, dollar, really, strong, uh, 😥, ht..."
3,4,"peso, weak, dollar, strong",Anonymous,09/22,78,0,Philippines,Is the peso strong because the dollar is weak ...,"Text,Quote",12/7/22 22:34,Emotional,1,0,0,0,https://twitter.com/PhilstarNews/status/160041...,"[peso, strong, dollar, weak, na, ba]"
4,5,"peso, dollar, sandro",Identified,04/10,157,84,Philippines,“the peso is not weak because the peso is weak...,"Text,Quote",12/6/22 21:09,Rational,0,0,0,0,https://twitter.com/cnnphilippines/status/1599...,"[peso, weak, peso, weak, peso, weak, dollar, s..."


# Stemming and Lemmatization

The SnowballStemmer from the NLTK library was used for stemming the tweets, and the WordNetLemmatizer from the same library was used for lemmatization.

Please note that stemming may cause the word to be incorrect (e.g., studies may turn into studi). Lemmatization on the other hand reduces the word to its base form correctly (e.g., programming becomes program), but it may not work on all the words (e.g., programmer may stay as programmer rather than transform into program).



In [None]:
# Lemmatization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
  """Lemmatize every token of a tokenized tweet.

  Args:
    text: Iterable of token strings.

  Returns:
    A list with one lemma per input token, in the same order. Every token is
    passed to the lemmatizer with ``pos='v'``.
  """
  lemmas = []
  for token in text:
    lemmas.append(lemmatizer.lemmatize(token, pos='v'))
  return lemmas

# Replace each token list with its lemmatized counterpart, then preview the frame.
df['tweetNLP'] = df['tweetNLP'].map(lemmatize_text)

df.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,ID,keywords,acountType,joinedDate,following,followers,location,tweet,tweetType,datePosted,contentType,likes,replies,retweets,quoteTweets,quotedTweet,tweetNLP
0,1,"peso, weak, dollar, strong",Identified,04/10,360,558,Philippines,600 per kilo? Edi wag bumili ng 1 kilo.\nSolut...,Text,12/29/22 17:46,"Rational, Emotional",10,0,5,0,,"[600, per, kilo, edi, wag, bumili, ng, 1, kilo..."
1,2,"peso, weak, dollar, strong",Anonymous,06/10,573,1170,Unidentified,"Ang tawag sa uri ng statement na ito ay ""Sandr...","Text,Image",12/24/22 21:57,Rational,14,1,0,0,,"[ang, tawag, sa, uri, ng, statement, na, ito, ..."
2,3,"peso, weak, dollar, strong",Anonymous,05/20,628,535,Unidentified,"The Peso is not weak, but the dollar is reall...","Text,Image",12/12/22 13:03,Emotional,1,1,0,0,,"[peso, weak, dollar, really, strong, uh, 😥, ht..."
3,4,"peso, weak, dollar, strong",Anonymous,09/22,78,0,Philippines,Is the peso strong because the dollar is weak ...,"Text,Quote",12/7/22 22:34,Emotional,1,0,0,0,https://twitter.com/PhilstarNews/status/160041...,"[peso, strong, dollar, weak, na, ba]"
4,5,"peso, dollar, sandro",Identified,04/10,157,84,Philippines,“the peso is not weak because the peso is weak...,"Text,Quote",12/6/22 21:09,Rational,0,0,0,0,https://twitter.com/cnnphilippines/status/1599...,"[peso, weak, peso, weak, peso, weak, dollar, s..."


In [None]:
# Stemming
from nltk.stem import SnowballStemmer
nltk.download('wordnet')

# English Snowball stemmer, applied token by token to every tweet's token list.
stemmer = SnowballStemmer('english')
df['tweetNLP'] = df['tweetNLP'].apply(lambda tokens: list(map(stemmer.stem, tokens)))
df.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,ID,keywords,acountType,joinedDate,following,followers,location,tweet,tweetType,datePosted,contentType,likes,replies,retweets,quoteTweets,quotedTweet,tweetNLP
0,1,"peso, weak, dollar, strong",Identified,04/10,360,558,Philippines,600 per kilo? Edi wag bumili ng 1 kilo.\nSolut...,Text,12/29/22 17:46,"Rational, Emotional",10,0,5,0,,"[600, per, kilo, edi, wag, bumili, ng, 1, kilo..."
1,2,"peso, weak, dollar, strong",Anonymous,06/10,573,1170,Unidentified,"Ang tawag sa uri ng statement na ito ay ""Sandr...","Text,Image",12/24/22 21:57,Rational,14,1,0,0,,"[ang, tawag, sa, uri, ng, statement, na, ito, ..."
2,3,"peso, weak, dollar, strong",Anonymous,05/20,628,535,Unidentified,"The Peso is not weak, but the dollar is reall...","Text,Image",12/12/22 13:03,Emotional,1,1,0,0,,"[peso, weak, dollar, realli, strong, uh, 😥, ht..."
3,4,"peso, weak, dollar, strong",Anonymous,09/22,78,0,Philippines,Is the peso strong because the dollar is weak ...,"Text,Quote",12/7/22 22:34,Emotional,1,0,0,0,https://twitter.com/PhilstarNews/status/160041...,"[peso, strong, dollar, weak, na, ba]"
4,5,"peso, dollar, sandro",Identified,04/10,157,84,Philippines,“the peso is not weak because the peso is weak...,"Text,Quote",12/6/22 21:09,Rational,0,0,0,0,https://twitter.com/cnnphilippines/status/1599...,"[peso, weak, peso, weak, peso, weak, dollar, s..."


# FIXING THE EMOJIS



In [None]:
# 'Emoji_Dict.p'- download link https://drive.google.com/open?id=1G1vIkkbqPBYPKHcQ8qy0G2zkoab2Qv4v
import pickle                                               # imported here so this cell also runs on a fresh kernel

# The file must be present in the working directory; otherwise open() raises FileNotFoundError.
# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load an 'Emoji_Dict.p' obtained from a source you trust.
with open('Emoji_Dict.p', 'rb') as fp:
    Emoji_Dict = pickle.load(fp)
# Invert the loaded mapping so its former values become the lookup keys
# (presumably the emoji characters, given how the dictionary is used below - verify against the file).
Emoji_Dict = {v: k for k, v in Emoji_Dict.items()}

def convert_emojis_to_word(text, emoji_dict=None):
    """Replace every emoji found in ``text`` with its underscore-joined name.

    The name stored for an emoji has its commas and colons removed and its
    whitespace-separated parts joined with underscores before it is
    substituted into the text.

    Args:
        text: String that may contain emojis.
        emoji_dict: Optional mapping from emoji string to its name. When
            omitted, the notebook-level ``Emoji_Dict`` is used.

    Returns:
        ``text`` with every occurrence of a known emoji replaced by its name.
    """
    if emoji_dict is None:
        emoji_dict = Emoji_Dict
    # Longest keys first, so that a multi-character emoji sequence is replaced
    # as a whole before any shorter emoji it contains.
    for emot in sorted(emoji_dict, key=len, reverse=True):
        if not emot or emot not in text:
            continue
        name = "_".join(emoji_dict[emot].replace(",", "").replace(":", "").split())
        # Literal replacement instead of a regex: keys such as the keycap
        # asterisk contain regex metacharacters and would otherwise fail to compile.
        text = text.replace(emot, name)
    return text

# Quick check of the converter on a short sample sentence.
text = "I won 🥇 in 🏏"
convert_emojis_to_word(text=text)

FileNotFoundError: ignored