In [1]:
#1. Import Libraries

In [2]:
#step 1 - import all libraries for data handling, feature extraction, clustering and plotting
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

In [3]:
#2. Data Loading Functions

In [4]:
#step 2 - define function to load tweets data from the Dataset folder
def load_tweets_data(path="../Dataset/tweets.csv"):
    """Load the tweets dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the relative path of the
        project's Dataset folder, so existing calls without arguments keep
        working.

    Returns
    -------
    pandas.DataFrame
        The contents of the CSV file as parsed by ``pd.read_csv``.
    """
    #relative default path keeps the notebook portable; pass `path` to load another file
    return pd.read_csv(path)

In [5]:
#step 3 - define function to load NRC emotion lexicon from the Lexicon folder
def load_nrc_lexicon(lexicon_path="../Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"):
    """Load a word-level emotion lexicon into a nested dictionary.

    The file is read line by line; each usable line holds three tab-separated
    fields: word, emotion and an integer value. Lines that do not split into
    exactly three fields (blank lines, headers, ...) are skipped.

    Parameters
    ----------
    lexicon_path : str or path-like, optional
        Location of the lexicon file. Defaults to the relative path of the
        project's Lexicon folder, so existing calls without arguments keep
        working.

    Returns
    -------
    dict
        Mapping ``word -> {emotion: int value}``.

    Raises
    ------
    ValueError
        If the third field of a three-field line is not an integer.
    """
    lexicon = {}
    with open(lexicon_path, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split('\t')
            #skip anything that is not a "word<TAB>emotion<TAB>value" record
            if len(parts) != 3:
                continue
            word, emotion, value = parts
            #setdefault creates the per-word dict on first sight of the word
            lexicon.setdefault(word, {})[emotion] = int(value)
    return lexicon

In [6]:
#smoke test for section 2 (Data Loading Functions): load both inputs and preview them
#read the tweets CSV into a DataFrame
tweets = load_tweets_data()
#build the nested word -> {emotion: value} dictionary from the lexicon file
lexicon = load_nrc_lexicon()
#preview the first rows of the tweets data
print(tweets.head())
#preview the first 5 (word, emotion values) entries of the lexicon
print(list(lexicon.items())[:5])

   id                                               link  \
0   0  https://twitter.com/HackneyPSC/status/17274436...   
1   1  https://twitter.com/cherrysattitude/status/172...   
2   2  https://twitter.com/diamoundgirls2/status/1710...   
3   3  https://twitter.com/mmtchi/status/172764634165...   
4   4  https://twitter.com/NoahIeeNG/status/172744319...   

                                                text              date  likes  \
0  A statement from psychoanalytic activists:  Th...  11/22/2023 21:47      0   
1                        bak bak bak bak doyamadınız  11/22/2023 15:27    443   
2  Check out 🏒 35 + different ERIK KARLSSON cards...    10/7/2023 7:15      0   
3  Il s'en passe des trucs pendant qu'on vous ori...  11/23/2023 11:12    381   
4  AW OKAY.. WELL THATS COOL, IM SURE PAL WILL AP...  11/22/2023 21:45      0   

   comments  
0         0  
1         9  
2         0  
3        44  
4         0  
[('aback', {'anger': 0, 'anticipation': 0, 'disgust': 0, 'fear': 0, 

In [7]:
#3. Data Preprocessing

In [8]:
#step 1 - define function to preprocess the tweets data
def preprocess_tweets_data(tweets_data):
    """Return a cleaned copy of the tweets DataFrame.

    Rows whose "text" is missing are dropped, the remaining text is
    lowercased and links are removed. The input DataFrame is not modified.

    Parameters
    ----------
    tweets_data : pandas.DataFrame
        Must contain a "text" column.

    Returns
    -------
    pandas.DataFrame
        A copy of the input without the rows that had missing text and with
        the "text" column cleaned; the original index labels are kept.
    """
    #remove rows with missing tweets; copy so the caller's frame stays untouched
    cleaned = tweets_data.dropna(subset=["text"]).copy()
    #cast to str first: non-string cells would otherwise turn into NaN in .str.lower()
    #and break the link removal (this also keeps an empty column usable with .str)
    text = cleaned["text"].astype(str)
    #convert text to lowercase for consistency
    text = text.str.lower()
    #remove links; requiring "://" or "www." keeps ordinary words such as "awwww" or "https" intact
    cleaned["text"] = text.str.replace(r"https?://\S+|www\.\S+", "", regex=True)
    return cleaned

In [9]:
#smoke test for section 3 (Data Preprocessing): reload the raw tweets and clean them
tweets = load_tweets_data()
#drop rows with missing text, lowercase the text and strip links
tweets = preprocess_tweets_data(tweets)
#preview the first rows to check the cleaned text column
print(tweets.head())

   id                                               link  \
0   0  https://twitter.com/HackneyPSC/status/17274436...   
1   1  https://twitter.com/cherrysattitude/status/172...   
2   2  https://twitter.com/diamoundgirls2/status/1710...   
3   3  https://twitter.com/mmtchi/status/172764634165...   
4   4  https://twitter.com/NoahIeeNG/status/172744319...   

                                                text              date  likes  \
0  a statement from psychoanalytic activists:  th...  11/22/2023 21:47      0   
1                        bak bak bak bak doyamadınız  11/22/2023 15:27    443   
2  check out 🏒 35 + different erik karlsson cards...    10/7/2023 7:15      0   
3  il s'en passe des trucs pendant qu'on vous ori...  11/23/2023 11:12    381   
4  aw okay.. well thats cool, im sure pal will ap...  11/22/2023 21:45      0   

   comments  
0         0  
1         9  
2         0  
3        44  
4         0  


In [None]:
#4. Feature Extraction
