In [None]:
# LexiEmo: Exploring Emotions through Lexicon-Based Sentiment Analysis

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from textblob import TextBlob
from nltk.corpus import stopwords
nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn
nltk.download('wordnet')
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import string

# Silence noisy library warnings (e.g. from sklearn) through the supported
# filter API. Replacing ``warnings.warn`` with a no-op would disable the
# warnings machinery for every library in the process, including code that
# relies on ``warnings.catch_warnings`` or on turning warnings into errors.
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Dataset Load
# Read the tweet CSV once, then pull out the two columns used downstream:
# the tweet text ("clean_text") and its sentiment label ("category").
data = pd.read_csv('Twitter_Data.csv')
text, category = data['clean_text'], data['category']

In [None]:
# Pre-processing
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache so the stop-word corpus is read only once, not per tweet.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Normalise a single tweet for sentiment analysis.

    Strips punctuation, lower-cases the text, tokenizes it with
    ``word_tokenize``, drops English stop words and joins the remaining
    tokens with single spaces.

    Parameters
    ----------
    text : object
        The raw tweet. Anything that is not a ``str`` (for example a missing
        value) is returned unchanged.

    Returns
    -------
    object
        The cleaned string, or the original value when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return text
    lowered = text.translate(_PUNCTUATION_TABLE).lower()
    stop_words = _english_stop_words()
    kept_tokens = [token for token in word_tokenize(lowered) if token not in stop_words]
    return ' '.join(kept_tokens)

# Clean every tweet, then keep only the rows that carry a label.
# The mask is taken from the untouched `data` frame rather than from
# `category`, because this cell reassigns `category`: deriving the mask from
# the already-filtered series on a re-run would no longer line up with the
# full-length text series.
labeled_mask = data['category'].notnull()
preprocessed_text = text.apply(preprocess_text)[labeled_mask]
category = data.loc[labeled_mask, 'category']

In [None]:
# Each Label Count
# Tally how many tweets fall under each category value.
label_counts = data['category'].value_counts()
print("Each Label Counts:", label_counts)

# Display captions for the donut chart, and the value -> caption mapping.
labels = ['Positive(1)', 'Neutral(0)', 'Negative(-1)']
mapped_labels = {1: 'Positive(1)', 0: 'Neutral(0)', -1: 'Negative(-1)'}
print("Each Mapped Label:", mapped_labels)

# Counts in the same order as the mapping above (1, 0, -1).
mapped_label_counts = [label_counts[value] for value in mapped_labels]
light_colors = ['#FFBBBB', '#BBFFBB', '#BBBBFF']  # one pastel colour per wedge

# Each wedge is captioned "<label>: <count>".
wedge_captions = [
    f"{caption}: {count}"
    for caption, count in zip(mapped_labels.values(), mapped_label_counts)
]

plt.figure(figsize=(6, 4))
plt.pie(
    mapped_label_counts,
    labels=wedge_captions,
    autopct='%1.1f%%',
    startangle=90,
    colors=light_colors,
)

# A white disc over the centre turns the pie into a donut.
center_circle = plt.Circle((0, 0), 0.6, color='white')
plt.gca().add_artist(center_circle)
plt.title('Distribution of Tweets by Labels')
plt.show()

In [None]:
# Split the data into training and testing sets
# 80/20 split of the cleaned text and its labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_text,
    category,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition.
for caption, subset in (
    ("X Train Shape:", X_train),
    ("X Test Shape:", X_test),
    ("Y Train Shape:", y_train),
    ("Y Test Shape:", y_test),
):
    print(caption, subset.shape)

In [None]:
# TextBlob Sentiment Analysis
# Score every tweet in X_train with TextBlob; entries that are not strings
# (e.g. missing text) get no score.
blob_polarity = [TextBlob(text).sentiment.polarity if isinstance(text, str) else None for text in X_train]
# Positive polarity -> 1, negative -> -1, zero or missing -> 0 (neutral).
blob_predictions = [1 if polarity and polarity > 0 else -1 if polarity and polarity < 0 else 0 for polarity in blob_polarity]

# Macro-averaged metrics against the true labels of the same rows.
blob_accuracy = accuracy_score(y_train, blob_predictions)
blob_precision = precision_score(y_train, blob_predictions, average='macro')
blob_recall = recall_score(y_train, blob_predictions, average='macro')
blob_f1 = f1_score(y_train, blob_predictions, average='macro')
blob_confusion_matrix = confusion_matrix(y_train, blob_predictions)

print("TextBlob Accuracy:",blob_accuracy)
print("TextBlob Precision:",blob_precision)
print("TextBlob Recall:",blob_recall)
print("TextBlob F1:",blob_f1)
print("TextBlob Confusion Matrix:",blob_confusion_matrix)

# Visualize confusion matrix
# confusion_matrix lays out rows/columns in ascending class order (-1, 0, 1),
# so the tick captions must run Negative -> Neutral -> Positive to label the
# cells correctly.
labels = ['Negative', 'Neutral', 'Positive']
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(blob_confusion_matrix, cmap='Greens')
cbar = ax.figure.colorbar(im, ax=ax)
ax.set(xticks=np.arange(blob_confusion_matrix.shape[1]),
       yticks=np.arange(blob_confusion_matrix.shape[0]),
       xticklabels=labels, yticklabels=labels,
       title='TextBlob Confusion Matrix',
       ylabel='True label',
       xlabel='Predicted label')
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
# Annotate every cell with its count; use white text on the darker cells.
contrast_threshold = np.max(blob_confusion_matrix) / 2
for i in range(blob_confusion_matrix.shape[0]):
    for j in range(blob_confusion_matrix.shape[1]):
        ax.text(j, i, format(blob_confusion_matrix[i, j], 'd'),
                ha="center", va="center",
                color="white" if blob_confusion_matrix[i, j] > contrast_threshold else "black")
plt.show()

In [None]:
# SentiWordNet Sentiment Analysis
def calculate_sentiment_score(text):
    """Return the net SentiWordNet polarity of ``text``.

    The text is split with ``word_tokenize``. For each token, only the first
    entry returned by ``swn.senti_synsets`` contributes; tokens with no
    entries are skipped. The result is the summed positive score minus the
    summed negative score.
    """
    positive_total = 0
    negative_total = 0
    for token in word_tokenize(text):
        senses = list(swn.senti_synsets(token))
        if not senses:
            continue
        first_sense = senses[0]
        positive_total += first_sense.pos_score()
        negative_total += first_sense.neg_score()
    return positive_total - negative_total

# Score every tweet in X_train with SentiWordNet; entries that are not
# strings (e.g. missing text) get no score.
swn_polarity = [calculate_sentiment_score(text) if isinstance(text, str) else None for text in X_train]
# Positive net score -> 1, negative -> -1, zero or missing -> 0 (neutral).
swn_predictions = [1 if polarity and polarity > 0 else -1 if polarity and polarity < 0 else 0 for polarity in swn_polarity]

# Macro-averaged metrics against the true labels of the same rows.
swn_accuracy = accuracy_score(y_train, swn_predictions)
swn_precision = precision_score(y_train, swn_predictions, average='macro')
swn_recall = recall_score(y_train, swn_predictions, average='macro')
swn_f1 = f1_score(y_train, swn_predictions, average='macro')
swn_confusion_matrix = confusion_matrix(y_train, swn_predictions)

print("SentiWordNet Accuracy:",swn_accuracy)
print("SentiWordNet Precision:",swn_precision)
print("SentiWordNet Recall:",swn_recall)
print("SentiWordNet F1:",swn_f1)
print("SentiWordNet Confusion Matrix:",swn_confusion_matrix)

# Visualize confusion matrix
# confusion_matrix lays out rows/columns in ascending class order (-1, 0, 1),
# so the tick captions must run Negative -> Neutral -> Positive to label the
# cells correctly.
labels = ['Negative', 'Neutral', 'Positive']
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(swn_confusion_matrix, cmap='Blues')
cbar = ax.figure.colorbar(im, ax=ax)
ax.set(xticks=np.arange(swn_confusion_matrix.shape[1]),
       yticks=np.arange(swn_confusion_matrix.shape[0]),
       xticklabels=labels, yticklabels=labels,
       title='SentiWordNet Confusion Matrix',
       ylabel='True label',
       xlabel='Predicted label')
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
# Annotate every cell with its count; use white text on the darker cells.
contrast_threshold = np.max(swn_confusion_matrix) / 2
for i in range(swn_confusion_matrix.shape[0]):
    for j in range(swn_confusion_matrix.shape[1]):
        ax.text(j, i, format(swn_confusion_matrix[i, j], 'd'),
                ha="center", va="center",
                color="white" if swn_confusion_matrix[i, j] > contrast_threshold else "black")
plt.show()

In [None]:
# VADER Sentiment Analysis
sid = SentimentIntensityAnalyzer()
# Compound score for every tweet in X_train; entries that are not strings
# (e.g. missing text) are scored 0.
vader_polarity = [sid.polarity_scores(text)['compound'] if isinstance(text, str) else 0 for text in X_train]
# Positive compound score -> 1, negative -> -1, exactly zero -> 0 (neutral).
vader_predictions = [1 if polarity > 0 else -1 if polarity < 0 else 0 for polarity in vader_polarity]

# Macro-averaged metrics against the true labels of the same rows.
vader_accuracy = accuracy_score(y_train, vader_predictions)
vader_precision = precision_score(y_train, vader_predictions, average='macro')
vader_recall = recall_score(y_train, vader_predictions, average='macro')
vader_f1 = f1_score(y_train, vader_predictions, average='macro')
vader_confusion_matrix = confusion_matrix(y_train, vader_predictions)

print("VADER Accuracy:",vader_accuracy)
print("VADER Precision:",vader_precision)
print("VADER Recall:",vader_recall)
print("VADER F1:",vader_f1)
print("VADER Confusion Matrix:",vader_confusion_matrix)

# Visualize confusion matrix
# confusion_matrix lays out rows/columns in ascending class order (-1, 0, 1),
# so the tick captions must run Negative -> Neutral -> Positive to label the
# cells correctly.
labels = ['Negative', 'Neutral', 'Positive']
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(vader_confusion_matrix, cmap='Oranges')
cbar = ax.figure.colorbar(im, ax=ax)
ax.set(xticks=np.arange(vader_confusion_matrix.shape[1]),
       yticks=np.arange(vader_confusion_matrix.shape[0]),
       xticklabels=labels, yticklabels=labels,
       title='VADER Confusion Matrix',
       ylabel='True label',
       xlabel='Predicted label')
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
# Annotate every cell with its count; use white text on the darker cells.
contrast_threshold = np.max(vader_confusion_matrix) / 2
for i in range(vader_confusion_matrix.shape[0]):
    for j in range(vader_confusion_matrix.shape[1]):
        ax.text(j, i, format(vader_confusion_matrix[i, j], 'd'),
                ha="center", va="center",
                color="white" if vader_confusion_matrix[i, j] > contrast_threshold else "black")
plt.show()

In [None]:
# NB Model Training
X_train_cleaned = np.where(pd.isnull(X_train), "", X_train) # Replace missing values with an empty string

# Split the raw text BEFORE vectorising. Fitting TF-IDF on all rows and
# splitting afterwards lets the vocabulary and IDF weights see the rows used
# for evaluation, which leaks information into the reported scores.
# Same test_size and random_state as the split applied previously.
X_train_text, X_test_text, y_train_model, y_test_model = train_test_split(
    X_train_cleaned, y_train, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_model = vectorizer.fit_transform(X_train_text) # Learn vocabulary/IDF from the training rows only
X_test_model = vectorizer.transform(X_test_text) # Reuse the fitted vocabulary on the held-out rows

nb_classifier = MultinomialNB() # Train a Naive Bayes classifier
nb_classifier.fit(X_train_model, y_train_model)
nb_predictions = nb_classifier.predict(X_test_model) # Evaluate the trained model on the held-out rows

# Macro-averaged metrics on the held-out rows.
nb_accuracy = accuracy_score(y_test_model, nb_predictions)
nb_precision = precision_score(y_test_model, nb_predictions, average='macro')
nb_recall = recall_score(y_test_model, nb_predictions, average='macro')
nb_f1 = f1_score(y_test_model, nb_predictions, average='macro')
nb_confusion_matrix = confusion_matrix(y_test_model, nb_predictions)

print("Naive Bayes Accuracy:",nb_accuracy)
print("Naive Bayes Precision:",nb_precision)
print("Naive Bayes Recall:",nb_recall)
print("Naive Bayes F1:",nb_f1)
print("Naive Bayes Confusion Matrix:",nb_confusion_matrix)

# Visualize confusion matrix
# confusion_matrix lays out rows/columns in ascending class order (-1, 0, 1),
# so the tick captions must run Negative -> Neutral -> Positive to label the
# cells correctly.
labels = ['Negative', 'Neutral', 'Positive']
fig, ax = plt.subplots(figsize=(6, 4))
im = ax.imshow(nb_confusion_matrix, cmap='YlOrRd')
cbar = ax.figure.colorbar(im, ax=ax)
ax.set(xticks=np.arange(nb_confusion_matrix.shape[1]),
       yticks=np.arange(nb_confusion_matrix.shape[0]),
       xticklabels=labels, yticklabels=labels,
       title='Naive Bayes Confusion Matrix',
       ylabel='True label',
       xlabel='Predicted label')
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
# Annotate every cell with its count; use white text on the darker cells.
contrast_threshold = np.max(nb_confusion_matrix) / 2
for i in range(nb_confusion_matrix.shape[0]):
    for j in range(nb_confusion_matrix.shape[1]):
        ax.text(j, i, format(nb_confusion_matrix[i, j], 'd'),
                ha="center", va="center",
                color="white" if nb_confusion_matrix[i, j] > contrast_threshold else "black")
plt.show()

In [None]:
# Compare Each Methods Label
methods = ['TextBlob', 'SentiWordNet', 'VADER', 'Trained Model']

# Class values and their captions, kept side by side so that the bar heights
# (reindexed to this value order) and the x-tick captions cannot drift apart.
# Defined here so the cell does not rely on a `labels` list left behind by an
# earlier cell.
sentiment_values = [-1, 0, 1]
sentiment_names = ['Negative', 'Neutral', 'Positive']

# How often each method predicted each class; classes never predicted count 0.
label_counts_per_method = []
blob_label_counts = pd.Series(blob_predictions).value_counts().reindex(sentiment_values, fill_value=0)
label_counts_per_method.append(blob_label_counts)
swn_label_counts = pd.Series(swn_predictions).value_counts().reindex(sentiment_values, fill_value=0)
label_counts_per_method.append(swn_label_counts)
vader_label_counts = pd.Series(vader_predictions).value_counts().reindex(sentiment_values, fill_value=0)
label_counts_per_method.append(vader_label_counts)
model_label_counts = pd.Series(nb_predictions).value_counts().reindex(sentiment_values, fill_value=0)
label_counts_per_method.append(model_label_counts)

print("BlobText:",blob_label_counts)
print("SentiWordNet:",swn_label_counts)
print("Vader:",vader_label_counts)
print("NaiveBayes:",model_label_counts)

# NOTE(review): the Naive Bayes predictions cover only its held-out split,
# while the lexicon methods scored all of X_train, so segment heights are not
# directly comparable across methods.
custom_colors = ['#FF69B4', '#8A2BE2', '#FFA500', '#00CED1'] # One colour per method (stack segment)
plt.figure(figsize=(6, 4)) # Plot the stacked bar chart with custom colors
x = np.arange(len(sentiment_names))
width = 0.6
bottom = np.zeros(len(sentiment_names))
for i, method in enumerate(methods):
    # Plain arrays keep the stacking positional rather than index-aligned.
    heights = label_counts_per_method[i].to_numpy()
    plt.bar(x, heights, width=width, bottom=bottom, label=method, color=custom_colors[i % len(custom_colors)])
    bottom += heights

plt.xlabel('Sentiment Labels')
plt.ylabel('Count')
plt.xticks(x, sentiment_names)
plt.title('Composition of Sentiment Labels by Methods')
plt.legend()
plt.show()

In [None]:
# Compare Results
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']

# One score list per approach, ordered like `metrics`.
blob_scores = [blob_accuracy, blob_precision, blob_recall, blob_f1]
swn_scores = [swn_accuracy, swn_precision, swn_recall, swn_f1]
vader_scores = [vader_accuracy, vader_precision, vader_recall, vader_f1]
model_scores = [nb_accuracy, nb_precision, nb_recall, nb_f1]

plt.figure(figsize=(6, 4))
x = range(len(metrics))

# Grouped bars: each approach is shifted right by its offset within a group.
grouped_series = (
    (0, blob_scores, 'TextBlob'),
    (0.2, swn_scores, 'SentiWordNet'),
    (0.4, vader_scores, 'VADER'),
    (0.6, model_scores, 'Trained Model'),
)
for offset, scores, method_name in grouped_series:
    plt.bar([slot + offset for slot in x], scores, width=0.2, label=method_name)

plt.xlabel('Metrics')
plt.ylabel('Scores')
# Centre each tick under its group of four bars.
plt.xticks([slot + 0.3 for slot in x], metrics)
plt.title('Comparison of Sentiment Analysis Approaches')
plt.legend()
plt.show()