# Lab 5 - Supervised Sentiment Analysis

In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it by path
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the labelled tweet corpus from the mounted Drive folder
full_corpus_df = pd.read_csv("/content/drive/MyDrive/NLP/full-corpus.csv")

# Pair each tweet's text with its sentiment label
tweets_data = list(zip(full_corpus_df['TweetText'], full_corpus_df['Sentiment']))

# Keep only positive and negative tweets (any other label is dropped),
# normalising the label spelling in a single pass over the data
pos_tweets = []
neg_tweets = []
for text, sentiment in tweets_data:
    label = sentiment.lower()
    if label == 'positive':
        pos_tweets.append((text, 'Positive'))
    elif label == 'negative':
        neg_tweets.append((text, 'Negative'))

In [None]:
# Hand-written English stopword set (each word listed once).
# NOTE(review): a later cell runs `from nltk.corpus import stopwords`, which
# rebinds this name; none of the visible cells read this set.
stopwords = {
    # pronouns
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # question words and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # forms of be / have / do
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    # prepositions and adverbs
    "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below",
    "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
    "again", "further", "then", "once",
    "here", "there", "when", "where", "why", "how",
    # quantifiers, negations and degree words
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "none", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    # remaining short words and tokenised contraction pieces
    "s", "t", "can", "will", "just", "don", "should", "now",
    "d", "ll", "m", "o", "re", "ve", "y",
    "ain", "aren", "couldn", "didn", "doesn", "hadn", "hasn", "haven", "isn",
    "ma", "mightn", "mustn", "needn", "shan", "shouldn", "wasn", "weren", "won", "wouldn",
}

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Fetch the tokenizer data used by nltk.word_tokenize.
# Older NLTK releases load 'punkt'; NLTK 3.8.2+ loads 'punkt_tab' instead, and
# word_tokenize raises LookupError if it is missing, so request both.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
def preprocess(tweet):
    """Normalise a raw tweet string before tokenisation.

    Steps, in order: lowercase the text, replace URLs with the token ``URL``,
    replace ``@mentions`` with ``AT_USER``, collapse whitespace runs to a
    single space, drop the ``#`` from hashtags, and strip leading/trailing
    quote characters.

    Args:
        tweet: The tweet text as a ``str``.

    Returns:
        The normalised tweet text.
    """
    tweet = tweet.lower()
    # Raw strings keep `\.` and `\s` as regex escapes instead of (invalid)
    # Python string escapes, which newer interpreters warn about.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # URLs -> URL
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # @username -> AT_USER
    tweet = re.sub(r'[\s]+', ' ', tweet)  # collapse runs of whitespace
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)  # #word -> word
    tweet = tweet.strip('\'"')  # trim surrounding single/double quotes
    return tweet

In [None]:

# Build the vocabulary used as classifier features
def get_word_features(wordlist):
    """Return the distinct words in ``wordlist``.

    The words are counted with ``nltk.FreqDist`` and the keys of that
    distribution are returned.
    """
    frequencies = nltk.FreqDist(wordlist)
    return frequencies.keys()

In [None]:
# Thin wrapper around NLTK's word tokenizer
def tokenize(tweet):
    """Split ``tweet`` into tokens using ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(tweet)
    return tokens

In [None]:
# Collect every token from every tweet (no stopword filtering is applied here)
def get_words_in_tweets(tweets):
    """Return one flat list holding the tokens of all tweets, in order.

    Args:
        tweets: Iterable of ``(text, sentiment)`` pairs; the sentiment is ignored.

    Returns:
        A list of tokens produced by calling ``tokenize`` on each text.
    """
    return [token for text, _sentiment in tweets for token in tokenize(text)]

In [None]:
# Turn a document into a bag-of-words presence feature dict
def extract_features(document):
    """Map ``document`` to ``{'contains(<word>)': bool}`` features.

    One entry is produced for every word in the module-level ``word_features``;
    the value says whether that word occurs among the document's tokens.
    """
    tokens_present = set(tokenize(document))
    return {
        'contains(%s)' % word: (word in tokens_present)
        for word in word_features
    }

In [None]:
# Preprocess every positive and negative tweet, keeping its sentiment label
# (positives first, then negatives)
tweets = [
    (preprocess(text), label)
    for text, label in pos_tweets + neg_tweets
]

In [None]:
# Build the vocabulary from every tweet, then fit Naive Bayes on the full set
all_tokens = get_words_in_tweets(tweets)
word_features = get_word_features(all_tokens)

# apply_features pairs each tweet's extract_features output with its label
training_set = nltk.classify.apply_features(extract_features, tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [None]:
# Optional sanity check: classify one hand-picked tweet
test_tweet = "It’s easy to be brave when you’re hiding behind a keyboard. You and your Hamas friends will regret your barbaric actions very soon."
test_features = extract_features(preprocess(test_tweet))
print(classifier.classify(test_features))

Negative


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# `tweets` is a list of (tweet, sentiment) pairs.
# Split the data into 80% training and 20% testing.
#
# `tweets` is built as all positive tweets followed by all negative tweets, so
# slicing it directly puts only one class in the test set. Shuffle a copy
# first; the fixed seed keeps the split reproducible across runs.
import random

SPLIT_SEED = 42
shuffled_tweets = list(tweets)  # copy so the original ordering is left intact
random.Random(SPLIT_SEED).shuffle(shuffled_tweets)

train_size = int(len(shuffled_tweets) * 0.8)
train_tweets = shuffled_tweets[:train_size]
test_tweets = shuffled_tweets[train_size:]

In [None]:
# Train the classifier with the training data only.
# Rebuild the vocabulary from the training tweets so that words appearing only
# in the test tweets do not leak into the feature set used for evaluation
# (extract_features reads the module-level `word_features`).
word_features = get_word_features(get_words_in_tweets(train_tweets))
training_set = nltk.classify.apply_features(extract_features, train_tweets)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [None]:
# Classify each test tweet and record its true label alongside
predictions = []
actual = []
for text, label in test_tweets:
    predictions.append(classifier.classify(extract_features(text)))
    actual.append(label)

In [None]:
# Accuracy, plus precision and recall computed with average='weighted'
accuracy = accuracy_score(y_true=actual, y_pred=predictions)
precision = precision_score(y_true=actual, y_pred=predictions, average='weighted')
recall = recall_score(y_true=actual, y_pred=predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

# Per-class report (precision / recall / f1-score / support for each label)
print(classification_report(y_true=actual, y_pred=predictions))

Accuracy: 0.1735
Precision: 1.0000
Recall: 0.1735
              precision    recall  f1-score   support

    Negative       1.00      0.17      0.30       219
    Positive       0.00      0.00      0.00         0

    accuracy                           0.17       219
   macro avg       0.50      0.09      0.15       219
weighted avg       1.00      0.17      0.30       219



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import fbeta_score

# F-beta score with beta=2, computed with average='weighted'
f2 = fbeta_score(y_true=actual, y_pred=predictions, beta=2, average='weighted')

print(f"F2 Score: {f2:.4f}")

F2 Score: 0.2079


In [None]:
from sklearn.metrics import f1_score


# F1 score, computed with average='weighted'
f1 = f1_score(y_true=actual, y_pred=predictions, average='weighted')

print(f"F1 Score: {f1:.4f}")

F1 Score: 0.2957
