In [18]:
!pip install nltk
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
#importing the important libraries
import os
import re
import requests
from collections import defaultdict, Counter
import math
import pandas as pd
import numpy as np
import string
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords

## Question 2: Implementing a Naive Bayes Classifier for Sentiment Analysis [40]

In [20]:
#HTTP headers passed to the requests calls that use this configuration
headers = {'Accept': 'application/vnd.github.v3+json', 'User-Agent': 'Python-requests'}

#Dataset splits and the sentiment sub-folder names found inside each split
folders = ['TEST', 'TRAIN']
subfolders = ['positive', 'negative']

#Endpoint prefixes: raw file bodies, and the GitHub contents API for directory listings
base_raw_url = 'https://raw.githubusercontent.com/Kushal-Chandani/NLP-Homeworks/main/Homework2/Data/Question2/'
base_api_url = 'https://api.github.com/repos/Kushal-Chandani/NLP-Homeworks/contents/Homework2/Data/Question2/'

def fetch_file_list(folder, subfolder, timeout=30):
    """List the ``.txt`` file names in ``<folder>/<subfolder>`` via the GitHub contents API.

    Args:
        folder: Dataset folder appended to ``base_api_url``.
        subfolder: Folder nested under ``folder``.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled connection would
            block the notebook indefinitely.

    Returns:
        A list of entry names ending in ``.txt``. An empty list is returned,
        after printing a message, when the status code is not 200 or the JSON
        payload is not a list of entries.
    """
    url = f'{base_api_url}{folder}/{subfolder}'
    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to fetch file list from {url}, Status code: {response.status_code}")
        return []

    entries = response.json()
    if not isinstance(entries, list):
        #Only a list can be a directory listing; anything else would break the loop below
        print(f"Unexpected response format from {url}: expected a list of entries")
        return []

    return [entry['name'] for entry in entries if entry['name'].endswith('.txt')]

def fetch_file_content(folder, subfolder, filename, timeout=30):
    """Download the text of one file from the raw-content endpoint.

    Args:
        folder: Dataset folder appended to ``base_raw_url``.
        subfolder: Folder nested under ``folder``.
        filename: Name of the file to download.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled connection would
            block the notebook indefinitely.

    Returns:
        The response body as text when the status code is 200; otherwise
        ``None`` after printing a failure message.
    """
    file_url = f'{base_raw_url}{folder}/{subfolder}/{filename}'
    response = requests.get(file_url, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to fetch file {filename}, Status code: {response.status_code}")
        return None

    return response.text

def get_subfolder_data(folder, subfolder, label):
    """Download every listed file of one sub-folder into a labelled DataFrame.

    Args:
        folder: Dataset folder passed to the fetch helpers.
        subfolder: Folder nested under ``folder``.
        label: Value stored in the ``Label`` column of every row.

    Returns:
        A DataFrame with the columns ``File Name``, ``Content`` and ``Label``,
        one row per file whose content was fetched and is non-empty.
    """
    rows = []

    #The file names come from the listing helper, not from a hard-coded list
    for name in fetch_file_list(folder, subfolder):
        text = fetch_file_content(folder, subfolder, name)
        if not text:
            #Failed downloads (None) and empty files are left out
            continue
        rows.append([name, text, label])

    return pd.DataFrame(rows, columns=['File Name', 'Content', 'Label'])

#Download the four split/sentiment subsets; label 0 marks negative reviews, label 1 positive ones
df_test_negative = get_subfolder_data('TEST', 'negative', label=0)
df_test_positive = get_subfolder_data('TEST', 'positive', label=1)
df_train_negative = get_subfolder_data('TRAIN', 'negative', label=0)
df_train_positive = get_subfolder_data('TRAIN', 'positive', label=1)

#Stack the negative rows above the positive rows of each split and renumber the index
df_test_combined = pd.concat(
    (df_test_negative, df_test_positive),
    ignore_index=True,
)
df_train_combined = pd.concat(
    (df_train_negative, df_train_positive),
    ignore_index=True,
)

In [21]:
#Show the combined training DataFrame (columns: File Name, Content, Label)
df_train_combined

Unnamed: 0,File Name,Content,Label
0,0_3.txt,Story of a man who has unnatural feelings for ...,0
1,100_3.txt,OK its not the best film I've ever seen but at...,0
2,101_1.txt,"Amateur, no budget films can be surprisingly g...",0
3,102_1.txt,My girlfriend once brought around The Zombie C...,0
4,103_1.txt,"Without wishing to be a killjoy, Brad Sykes is...",0
...,...,...,...
997,96_10.txt,I was reviewing some old VHS tapes I have and ...,1
998,97_9.txt,this was a favorite Christmas Special that I w...,1
999,98_10.txt,"For me too, this Christmas special is one that...",1
1000,99_8.txt,A Christmas Together actually came before my t...,1


In [22]:
#Show the combined test DataFrame (columns: File Name, Content, Label)
df_test_combined

Unnamed: 0,File Name,Content,Label
0,0_2.txt,Once again Mr. Costner has dragged out a movie...,0
1,100_4.txt,I was looking forward to this movie. Trustwort...,0
2,10_3.txt,"Years ago, when DARLING LILI played on TV, it ...",0
3,11_3.txt,Julie Andrews satirically prods her own goody-...,0
4,12_4.txt,Blake Edwards tried very hard to change Julie ...,0
...,...,...,...
197,96_10.txt,"Having been raised in Canada, I saw this short...",1
198,97_7.txt,"Well, I have to disagree with Leonard Maltin o...",1
199,98_9.txt,This short deals with a severely critical writ...,1
200,99_10.txt,"This movie, with all its complexity and subtle...",1


In [23]:
# Normalise a raw review before tokenization
def clean_text(text):
    """Lower-case ``text`` and drop every character that is neither a word character nor whitespace.

    Underscores survive because the regex class ``\\w`` includes them.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# English stop words from NLTK, stored as a set for the membership test in tokenize()
stop_words = set(stopwords.words('english'))
def tokenize(text):
    """Split ``text`` on whitespace and return the words that are not stop words."""
    kept = []
    for word in text.split():
        if word in stop_words:
            continue
        kept.append(word)
    return kept

# Run the full text pipeline (clean, then tokenize) over a collection of reviews
def preprocess_reviews(reviews):
    """Return one token list per review, in the same order as ``reviews``."""
    return [tokenize(clean_text(review)) for review in reviews]

# Tokenize the review text of each split separately for the positive (Label == 1)
# and negative (Label == 0) classes
train_pos_reviews = preprocess_reviews(df_train_combined.loc[df_train_combined['Label'] == 1, 'Content'])
train_neg_reviews = preprocess_reviews(df_train_combined.loc[df_train_combined['Label'] == 0, 'Content'])
test_pos_reviews = preprocess_reviews(df_test_combined.loc[df_test_combined['Label'] == 1, 'Content'])
test_neg_reviews = preprocess_reviews(df_test_combined.loc[df_test_combined['Label'] == 0, 'Content'])


In [24]:
#checking
# train_pos_reviews
# test_neg_reviews

In [25]:
#Collecting the distinct tokens of the training reviews
def build_vocabulary(positive_reviews, negative_reviews):
    """Return the set of all words that occur in either list of tokenized reviews."""
    return {
        word
        for review in positive_reviews + negative_reviews
        for word in review
    }

#Turning each tokenized review into a word-count dictionary
def extract_features(vocabulary, reviews):
    """Count vocabulary words per review.

    Returns a list with one dict per review. Each dict has a key for every
    word in ``vocabulary`` (0 when the word is absent from the review); words
    outside the vocabulary are ignored.
    """
    rows = []
    for tokens in reviews:
        counts = dict.fromkeys(vocabulary, 0)
        for token in tokens:
            if token not in vocabulary:
                continue
            counts[token] += 1
        rows.append(counts)
    return rows

#The vocabulary comes from the training reviews only
vocabulary = build_vocabulary(train_pos_reviews, train_neg_reviews)

#Word-count features for the positive and negative reviews of both the training and the test split
train_pos_features, train_neg_features, test_pos_features, test_neg_features = (
    extract_features(vocabulary, reviews)
    for reviews in (train_pos_reviews, train_neg_reviews, test_pos_reviews, test_neg_reviews)
)


In [26]:
#checking
# test_pos_features
# train_neg_features

In [27]:
#Estimating the class priors from the class sizes
def calculate_class_priors(pos_reviews, neg_reviews):
    """Return ``(pos_prior, neg_prior)``: each class's share of all reviews."""
    n_pos, n_neg = len(pos_reviews), len(neg_reviews)
    n_total = n_pos + n_neg
    return n_pos / n_total, n_neg / n_total

#Calculating the likelihood estimation
def calculate_word_likelihoods(vocabulary, pos_reviews, neg_reviews):
    """Estimate Laplace-smoothed word likelihoods for each class.

    Args:
        vocabulary: Collection of words to estimate probabilities for.
        pos_reviews: Word-count dicts (word -> count) of the positive class.
        neg_reviews: Word-count dicts (word -> count) of the negative class.

    Returns:
        ``(pos_word_probs, neg_word_probs)``: two dicts keyed by every word in
        ``vocabulary`` holding ``(count + 1) / (class_total + len(vocabulary))``.
    """
    def _count_in_vocabulary(reviews):
        #Tally per-word counts and the class total over vocabulary words only
        word_counts = defaultdict(int)
        total_words = 0
        for review in reviews:
            for word, count in review.items():
                if word not in vocabulary:
                    #Out-of-vocabulary tokens get no probability, so they must not
                    #inflate the denominator or the estimates stop summing to 1
                    continue
                word_counts[word] += count
                total_words += count
        return word_counts, total_words

    pos_word_counts, total_pos_words = _count_in_vocabulary(pos_reviews)
    neg_word_counts, total_neg_words = _count_in_vocabulary(neg_reviews)
    vocab_size = len(vocabulary)

    #Add-one (Laplace) smoothing: unseen words keep a small non-zero probability
    pos_word_probs = {
        word: (pos_word_counts.get(word, 0) + 1) / (total_pos_words + vocab_size)
        for word in vocabulary
    }
    neg_word_probs = {
        word: (neg_word_counts.get(word, 0) + 1) / (total_neg_words + vocab_size)
        for word in vocabulary
    }

    return pos_word_probs, neg_word_probs


#Scoring one review under both classes and picking the larger log-posterior
def predict_review(vocabulary, review, pos_prior, neg_prior, pos_word_probs, neg_word_probs):
    """Classify a word-count dict as positive (1) or negative (0).

    Each class score starts at the log prior and adds ``count * log(likelihood)``
    for every vocabulary word in ``review``. Words outside ``vocabulary`` are
    skipped, and a tie is resolved as negative (0).
    """
    pos_score = math.log(pos_prior)
    neg_score = math.log(neg_prior)

    #Likelihood used when a vocabulary word is missing from a probability table
    fallback = 1 / (len(vocabulary) + 1)

    for word, count in review.items():
        if word not in vocabulary:
            continue
        pos_score += count * math.log(pos_word_probs.get(word, fallback))
        neg_score += count * math.log(neg_word_probs.get(word, fallback))

    if pos_score > neg_score:
        return 1
    return 0


In [28]:
#Estimate the model parameters from the training features: class priors first,
#then the per-word likelihoods with Laplace smoothing
pos_prior, neg_prior = calculate_class_priors(
    pos_reviews=train_pos_features,
    neg_reviews=train_neg_features,
)
pos_word_probs, neg_word_probs = calculate_word_likelihoods(
    vocabulary,
    pos_reviews=train_pos_features,
    neg_reviews=train_neg_features,
)

#Assemble the evaluation set: positive reviews first, then negative ones,
#with the true labels listed in the same order
test_reviews = [*test_pos_features, *test_neg_features]
test_labels = [1 for _ in test_pos_features] + [0 for _ in test_neg_features]

#One predicted label (1 or 0) per test review
predictions = [
    predict_review(vocabulary, features, pos_prior, neg_prior, pos_word_probs, neg_word_probs)
    for features in test_reviews
]


In [29]:
#checking
#predictions

In [30]:
#Calculating all the relevant values such as tp, tn, fp, fn
def calculate_confusion_matrix(labels, predictions):
    """Count true/false positives and negatives for binary labels.

    Args:
        labels: Iterable of true labels, where 1 is positive and 0 negative.
        predictions: Iterable of predicted labels, paired with ``labels`` by position.

    Returns:
        The tuple ``(tp, tn, fp, fn)``. Pairs containing a value other than
        0 or 1 are not counted in any cell.
    """
    tp = tn = fp = fn = 0
    #A single pass keeps the counts correct for one-shot iterables (generators,
    #iterators), which repeated zip() passes would exhaust after the first count
    for true, pred in zip(labels, predictions):
        if true == 1 and pred == 1:
            tp += 1
        elif true == 0 and pred == 0:
            tn += 1
        elif true == 0 and pred == 1:
            fp += 1
        elif true == 1 and pred == 0:
            fn += 1
    return tp, tn, fp, fn

#Count the four cells by hand and also build scikit-learn's matrix from the same inputs
tp, tn, fp, fn = calculate_confusion_matrix(test_labels, predictions)
conf = confusion_matrix(test_labels, predictions)
print(tp, tn, fp, fn)

#Print the scikit-learn matrix under a heading
print("Confusion Matrix:")
print(conf)


71 91 10 30
Confusion Matrix:
[[91 10]
 [30 71]]


In [31]:
#Calculating the required metrics
def calculate_metrics(tp, tn, fp, fn):
    """Compute accuracy, precision, recall and F1 from confusion-matrix counts.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        The tuple ``(accuracy, precision, recall, f1_score)``. Any metric whose
        denominator is zero is reported as 0 instead of raising
        ``ZeroDivisionError``.
    """
    total = tp + tn + fp + fn
    #Accuracy gets the same zero-denominator guard as the other metrics
    accuracy = (tp + tn) / total if total != 0 else 0
    precision = tp / (tp + fp) if tp + fp != 0 else 0
    recall = tp / (tp + fn) if tp + fn != 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0
    return accuracy, precision, recall, f1_score

#Turn the confusion-matrix counts into the four evaluation scores
accuracy, precision, recall, f1_score = calculate_metrics(tp=tp, tn=tn, fp=fp, fn=fn)

#Report accuracy as a percentage and the other scores as fractions, all to two decimals
print(
    f"Accuracy: {accuracy * 100:.2f}%",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-Score: {f1_score:.2f}",
    sep="\n",
)

Accuracy: 80.20%
Precision: 0.88
Recall: 0.70
F1-Score: 0.78
