# CSC-583
# HW-3
# Hithesh Shanmugam

# Task 1-Baseline lexicon-based classifier

## Separating training and testing set

In [1]:
import os
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
# set the paths to the directories containing the positive and negative review folders
pos_dir = 'C:/Users/sures/OneDrive - DePaul University/Desktop/pos'
neg_dir = 'C:/Users/sures/OneDrive - DePaul University/Desktop/neg'

# create empty lists to store the file paths for training and test sets
train_files = []
test_files = []

# iterate through the positive and negative review folders
for label, directory in [('pos', pos_dir), ('neg', neg_dir)]:
    for filename in os.listdir(directory):
        if filename.startswith('cv8') or filename.startswith('cv9'):
            # add file to test set
            test_files.append(os.path.join(directory, filename))
        else:
            # add file to training set
            train_files.append(os.path.join(directory, filename))

print('Number of training files:', len(train_files))
print('Number of test files:', len(test_files))

Number of training files: 1600
Number of test files: 400


## Creating sets for the pos and neg lexicons

In [2]:
# create a set of positive and negative words from the sentiment lexicon
positive_words = set()
negative_words = set()
with open('C:/Users/sures/OneDrive - DePaul University/Desktop/positive-words.txt', 'r') as f:
    for line in f:
        if not line.startswith(';'):
            positive_words.add(line.strip())
with open('C:/Users/sures/OneDrive - DePaul University/Desktop/negative-words.txt', 'r') as f:
    for line in f:
        if not line.startswith(';'):
            negative_words.add(line.strip())

## Classifying and evaluating the test set

In [3]:
# classify the documents in the training set
train_results = []
for file in train_files:
    with open(file, 'r') as f:
        text = f.read()
        tokens = word_tokenize(text.lower())
        
        # keeping count of positive and negative word counts
        num_pos_words = sum(1 for token in tokens if token in positive_words)
        num_neg_words = sum(1 for token in tokens if token in negative_words)
        
        if num_pos_words > num_neg_words:
            train_results.append(1)
        else:
            train_results.append(0)

# classify the documents in the testing set
test_results = []
for file in test_files:
    with open(file, 'r') as f:
        text = f.read()
        tokens = word_tokenize(text.lower())
        
        # keeping count of positive and negative word counts
        num_pos_words = sum(1 for token in tokens if token in positive_words)
        num_neg_words = sum(1 for token in tokens if token in negative_words)
        
        if num_pos_words > num_neg_words:
            test_results.append(1)
        else:
            test_results.append(0)

# compute precision, recall, f1 score, and accuracy on the test set
tp = sum([1 for i in range(len(test_files)) if test_results[i] == 1 and 'pos' in test_files[i]])
fp = sum([1 for i in range(len(test_files)) if test_results[i] == 1 and 'neg' in test_files[i]])
tn = sum([1 for i in range(len(test_files)) if test_results[i] == 0 and 'neg' in test_files[i]])
fn = sum([1 for i in range(len(test_files)) if test_results[i] == 0 and 'pos' in test_files[i]])

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * precision * recall / (precision + recall)
accuracy = (tp + tn) / (tp + tn + fp + fn)

print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1_score))
print("Accuracy: {:.2f}".format(accuracy))

Precision: 0.75
Recall: 0.64
F1 Score: 0.69
Accuracy: 0.71


# Task 2 - Logistic Regression Classifier

In [4]:
# Set the paths to the directories containing the positive and negative review folders
pos_dir = 'C:/Users/sures/OneDrive - DePaul University/Desktop/pos'
neg_dir = 'C:/Users/sures/OneDrive - DePaul University/Desktop/neg'

# Create empty lists to store the file paths for training and test sets
train_files = []
test_files = []

# Iterate through the positive and negative review folders
for label, directory in [('pos', pos_dir), ('neg', neg_dir)]:
    for filename in os.listdir(directory):
        if filename.startswith('cv8') or filename.startswith('cv9'):
            # Add file to test set
            test_files.append(os.path.join(directory, filename))
        else:
            # Add file to training set
            train_files.append(os.path.join(directory, filename))

print('Number of training files:', len(train_files))
print('Number of test files:', len(test_files))

# Function to get the last paragraph
def get_last_paragraph(text):
    paragraphs = text.split('\n\n')
    if len(paragraphs) > 1:
        return paragraphs[-2]
    else:
        return ""

def extract_features(text, last_paragraph=True):
    # Get the last paragraph
    if last_paragraph:
        last_paragraph_text = get_last_paragraph(text)
    
    # Get the word tokens
    tokens = word_tokenize(text.lower())
    
    # Extract features
    features = {}
    # Feature 1: Number of tokens in the review
    features['num_tokens'] = len(tokens)
    # Feature 2: Number of positive words
    features['num_positive'] = sum([1 for token in tokens if token in positive_words])
    # Feature 3: Number of negative words
    features['num_negative'] = sum([1 for token in tokens if token in negative_words])
    # Feature 4: Number of exclamation marks
    features['num_exclamation'] = text.count('!')
    # Feature 5: Number of question marks
    features['num_question'] = text.count('?')
    # Feature 6: Number of words in last paragraph
    if last_paragraph:
        features['num_last_paragraph'] = len(word_tokenize(last_paragraph_text))
    # Feature 7: Number of words in the longest sentence
    sentences = sent_tokenize(text)
    features['num_longest_sentence'] = max([len(word_tokenize(sent)) for sent in sentences])
    # Feature 8: Number of unique words
    unique_words = set(tokens)
    features['num_unique_words'] = len(unique_words)
    
    return features

def preprocess(files, last_paragraph=True):
    dataset = []
    for file in files:
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()
            label = 1 if 'pos' in file else 0
            features = extract_features(text, last_paragraph=last_paragraph)
            dataset.append((features, label))
    return dataset


# Preprocess the data
train_data = preprocess(train_files, last_paragraph=True)
test_data = preprocess(test_files, last_paragraph=True)

# Logistic regression classifier begins
class LogisticRegression:
    def __init__(self, num_features):
        self.num_features = num_features
        self.weights = np.zeros((num_features, 32))
        self.bias = 0

    # Sigmoid function to calculate the sigmoid 
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def predict(self, x):
        z = np.dot(x, self.weights) + self.bias
        y_pred = self.sigmoid(z)
        y_pred = y_pred.reshape(-1, 1)  # Reshape to (num_samples, 1)
        return y_pred

    def cross_entropy_loss(self, y_pred, y_true):
        epsilon = 1e-12
        y_pred = np.clip(y_pred, epsilon, 1. - epsilon)
        loss = - y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred)
        return loss

    def train(self, x, y, num_epochs=100, batch_size=32, lr=0.01):
        num_batches = int(np.ceil(len(x) / batch_size))
        losses = []
        
        for epoch in range(num_epochs):
            epoch_loss = 0
            
            # Shuffle the training data for each epoch
            indices = np.random.permutation(len(x))
            x = x[indices]
            y = y[indices]
            
            # Split the training data into batches
            for batch in range(num_batches):
                start = batch * batch_size
                end = (batch + 1) * batch_size
                x_batch = x[start:end]
                y_batch = y[start:end]
                
                # Convert a list of weights to a NumPy array
                self.weights = np.array(self.weights)
                # Compute the predictions and gradients for the current batch
                z = np.dot(np.array(x_batch), self.weights) + self.bias
                y_pred = self.sigmoid(z)
                loss_grad = y_pred - y_batch
                weights_grad = np.dot(x_batch.T, loss_grad) / batch_size
                bias_grad = np.sum(loss_grad) / batch_size
                
                # Update the weights and bias using mini-batch gradient descent
                self.weights -= lr * weights_grad
                self.bias -= lr * bias_grad
                
                # Compute the loss for the current batch
                batch_loss = np.mean(self.cross_entropy_loss(y_pred, y_batch))
                epoch_loss += batch_loss
            
            # Compute the average loss for the epoch
            epoch_loss /= num_batches
            losses.append(epoch_loss)
            
            # Print the epoch number and average loss for the epoch
            print(f"Epoch {epoch + 1}/{num_epochs}: loss={epoch_loss:.4f}")
            
        return self.weights, self.bias, losses

    def evaluate(self, x, y):
        pred_y = self.predict(x)
        pred_y[pred_y >= 0.5] = 1
        pred_y[pred_y < 0.5] = 0
        accuracy = np.mean(pred_y == y)
        tp = np.sum((pred_y == 1) & (y == 1))
        fp = np.sum((pred_y == 1) & (y == 0))
        fn = np.sum((pred_y == 0) & (y == 1))
        precision = tp / (tp + fp) if tp + fp > 0 else 0
        recall = tp / (tp + fn) if tp + fn > 0 else 0
        f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0
        return accuracy, precision, recall, f1_score



# Define the features to be used
features = ['num_tokens', 'num_positive', 'num_negative', 'num_exclamation', 'num_question', 'num_last_paragraph', 'num_longest_sentence', 'num_unique_words' ]

# Preprocess the data
train_data = preprocess(train_files, features)
test_data = preprocess(test_files, features)

Number of training files: 1600
Number of test files: 400


## Training and Testing

In [5]:
# Splitting the labels and data separately for testing and training
train_x = np.array([[sample[0]['num_tokens'], sample[0]['num_positive'], sample[0]['num_negative'], sample[0]['num_exclamation'], sample[0]['num_question'], sample[0]['num_last_paragraph'], sample[0]['num_longest_sentence'], sample[0]['num_unique_words']] for sample in train_data])
train_y = np.array([sample[1] for sample in train_data])
test_x = np.array([[sample[0]['num_tokens'], sample[0]['num_positive'], sample[0]['num_negative'], sample[0]['num_exclamation'], sample[0]['num_question'], sample[0]['num_last_paragraph'], sample[0]['num_longest_sentence'], sample[0]['num_unique_words']] for sample in test_data])
test_y = np.array([sample[1] for sample in test_data])

## Results on training and testing

In [6]:
# Train the logistic regression model
log_reg = LogisticRegression(num_features=len(features))
weights, bias, loss = log_reg.train(train_x, train_y, num_epochs=100, batch_size=32)

# Evaluate the model
accuracy, precision, recall, f1_score = log_reg.evaluate(test_x, test_y)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1_score)

  return 1 / (1 + np.exp(-x))


Epoch 1/100: loss=13.8634
Epoch 2/100: loss=12.9380
Epoch 3/100: loss=13.8001
Epoch 4/100: loss=13.6662
Epoch 5/100: loss=13.2703
Epoch 6/100: loss=13.9232
Epoch 7/100: loss=14.0879
Epoch 8/100: loss=14.0971
Epoch 9/100: loss=13.7216
Epoch 10/100: loss=13.9732
Epoch 11/100: loss=13.8950
Epoch 12/100: loss=13.9892
Epoch 13/100: loss=12.8483
Epoch 14/100: loss=14.0365
Epoch 15/100: loss=14.0245
Epoch 16/100: loss=13.9342
Epoch 17/100: loss=13.4508
Epoch 18/100: loss=13.6826
Epoch 19/100: loss=14.4313
Epoch 20/100: loss=13.5179
Epoch 21/100: loss=13.7456
Epoch 22/100: loss=13.8391
Epoch 23/100: loss=13.9687
Epoch 24/100: loss=14.0636
Epoch 25/100: loss=14.7791
Epoch 26/100: loss=13.6299
Epoch 27/100: loss=13.7478
Epoch 28/100: loss=13.8326
Epoch 29/100: loss=13.4038
Epoch 30/100: loss=13.8248
Epoch 31/100: loss=13.5498
Epoch 32/100: loss=13.8447
Epoch 33/100: loss=14.5837
Epoch 34/100: loss=13.2080
Epoch 35/100: loss=14.2987
Epoch 36/100: loss=13.3488
Epoch 37/100: loss=13.6158
Epoch 38/1

# Task 3 - Ablation study

In [7]:
# Define the features to be used
features = ['num_tokens', 'num_positive', 'num_negative', 'num_exclamation', 'num_question', 'num_last_paragraph', 'num_longest_sentence', 'num_unique_words' ]
# Conduct the ablation study
for feature in features:
    # Create a new feature set with the current feature removed
    ablation_features = [f for f in features]

    # Train and evaluate the model with the current feature removed
    log_reg_ablation = LogisticRegression(num_features=len(ablation_features))
    weights, bias, loss = log_reg_ablation.train(train_x, train_y, num_epochs=100, batch_size=32)
    accuracy, precision, recall, f1_score = log_reg_ablation.evaluate(test_x, test_y)

    # Print the performance metrics for the current feature set
    print('\n*****************\n')
    print(f'Ablation study for {feature}:')
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 Score:', f1_score)
    print('\n*****************\n')

Epoch 1/100: loss=13.5686
Epoch 2/100: loss=14.3139
Epoch 3/100: loss=14.2889
Epoch 4/100: loss=13.7936
Epoch 5/100: loss=13.7368
Epoch 6/100: loss=13.7474
Epoch 7/100: loss=13.6964
Epoch 8/100: loss=13.3659
Epoch 9/100: loss=13.6179
Epoch 10/100: loss=13.5810
Epoch 11/100: loss=14.0588
Epoch 12/100: loss=14.3769
Epoch 13/100: loss=13.7891
Epoch 14/100: loss=13.6848
Epoch 15/100: loss=13.9144


  return 1 / (1 + np.exp(-x))


Epoch 16/100: loss=14.2063
Epoch 17/100: loss=13.1620
Epoch 18/100: loss=14.3511
Epoch 19/100: loss=13.1247
Epoch 20/100: loss=14.4036
Epoch 21/100: loss=13.7724
Epoch 22/100: loss=14.0434
Epoch 23/100: loss=13.8547
Epoch 24/100: loss=14.0570
Epoch 25/100: loss=14.2449
Epoch 26/100: loss=14.1244
Epoch 27/100: loss=13.7310
Epoch 28/100: loss=13.4969
Epoch 29/100: loss=14.3189
Epoch 30/100: loss=13.4690
Epoch 31/100: loss=13.8341
Epoch 32/100: loss=13.8035
Epoch 33/100: loss=13.3573
Epoch 34/100: loss=14.0350
Epoch 35/100: loss=13.7090
Epoch 36/100: loss=13.7516
Epoch 37/100: loss=13.9365
Epoch 38/100: loss=12.8722
Epoch 39/100: loss=14.2356
Epoch 40/100: loss=14.1286
Epoch 41/100: loss=14.3425
Epoch 42/100: loss=14.1203
Epoch 43/100: loss=13.9220
Epoch 44/100: loss=14.0362
Epoch 45/100: loss=13.2676
Epoch 46/100: loss=14.3185
Epoch 47/100: loss=13.3978
Epoch 48/100: loss=14.3149
Epoch 49/100: loss=13.8854
Epoch 50/100: loss=13.3420
Epoch 51/100: loss=13.6076
Epoch 52/100: loss=13.9426
E