# Text Classification

In [2]:
import pandas as pd
import numpy as np
import re

### Loading sentences and labels


In [3]:
sentences = pd.read_csv("stanfordSentimentTreebank/datasetSentences.txt", sep="\t")
labels = pd.read_csv("stanfordSentimentTreebank/sentiment_labels.txt", sep="|")


In [4]:
data = sentences.merge(labels, left_on="sentence_index", right_on="phrase ids", how="left")
data = data[["sentence", "sentiment values"]]

In [5]:
data.head()

Unnamed: 0,sentence,sentiment values
0,The Rock is destined to be the 21st Century 's...,0.5
1,The gorgeously elaborate continuation of `` Th...,0.44444
2,Effective but too-tepid biopic,0.5
3,If you sometimes like to go to the movies to h...,0.42708
4,"Emerges as something rare , an issue movie tha...",0.375


### Converting Scores to labels

In [6]:
def map_sentiment(score):
    if 0 <= score <= 0.2:
        return 0  
    elif 0.2 < score <= 0.4:
        return 1
    elif 0.4 < score <= 0.6:
        return 2
    elif 0.6 < score <= 0.8:
        return 3
    elif 0.8 < score <= 1.0:
        return 4  

In [7]:
data["label"] = data["sentiment values"].apply(map_sentiment)

In [8]:
data.head()

Unnamed: 0,sentence,sentiment values,label
0,The Rock is destined to be the 21st Century 's...,0.5,2
1,The gorgeously elaborate continuation of `` Th...,0.44444,2
2,Effective but too-tepid biopic,0.5,2
3,If you sometimes like to go to the movies to h...,0.42708,2
4,"Emerges as something rare , an issue movie tha...",0.375,1


## Part 2.1 - Implemenation of Naive Bayes Classification

In [9]:
def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [10]:
class NaiveBayesClassifier:
    def __init__(self, alpha=1.0):
        """
        Initialize the Naive Bayes Classifier.
        alpha: Smoothing parameter for Laplace smoothing.
        """
        self.alpha = alpha  # Laplace smoothing
        self.class_priors = None  # Prior probabilities P(Class)
        self.word_probs = None  # Likelihood P(Word | Class)
        self.vocab = None  # Vocabulary

    
    def fit(self, X, y):
        """
        Train the classifier using text data.
        X: List of text samples (sentences).
        y: Corresponding class labels.
        """
        
        # Create vocabulary
        all_words = set(word for text in X for word in text.split())
        self.vocab = {word: i for i, word in enumerate(all_words)}
        V = len(self.vocab)  # Vocabulary size
        
        # Get unique class labels
        classes = np.unique(y)
        num_classes = len(classes)
        
        # Initialize probability tables
        self.class_priors = np.zeros(num_classes)
        word_counts = np.zeros((num_classes, V))  # Word frequency per class
        class_counts = np.zeros(num_classes)  # Total words per class
        
        # Compute class priors and word counts
        for i, cls in enumerate(classes):
            class_indices = np.where(y == cls)[0]
            self.class_priors[i] = len(class_indices) / len(y)  # P(Class)
            
            for idx in class_indices:
                words = X[idx].split()
                for word in words:
                    if word in self.vocab:
                        word_index = self.vocab[word]
                        word_counts[i, word_index] += 1
                        class_counts[i] += 1
        
        # Apply Laplace Smoothing: P(Word | Class) = (word_count + alpha) / (total_words + alpha * V)
        self.word_probs = (word_counts + self.alpha) / (class_counts[:, None] + self.alpha * V)
    
    def predict(self, X):
        """
        Predict the class of new text samples.
        X: List of text samples (sentences).
        """
        predictions = []
        log_class_priors = np.log(self.class_priors)
        
        for text in X:
            words = text.split()
            log_probs = log_class_priors.copy()  # Initialize with log priors
            
            for word in words:
                if word in self.vocab:
                    word_index = self.vocab[word]
                    log_probs += np.log(self.word_probs[:, word_index])
            
            predictions.append(np.argmax(log_probs))  # Choose class with highest log probability
        
        return np.array(predictions)

In [11]:
shuffled_indices = np.random.permutation(len(data))  # Get shuffled indices
data = data.iloc[shuffled_indices].reset_index(drop=True)  # Shuffle data
train_size = int(0.8 * len(data))
X_train, y_train = data["sentence"][:train_size], np.array(data["label"][:train_size])
X_test, y_test = data["sentence"][train_size:], np.array(data["label"][train_size:])

X_train = [preprocess(text) for text in X_train]
X_test = [preprocess(text) for text in X_test]

In [12]:
nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)

# Predict on test data
y_pred = nb.predict(X_test)

# Evaluate accuracy
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")


Test Accuracy: 0.5112
Training samples: 9484, Test samples: 2371


### Comparison with built in Naive Bayes

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Create a pipeline with CountVectorizer + MultinomialNB
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Train the model
model.fit(X_train, y_train)

# Predict using the trained model
y_pred_sklearn = model.predict(X_test)

# Evaluate accuracy
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"Scikit-Learn Naïve Bayes Accuracy: {accuracy_sklearn:.4f}")

 

Scikit-Learn Naïve Bayes Accuracy: 0.5086


## Part 2.2 - Logistic Regression

In [14]:
import random

class LogisticRegression:
    def __init__(self, learning_rate=0.01, epochs=1000, batch_size=32):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.weights = None
        self.bias = 0
        self.vocab = None  # Stores unique bi-grams
    
    def extract_bigrams(self, text):
        words = text.split()
        #takes every two consecutive words together as a bigram
        return [(words[i], words[i+1]) for i in range(len(words) - 1)]
    
    def fit(self, X, y): #X is a list of sentences, y is a list of labels

        vocab_set = set()
        for text in X:
            vocab_set.update(self.extract_bigrams(text)) # store in this set all unique bi-grams

        self.vocab = {bi_gram: i for i, bi_gram in enumerate(vocab_set)}#dictionary; index each unique bigram
        V = len(self.vocab) #vocab size

        # Convert sentences to feature vectors 1 if bigram is present in the sentence, 0 otherwise

        X_transformed = np.zeros((len(X), V)) #initialize a matrix of zeros
        for i, text in enumerate(X):
            bi_grams = self.extract_bigrams(text)
            for bi_gram in bi_grams:
                if bi_gram in self.vocab:
                    X_transformed[i, self.vocab[bi_gram]] = 1 #if bigram is present, set it to 1

        # Initialize weights and bias
        self.weights = np.zeros(V)
        self.bias = 0

        # Training with Mini-Batch Stochastic Gradient Descent
        for epoch in range(self.epochs):
            # Shuffle data each epoch
            indices = np.arange(len(y))
            np.random.shuffle(indices) # Shuffle indices
            X_shuffled, y_shuffled = X_transformed[indices], y[indices]#now the data is shuffled and still with its corresponding labels

            # Mini-batch updates
            for i in range(0, len(y), self.batch_size): #Iterate over the dataset in steps of batch_size.
                X_batch = X_shuffled[i:i + self.batch_size]
                y_batch = y_shuffled[i:i + self.batch_size]
                
                #Compute predictions
                logits = np.dot(X_batch, self.weights) + self.bias # di el heya wx+b
                predictions = 1 / (1 + np.exp(-logits))  # hena sigmoid activation function

                # Compute gradients
                error = predictions - y_batch # di heya predicted - actual
                dw = np.dot(X_batch.T, error) / len(y_batch) #Derivative of the loss w.r.t. weights.
                db = np.sum(error) / len(y_batch)  #Derivative of the loss w.r.t. bias
                
                #Updating weights and bias
                self.weights -= self.learning_rate * dw #w=w-alpha*dw
                self.bias -= self.learning_rate * db #b=b-alpha*db

            # Print loss every 100 epochs
            # if epoch % 100 == 0:
            #     loss = -np.mean(y_batch * np.log(predictions + 1e-9) + (1 - y_batch) * np.log(1 - predictions + 1e-9))
            #     print(f"Epoch {epoch}, Loss: {loss:.4f}")

    def predict(self, X):
        """Predict class labels for new sentences."""
        X_transformed = np.zeros((len(X), len(self.vocab)))

        for i, text in enumerate(X):
            bi_grams = self.extract_bigrams(text)
            for bi_gram in bi_grams:
                if bi_gram in self.vocab:
                    X_transformed[i, self.vocab[bi_gram]] = 1

        # Compute predictions
        logits = np.dot(X_transformed, self.weights) + self.bias
        probabilities = 1 / (1 + np.exp(-logits)) #sigmoid activation function
        return (probabilities >= 0.5).astype(int)  # Return binary predictions (0 or 1)

In [15]:
#logistic regression is binary classifier so we need to convert the labels to 0 and 1
#so i did 0-1 -> 0 and 3-4 -> 1 and removed the neutral class 2

data = data[data["label"] != 2]  # Remove neutral class
data["binary_label"] = data["label"].apply(lambda x: 1 if x > 2 else 0)

# Split into features and labels
X = data["sentence"].values # Text sentences
y = data["binary_label"].values# Binary sentiment labels

In [16]:
from sklearn.model_selection import train_test_split

# Split into train (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [17]:
model = LogisticRegression(learning_rate=0.01, epochs=100, batch_size=32)
model.fit(X_train, y_train)

In [18]:
y_pred_scratch_logistic = model.predict(X_test)

def accuracy_score(y_true, y_pred_scratch_logistic):
    return np.mean(y_true == y_pred_scratch_logistic) * 100

accuracy = accuracy_score(y_test, y_pred_scratch_logistic)
print(f"Test Accuracy: {accuracy:.2f}%")

Test Accuracy: 50.93%


### Comparison with scikit-learn Logistic Regression

In [19]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

import numpy as np

# Create an instance of your logistic regression model
model = LogisticRegression()

# Extract vocabulary from training data
vocab_set = set()
for text in X_train:
    vocab_set.update(model.extract_bigrams(text))

model.vocab = {bi_gram: i for i, bi_gram in enumerate(vocab_set)}

V = len(model.vocab)  # Vocabulary size

# this Function does exactly the same thing as the one in the class but yeahhhhhhh
#had to do this so i can give the built in function the same input as the one in the class
def transform_sentences(X, vocab):
    X_transformed = np.zeros((len(X), len(vocab)))
    for i, text in enumerate(X):
        bi_grams = model.extract_bigrams(text)
        for bi_gram in bi_grams:
            if bi_gram in vocab:
                X_transformed[i, vocab[bi_gram]] = 1
    return X_transformed

# Convert X_train and X_test to feature vectors
X_train_transformed = transform_sentences(X_train, model.vocab)
X_test_transformed = transform_sentences(X_test, model.vocab)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Initialize and train scikit-learn's logistic regression
sklearn_lr = LogisticRegression(solver="lbfgs", max_iter=100)
sklearn_lr.fit(X_train_transformed, y_train)  # Use transformed features

# Make predictions
y_pred_sklearn = sklearn_lr.predict(X_test_transformed)

# Compute accuracy
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"Scikit-Learn Logistic Regression Accuracy: {accuracy_sklearn * 100:.2f}%")

Scikit-Learn Logistic Regression Accuracy: 52.36%


### Comparison with scikit-learn SGD Classifier

In [21]:
from sklearn.linear_model import SGDClassifier

# Initialize and train SGDClassifier as logistic regression
sgd_lr = SGDClassifier(loss="log_loss", max_iter=100, learning_rate="optimal")
sgd_lr.fit(X_train_transformed, y_train)

# Make predictions
y_pred_sgd = sgd_lr.predict(X_test_transformed)

# Compute accuracy
accuracy_sgd = accuracy_score(y_test, y_pred_sgd)
print(f"SGDClassifier Accuracy: {accuracy_sgd * 100:.2f}%")

SGDClassifier Accuracy: 51.91%


## Part 2.3 Confusion Matrix & Evaluation Metrics

In [22]:
import numpy as np

def confusion_matrix(y_true, y_pred, num_classes):

    matrix = np.zeros((num_classes, num_classes), dtype=int)

    for true, pred in zip(y_true, y_pred):
        matrix[true, pred] += 1  # Increment the respective position
    
    return matrix

def compute_metrics(conf_matrix):
    
    num_classes = conf_matrix.shape[0]
    
    precision = []
    recall = []
    f1_score = []

    for i in range(num_classes):
        TP = conf_matrix[i, i]  # True Positives
        FP = sum(conf_matrix[:, i]) - TP  # False Positives
        FN = sum(conf_matrix[i, :]) - TP  # False Negatives
        
        prec = TP / (TP + FP) if (TP + FP) > 0 else 0
        rec = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0

        precision.append(prec)
        recall.append(rec)
        f1_score.append(f1)

    macro_precision = np.mean(precision)
    macro_recall = np.mean(recall)
    macro_f1 = np.mean(f1_score)

    return precision, recall, f1_score, macro_precision, macro_recall, macro_f1


In [23]:
conf_matrix = confusion_matrix(y_test,y_pred_scratch_logistic, 2)
print("Confusion Matrix:\n", conf_matrix)

precision, recall, f1_score, macro_prec, macro_rec, macro_f1 = compute_metrics(conf_matrix)

print("\nPer-class Precision:", [f"{p:.2f}" for p in precision])
print("Per-class Recall:", [f"{r:.2f}" for r in recall])
print("Per-class F1 Score:", [f"{f:.2f}" for f in f1_score])

print("\nMacro-averaged Precision:", f"{macro_prec:.4f}")
print("Macro-averaged Recall:", f"{macro_rec:.4f}")
print("Macro-averaged F1 Score:", f"{macro_f1:.4f}")


Confusion Matrix:
 [[290 267]
 [284 282]]

Per-class Precision: ['0.51', '0.51']
Per-class Recall: ['0.52', '0.50']
Per-class F1 Score: ['0.51', '0.51']

Macro-averaged Precision: 0.5094
Macro-averaged Recall: 0.5094
Macro-averaged F1 Score: 0.5093


### Comparison with sk-learn metrics

In [None]:
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

sk_conf_matrix = sk_confusion_matrix(y_test,y_pred_sklearn)
print("\nConfusion Matrix (Scikit-learn):\n", sk_conf_matrix)

sk_precision = precision_score( y_test,y_pred_sklearn, average=None)
sk_recall = recall_score( y_test,y_pred_sklearn, average=None)
sk_f1_score = f1_score( y_test,y_pred_sklearn, average=None)

sk_macro_precision = precision_score( y_test,y_pred_sklearn, average="macro")
sk_macro_recall = recall_score( y_test,y_pred_sklearn, average="macro")
sk_macro_f1 = f1_score( y_test,y_pred_sklearn, average="macro")

print(f"Per-Class Precision (Sklearn): {[round(i,2) for i in sk_precision.tolist()]}")
print(f"Per-Class Recall (Sklearn): {[round(i,2) for i in sk_recall.tolist()]}")
print(f"Per-Class F1 Score (Sklearn): {[round(i,2) for i in sk_f1_score.tolist()]}")

print("\nMacro-Averaged Metrics:")
print(f"Sklearn: {sk_macro_precision:.4f}")
print(f"Sklearn: {sk_macro_recall:.4f}")
print(f"Sklearn: {sk_macro_f1:.4f}")



Confusion Matrix (Scikit-learn):
 [[293 264]
 [271 295]]
Per-Class Precision (Sklearn): [0.52, 0.53]
Per-Class Recall (Sklearn): [0.53, 0.52]
Per-Class F1 Score (Sklearn): [0.52, 0.52]

Macro-Averaged Metrics:
Sklearn: 0.5236
Sklearn: 0.5236
Sklearn: 0.5236


In [25]:
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

sk_conf_matrix = sk_confusion_matrix(y_test,y_pred_sgd)
print("\nConfusion Matrix (Scikit-learn):\n", sk_conf_matrix)

sk_precision = precision_score( y_test,y_pred_sgd, average=None)
sk_recall = recall_score( y_test,y_pred_sgd, average=None)
sk_f1_score = f1_score( y_test,y_pred_sgd, average=None)

sk_macro_precision = precision_score( y_test,y_pred_sgd, average="macro")
sk_macro_recall = recall_score( y_test,y_pred_sgd, average="macro")
sk_macro_f1 = f1_score( y_test,y_pred_sgd, average="macro")

print(f"Per-Class Precision (Sklearn): {[round(i,2) for i in sk_precision.tolist()]}")
print(f"Per-Class Recall (Sklearn): {[round(i,2) for i in sk_recall.tolist()]}")
print(f"Per-Class F1 Score (Sklearn): {[round(i,2) for i in sk_f1_score.tolist()]}")

print("\nMacro-Averaged Metrics:")
print(f"Sklearn: {sk_macro_precision:.4f}")
print(f"Sklearn: {sk_macro_recall:.4f}")
print(f"Sklearn: {sk_macro_f1:.4f}")



Confusion Matrix (Scikit-learn):
 [[290 267]
 [273 293]]
Per-Class Precision (Sklearn): [0.52, 0.52]
Per-Class Recall (Sklearn): [0.52, 0.52]
Per-Class F1 Score (Sklearn): [0.52, 0.52]

Macro-Averaged Metrics:
Sklearn: 0.5192
Sklearn: 0.5192
Sklearn: 0.5191


#### Our from-scratch implementation is more comparable to the SGD classifier because SGD updates the weights after every sample instead of the entire dataset 

#### Our implementation uses mini batches unlike the logisticRegression which uses the entire Dataset.