<a href="https://colab.research.google.com/github/Mohamedragih1/Stanford-Sentiment-Treebank-SST-Text-Classification/blob/main/Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data loading

In [None]:
#Import required libraries
import numpy as np
from datasets import load_dataset
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

In [None]:
#Load data
dataset = load_dataset("sst")

#Preprocessing

In [3]:
# Categorizing these text samples into five distinct classes
def mapClasses(value):
  if value <= 0.2:
    return 0        # Very Negative
  elif value <= 0.4:
    return 1        # Negative
  elif value <= 0.6:
    return 2        # Neutral
  elif value <= 0.8:
    return 3        # Positive
  else:
    return 4        # very Positive

In [4]:
# Mapping datset samples
dataset = dataset.map(lambda example: {'text': example['sentence'], 'label': mapClasses(example['label'])})

Map:   0%|          | 0/8544 [00:00<?, ? examples/s]

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2210 [00:00<?, ? examples/s]

In [5]:
#extract train, validation and test data
train_data = dataset['train']
val_data = dataset['validation']
test_data = dataset['test']

# Feature generation functions

In [6]:
# tokenizing sentences bby spliting
def tokenize(sentence):
  return sentence.split()

# Get all possible bigrams from given sentence
def generateBigrams(sentence):
  words = tokenize(sentence)
  return [(words[i + 1], words[i]) for i in range(len(words)-1)]

# Generating features for the training set
def trainFeatures(dataset):
  total_bigrams = set()
  for example in dataset:
        bigrams = generateBigrams(example['sentence'])
        total_bigrams.update(bigrams)

  bi_gram_to_index = {bi_gram: index for index, bi_gram in enumerate(total_bigrams)}

  features = np.zeros((len(dataset), len(total_bigrams)))

  for i, example in enumerate(dataset):
    bigrams = generateBigrams(example['sentence'])
    for bi_gram in bigrams:
      index = bi_gram_to_index.get(bi_gram)
      if index is not None:
        features[i, index] = 1

    return features, bi_gram_to_index, total_bigrams

# Generating features for validation and test sets
def nonTrainFeatures(dataset, total_bigrams):
    bi_gram_to_index = {bi_gram: index for index, bi_gram in enumerate(total_bigrams)}

    features = np.zeros((len(dataset), len(total_bigrams)))

    for i, example in enumerate(dataset):
        bigrams = generateBigrams(example['sentence'])
        for bi_gram in bigrams:
            index = bi_gram_to_index.get(bi_gram)
            if index is not None:
                features[i, index] = 1

    return features, bi_gram_to_index

# Features Representaion

In [7]:
train_features, _, bigrams = trainFeatures(train_data)

In [8]:
val_features, _ = nonTrainFeatures(val_data, bigrams)
test_features, _ = nonTrainFeatures(test_data, bigrams)

In [9]:
print(train_features.shape)
print(val_features.shape)
print(test_features.shape)

(8544, 87249)
(1101, 87249)
(2210, 87249)


In [10]:
# Extracting labels
train_labels = np.array([mapClasses(example['label']) for example in train_data])
val_labels = np.array([mapClasses(example['label']) for example in val_data])
test_labels = np.array([mapClasses(example['label']) for example in test_data])

In [11]:
print(train_labels.shape)
print(val_labels.shape)
print(test_labels.shape)

(8544,)
(1101,)
(2210,)


In [12]:
# converting features from float32 to int16
train_features = np.array(train_features,dtype = np.int16)
val_features = np.array(val_features,dtype = np.int16)
test_features = np.array(test_features,dtype = np.int16)

#Model building

In [13]:
# softmax function to return probability of each class
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

In [14]:
class LogisticRegression:
    def __init__(self, learning_rate, num_epochs, batch_size):
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.weights = None
        self.bias = None
        self.num_classes = 5

    def fit(self, X, y):
      n_samples, n_features = X.shape

      self.weights = np.zeros((n_features, self.num_classes))
      self.bias = np.zeros(self.num_classes)

      for epoch in range(self.num_epochs):
          X, y = shuffle(X, y)
          for start in range(0, n_samples, self.batch_size):
            end = min(start + self.batch_size, n_samples)
            X_batch = X[start:end]
            y_batch = y[start:end]
            scores = np.dot(X_batch, self.weights) + self.bias
            outputs = softmax(scores)

            # one hot encoding
            y_encoded = np.eye(self.num_classes)[y_batch]
            error = outputs - y_encoded

            dw = (1 / n_samples) * np.dot(X_batch.T, error)
            db = (1 / n_samples) * np.sum(error, axis=0)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        scores = np.dot(X, self.weights) + self.bias
        outputs = softmax(scores)
        return outputs

# Model training

In [15]:
model = LogisticRegression(learning_rate=0.01, num_epochs=100, batch_size = 32)
model.fit(train_features, train_labels)

In [16]:
predictions = model.predict(val_features)
predicted_labels = np.argmax(predictions, axis=1)
print(f" Accuracy : {accuracy_score(val_labels, predicted_labels)*100}")

 Accuracy : 87.37511353315168


In [17]:
predictions = model.predict(test_features)
predicted_labels = np.argmax(predictions, axis=1)
print(f" Accuracy : {accuracy_score(test_labels, predicted_labels)*100}")

 Accuracy : 87.37556561085972


#SK-learn Model

In [18]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state = 42)
model.fit(train_features, train_labels)

In [21]:
predictions = model.predict(val_features)
print(f" Accuracy : {accuracy_score(val_labels, predictions)*100}")

 Accuracy : 87.37511353315168


In [22]:
predictions = model.predict(test_features)
print(f" Accuracy : {accuracy_score(test_labels, predictions)*100}")

 Accuracy : 87.37556561085972
