## Assignment 1

In [1]:
import numpy as np
from datasets import load_dataset,concatenate_datasets

### Loading The Data

In [2]:
dataset = load_dataset('sst', trust_remote_code=True)

In [3]:
dataset['test'][10]

{'sentence': 'What really surprises about Wisegirls is its low-key quality and genuine tenderness .',
 'label': 0.625,
 'tokens': 'What|really|surprises|about|Wisegirls|is|its|low-key|quality|and|genuine|tenderness|.',
 'tree': '25|24|22|21|21|19|16|15|15|17|14|14|20|18|16|17|18|19|20|23|22|23|24|25|0'}

### Preprocessing

In [4]:
def map_class(value):
    return np.digitize(value, bins=[0.2, 0.4, 0.6, 0.8], right=False)

In [5]:
dataset_mapped = dataset.map(lambda example: {'label': map_class(example['label'])})

In [6]:
# Access the training set
train_data = concatenate_datasets([dataset_mapped['train'], dataset_mapped['validation']])
train_labels = np.array(train_data['label'],dtype=np.int8)

# Access the test set
test_data = dataset_mapped['test']
test_labels = np.array(test_data['label'],dtype=np.int8)

num_classes = 5

In [7]:
x = test_data[10]
x

{'sentence': 'What really surprises about Wisegirls is its low-key quality and genuine tenderness .',
 'label': 3.0,
 'tokens': 'What|really|surprises|about|Wisegirls|is|its|low-key|quality|and|genuine|tenderness|.',
 'tree': '25|24|22|21|21|19|16|15|15|17|14|14|20|18|16|17|18|19|20|23|22|23|24|25|0'}

### Part 1: Naïve Bayes 

#### Algorithm Implementation

In [8]:
def train_naive_bayes(D,num_classes):
     
    n_doc = len(D)  # Total number of documents/examples
    cls_prior = np.zeros(num_classes)  # Initialize class prior probabilities
    vocab = set()  # To store the vocabulary
    cls_word_cnt = np.array([{} for _ in range(num_classes)])  # Initialize class word counts

    # Loop through each example in the dataset
    for example in D:
        example_class = int(example["label"])  # Get the class label for the example
        cls_prior[example_class] += 1  # Count the number of documents for each class
        for word in example["tokens"].split("|"):  # Tokenize the example
            # Update word count for the corresponding class
            cls_word_cnt[example_class][word] = cls_word_cnt[example_class].get(word, 0) + 1
            vocab.add(word)  # Add word to the vocabulary
           
    cls_prior /= n_doc  # Calculate prior probabilities
    log_prior = np.log(cls_prior)  # Convert prior probabilities to log space
    
    vocab_size = len(vocab)
    cls_total_word_cnt = np.array([sum(cls_dict.values()) for cls_dict in cls_word_cnt])  # Total word count for each class
    
    # Calculate log likelihood with smoothing (Laplace smoothing)
    log_likelihood = np.array([
        {word: np.log((cls_dict.get(word,0) + 1) / (cls_total_word_cnt[i] + vocab_size)) for word in vocab} 
        for i, cls_dict in enumerate(cls_word_cnt)
    ])
    
    return log_prior, log_likelihood, vocab

def test_naive_bayes(test_doc,log_prior, log_likelihood,num_classes,vocab):
    sum_lg = np.zeros(num_classes)
    for c in range(num_classes):
        sum_lg[c] = log_prior[c]
        for word in test_doc.split():
            if word in vocab:
                sum_lg[c] += log_likelihood[c][word]
                
    return np.argmax(sum_lg)

In [9]:
log_prior, log_likelihood, vocab = train_naive_bayes(train_data,num_classes)

In [10]:
predictions = []
for doc in test_data:
    predictions.append(test_naive_bayes(doc["sentence"],log_prior, log_likelihood, num_classes, vocab))
    
predictions = np.array(predictions)

# Calculate the number of correct predictions
correct_predictions = np.sum(predictions == test_labels)
    
# Calculate accuracy
my_accuracy = correct_predictions / len(test_labels)

In [11]:
print(my_accuracy)

0.3995475113122172


#### Comparison with scikit learn

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# Define the pipeline
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()), 
    ('classifier', MultinomialNB())
])

# Fit the pipeline to the training data
pipeline.fit(train_data["sentence"], train_labels)

# Make predictions on the test data
y_pred = pipeline.predict(test_data["sentence"])

# Evaluate
print("Accuracy:", accuracy_score(test_labels, y_pred))

Accuracy: 0.40497737556561086


### Part 2: Logistic Regression

#### Feature Representation

In [13]:
def get_bigrams_cnt(D):
    
    bigrams = {}
    for example in D:
        words_list = example["tokens"].split("|")
        for i in range(len(words_list)-1):
            bigram = words_list[i] + " " + words_list[i+1]
            if bigram not in bigrams:
                bigrams[bigram] = len(bigrams)
    
    return bigrams
                
def generate_feat(D,bigrams,num_classes):
    test_data_feat = np.zeros((len(D),len(bigrams)+1),dtype=np.int8)
    test_data_labels = np.zeros((len(D),num_classes),dtype=np.int8)
    for i,example in enumerate(D):
        test_data_labels[i][int(example["label"])] = 1
        words_list = example["tokens"].split("|")
        for j in range(len(words_list)-1):
            bigram = words_list[j] + " " + words_list[j+1]
            test_data_feat[i][bigrams[bigram]] = 1
            
        
    return test_data_feat ,test_data_labels     


In [14]:
bigrams = get_bigrams_cnt(train_data)
X,Y = generate_feat(train_data,bigrams,num_classes)
print(Y)

[[0 0 0 1 0]
 [0 0 0 0 1]
 [0 0 0 1 0]
 ...
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 1 0 0 0]]


#### Algorithm Implementation

In [18]:
def loss(X_T:np.ndarray,Y_T:np.ndarray,W:np.ndarray,B:np.ndarray):
    soft_max = softmax(W,X_T,B)
    req_ind = np.argmax(Y_T, axis=0)  # Row indices of non-zero elements
    req_values = np.log(soft_max[req_ind, np.arange(soft_max.shape[1])])
    #print(f" max value of log = {max(req_values)}")
    return -1/X_T.shape[1]*np.sum(req_values)

def softmax(W:np.ndarray,X_T:np.ndarray,B:np.ndarray):
    Z = W @ X_T + B
    W_exp = np.exp(Z)
    W_exp_sum = np.sum(W_exp,axis = 0) 

    #print("max in Z", np.max(Z)) 
    return W_exp/W_exp_sum
    
def calculate_gradient(X:np.ndarray,X_T:np.ndarray,Y_T:np.ndarray,W:np.ndarray,B:np.ndarray):
    derivative = Y_T * (1 - softmax(W,X_T,B))
    
    # Identify the indices of non-zero entries in each column
    non_zero_rows = np.argmax(Y_T, axis=0)  # Row indices of non-zero elements
    non_zero_values = derivative[non_zero_rows, np.arange(derivative.shape[1])]  # Non-zero values in derivative
    
    # Vectorized calculation of dW
    dW = np.zeros((W.shape[0],W.shape[1]))
    np.add.at(dW, non_zero_rows, -non_zero_values[:, np.newaxis] * X)  # Efficient row-wise update
    # Calculate dB directly
    dB = -1/X.shape[0]*np.sum(derivative, axis=1)
    dW *= -1/X.shape[0]
     
    return  dW, dB.reshape(5, 1)

# Function to create batches for data (X) and labels (Y)
def create_batches(X, Y, batch_size):
    for i in range(0, len(X), batch_size):
        batch_X = X[i:i + batch_size]
        batch_Y = Y[i:i + batch_size]
        yield batch_X, batch_Y  # Yielding both data and labels
          
def logistic_regression(X:np.ndarray,Y:np.ndarray,num_classes,lr=0.0001,max_itr=100,batch_size=15):
    num_examples, num_features = X.shape
    W = np.zeros((num_classes,num_features))
    B = np.zeros((num_classes,1))
    itr = 0
    # X_T,Y_T = X.T, Y.T

    while itr < max_itr:
        print(f"itr{itr+1}:")
        print("=============================================================================")
        for batch_X, batch_Y in create_batches(X, Y, batch_size):
            X_T, Y_T = batch_X.T, batch_Y.T
            dW, dB = calculate_gradient(batch_X,X_T,Y_T,W,B)  
            W -= lr * dW
            B -= lr * dB
        
            print(f"The loss = {loss(X_T,Y_T,W,B)}")
        
        itr+=1
        print("=============================================================================")
        
    return W,B

In [19]:
logistic_regression(X,Y,num_classes,lr=0.0001,max_itr=10,batch_size=15)

itr1:
The loss = 1.6095098778720265
The loss = 1.6094874063959146
The loss = 1.6094943756886648
The loss = 1.60952018939458
The loss = 1.609484705167308
The loss = 1.6094480834131357
The loss = 1.6094232667415906
The loss = 1.6093827342889038
The loss = 1.6094453136696605
The loss = 1.609365528969633
The loss = 1.6093314704177852
The loss = 1.6093589938567703
The loss = 1.6093727152166553
The loss = 1.6093213804819697
The loss = 1.6093091548227114
The loss = 1.6093494028410946
The loss = 1.6093157721965425
The loss = 1.6092107509102487
The loss = 1.6093516865517787
The loss = 1.6092618855373673
The loss = 1.6093052615567338
The loss = 1.6092404143910333
The loss = 1.6091466350608181
The loss = 1.609157733537934
The loss = 1.6090848511749996
The loss = 1.6091880341221958
The loss = 1.609274861789502
The loss = 1.6091551923574337
The loss = 1.6091699210740424
The loss = 1.6091465198224693
The loss = 1.6092820650908386
The loss = 1.6092140945619557
The loss = 1.6090801351759094
The loss =

(array([[-1.07208155e-04,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [-5.31855614e-05,  0.00000000e+00,  0.00000000e+00, ...,
         -5.31261352e-05, -5.31261352e-05,  0.00000000e+00],
        [-1.06740006e-04,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [-1.06183469e-04, -5.31378972e-05, -5.31378972e-05, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [-5.34638088e-05,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]]),
 array([[0.06597321],
        [0.1332845 ],
        [0.09889425],
        [0.13812035],
        [0.0777031 ]]))

#### Comparison with scikit learn