## Assignment 1

In [202]:
import numpy as np
from datasets import load_dataset,concatenate_datasets

### Loading The Data

In [203]:
# Load the 'sst' dataset through the Hugging Face `datasets` library.
# NOTE(review): trust_remote_code=True presumably lets the dataset's own loading
# script run locally -- confirm this is acceptable for the environment.
dataset = load_dataset('sst', trust_remote_code=True)

In [204]:
# Inspect one raw test example: a sentence, a continuous label, '|'-joined
# tokens and a '|'-joined tree encoding (see the output below).
dataset['test'][10]

{'sentence': 'What really surprises about Wisegirls is its low-key quality and genuine tenderness .',
 'label': 0.625,
 'tokens': 'What|really|surprises|about|Wisegirls|is|its|low-key|quality|and|genuine|tenderness|.',
 'tree': '25|24|22|21|21|19|16|15|15|17|14|14|20|18|16|17|18|19|20|23|22|23|24|25|0'}

### Preprocessing

In [205]:
def map_class(value):
    """Map a continuous sentiment score to one of five class indices (0-4).

    The interior cut points are 0.2, 0.4, 0.6 and 0.8. Because ``right=False``
    is used, a score that falls exactly on a cut point is assigned to the
    higher class (e.g. 0.2 -> class 1).

    Args:
        value: A scalar or array-like of scores.

    Returns:
        The class index (or array of indices) produced by ``np.digitize``.
    """
    class_boundaries = [0.2, 0.4, 0.6, 0.8]
    class_index = np.digitize(value, bins=class_boundaries, right=False)
    return class_index

In [206]:
# Replace each example's continuous 'label' with the class index from map_class.
# As the sample output further down shows, the stored label comes back as a
# float (e.g. 3.0), which is why later code casts it with int().
dataset_mapped = dataset.map(lambda example: {'label': map_class(example['label'])})

In [207]:
# Build the training set by merging the 'train' and 'validation' splits
# (no separate validation set is held out in this notebook).
train_data = concatenate_datasets([dataset_mapped['train'], dataset_mapped['validation']])
# Class indices as a compact integer array for metric computation.
train_labels = np.array(train_data['label'],dtype=np.int8)

# The test split is kept untouched for final evaluation.
test_data = dataset_mapped['test']
test_labels = np.array(test_data['label'],dtype=np.int8)

# Number of sentiment classes; matches the five bins produced by map_class.
num_classes = 5

In [208]:
# Sanity check: the same test example as before, now carrying a class-index
# label instead of the continuous score.
x = test_data[10]
x

{'sentence': 'What really surprises about Wisegirls is its low-key quality and genuine tenderness .',
 'label': 3.0,
 'tokens': 'What|really|surprises|about|Wisegirls|is|its|low-key|quality|and|genuine|tenderness|.',
 'tree': '25|24|22|21|21|19|16|15|15|17|14|14|20|18|16|17|18|19|20|23|22|23|24|25|0'}

### Part 1: Naïve Bayes 

#### Algorithm Implementation

In [209]:
def train_naive_bayes(D,num_classes):
    """Fit a multinomial Naive Bayes model with add-one (Laplace) smoothing.

    Args:
        D: Iterable (with ``len``) of examples; each example exposes a
            ``"label"`` convertible to ``int`` and a ``"tokens"`` string whose
            tokens are separated by ``"|"``.
        num_classes: Number of classes; labels are used as indices
            ``0 .. num_classes - 1``.

    Returns:
        A tuple ``(log_prior, log_likelihood, vocab)`` where
        ``log_prior`` is an array of log class frequencies,
        ``log_likelihood`` is an object array holding one ``{word: log P(word|class)}``
        dict per class (every vocabulary word is present in every dict), and
        ``vocab`` is the set of all tokens seen in ``D``.
    """
    docs_per_class = np.zeros(num_classes)
    counts_per_class = [dict() for _ in range(num_classes)]
    vocab = set()

    # Single pass: tally documents per class and token occurrences per class.
    for doc in D:
        label = int(doc["label"])
        docs_per_class[label] += 1
        counts = counts_per_class[label]
        for token in doc["tokens"].split("|"):
            counts[token] = counts.get(token, 0) + 1
            vocab.add(token)

    # Class priors as relative document frequencies, moved to log space.
    log_prior = np.log(docs_per_class / len(D))

    # Add-one smoothing: every vocabulary word gets a pseudo-count of 1, so the
    # denominator grows by the vocabulary size.
    vocab_size = len(vocab)
    smoothed = []
    for counts in counts_per_class:
        denominator = sum(counts.values()) + vocab_size
        smoothed.append(
            {token: np.log((counts.get(token, 0) + 1) / denominator) for token in vocab}
        )
    log_likelihood = np.array(smoothed)

    return log_prior, log_likelihood, vocab

def test_naive_bayes(test_doc,log_prior, log_likelihood,num_classes,vocab):
    sum_lg = np.zeros(num_classes)
    for c in range(num_classes):
        sum_lg[c] = log_prior[c]
        for word in test_doc.split():
            if word in vocab:
                sum_lg[c] += log_likelihood[c][word]
                
    return np.argmax(sum_lg)

In [210]:
# Fit the hand-written Naive Bayes model on the merged train+validation data.
log_prior, log_likelihood, vocab = train_naive_bayes(train_data,num_classes)

In [211]:
# Classify every test example. The raw "sentence" text is passed in, which
# test_naive_bayes splits on whitespace (training used the '|'-joined "tokens").
predictions = []
for doc in test_data:
    predictions.append(test_naive_bayes(doc["sentence"],log_prior, log_likelihood, num_classes, vocab))
    
predictions = np.array(predictions)

# Calculate the number of correct predictions
correct_predictions = np.sum(predictions == test_labels)
    
# Calculate accuracy (fraction of test examples classified correctly)
my_accuracy = correct_predictions / len(test_labels)

In [212]:
# Report the test accuracy of the hand-written Naive Bayes classifier.
print(my_accuracy)

0.3995475113122172


#### Comparison with scikit-learn

In [213]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# Define the pipeline: bag-of-words counts fed into multinomial Naive Bayes.
# NOTE(review): CountVectorizer applies its own default tokenization, so the
# vocabulary is not guaranteed to match the hand-written model's -- keep this
# in mind when comparing the two accuracies.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()), 
    ('classifier', MultinomialNB())
])

# Fit the pipeline to the training data (raw sentences, integer class labels)
pipeline.fit(train_data["sentence"], train_labels)

# Make predictions on the test data
y_pred = pipeline.predict(test_data["sentence"])

# Evaluate: fraction of test sentences whose predicted class equals the label
print("Accuracy:", accuracy_score(test_labels, y_pred))

Accuracy: 0.40497737556561086


### Part 2: Logistic Regression

#### Feature Representation

In [214]:
def get_bigrams_cnt(D):
    """Build an index of every distinct bigram appearing in ``D``.

    Despite the name, no counts are kept: each bigram (two adjacent tokens
    joined by a single space) is mapped to a unique integer id assigned in
    order of first appearance, starting at 0.

    Args:
        D: Iterable of examples whose ``"tokens"`` field is a ``"|"``-separated
            string.

    Returns:
        dict mapping bigram string -> integer id.
    """
    bigram_index = {}
    for doc in D:
        tokens = doc["tokens"].split("|")
        # Pair every token with its successor.
        for left, right in zip(tokens, tokens[1:]):
            # setdefault only inserts when the bigram is new; the id is the
            # dictionary size at that moment.
            bigram_index.setdefault(left + " " + right, len(bigram_index))

    return bigram_index
                
def generate_feat(D,bigrams,num_classes):
    """Turn examples into binary bigram-presence features and one-hot labels.

    Args:
        D: Sized iterable of examples with a ``"label"`` (convertible to
            ``int``) and a ``"|"``-separated ``"tokens"`` string.
        bigrams: dict mapping bigram string -> column index.
        num_classes: Width of the one-hot label matrix.

    Returns:
        ``(features, labels)``, both ``int8`` arrays. ``features`` has shape
        ``(len(D), len(bigrams) + 1)``; entry ``[i, j]`` is 1 when bigram ``j``
        occurs in example ``i``. The final column is never written by this
        function and therefore stays 0. ``labels`` has shape
        ``(len(D), num_classes)`` with a single 1 per row. Bigrams absent from
        ``bigrams`` are ignored.
    """
    n_examples = len(D)
    features = np.zeros((n_examples, len(bigrams) + 1), dtype=np.int8)
    one_hot = np.zeros((n_examples, num_classes), dtype=np.int8)

    for row, doc in enumerate(D):
        one_hot[row, int(doc["label"])] = 1

        tokens = doc["tokens"].split("|")
        for left, right in zip(tokens, tokens[1:]):
            column = bigrams.get(left + " " + right)
            if column is not None:
                # Presence indicator, not a count: repeated bigrams still give 1.
                features[row, column] = 1

    return features, one_hot


In [215]:
# Index the bigrams of the training data, then build the binary feature matrix
# X and the one-hot label matrix Y from that index.
bigrams = get_bigrams_cnt(train_data)
X,Y = generate_feat(train_data,bigrams,num_classes)
# Quick shape/content checks on the generated arrays.
print(Y.shape)
print(len(X))
print(Y)

(9645, 5)
9645
[[0 0 0 1 0]
 [0 0 0 0 1]
 [0 0 0 1 0]
 ...
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 1 0 0 0]]


#### Algorithm Implementation

In [218]:
def create_batches(X, Y, batch_size):
    """Yield consecutive ``(X_batch, Y_batch)`` slices of at most ``batch_size`` rows.

    Batches are taken in order without shuffling; the final batch may be
    shorter. Both slices are bounded by ``len(X)``.

    Args:
        X: Sliceable sequence of inputs.
        Y: Sliceable sequence of targets aligned with ``X``.
        batch_size: Maximum number of rows per batch.

    Yields:
        Tuples ``(X[start:stop], Y[start:stop])``.
    """
    total = len(X)
    for start in range(0, total, batch_size):
        window = slice(start, min(start + batch_size, total))
        yield X[window], Y[window]

def _shifted_logits(X:np.ndarray,W:np.ndarray,B:np.ndarray):
    """Return the logits ``X @ W + B`` with each row's maximum subtracted.

    Subtracting the row maximum leaves softmax unchanged mathematically but
    keeps every exponent <= 0, so ``np.exp`` cannot overflow.
    """
    logits = X @ W + B   # (N, f) @ (f, c) + (1, c) -> (N, c)
    return logits - np.max(logits, axis=1, keepdims=True)

def softmax(X:np.ndarray,W:np.ndarray,B:np.ndarray):
    """Compute row-wise softmax probabilities of the linear scores ``X @ W + B``.

    Args:
        X: Feature matrix of shape (N, f).
        W: Weight matrix of shape (f, c).
        B: Bias row of shape (1, c).

    Returns:
        Array of shape (N, c); each row is non-negative and sums to 1.
        The computation is numerically stable for arbitrarily large logits.
    """
    exp_scores = np.exp(_shifted_logits(X, W, B))
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

def loss(X:np.ndarray,Y:np.ndarray,W:np.ndarray,B:np.ndarray):
    """Mean cross-entropy between one-hot targets ``Y`` and the softmax model.

    The log-probabilities are computed with the log-sum-exp formulation rather
    than ``log(softmax(...))``, so a probability that underflows to 0 yields a
    large finite loss instead of ``inf``/``nan``.

    Args:
        X: Feature matrix of shape (N, f).
        Y: One-hot label matrix of shape (N, c); the true class of each row is
            taken as ``argmax`` along axis 1.
        W: Weight matrix of shape (f, c).
        B: Bias row of shape (1, c).

    Returns:
        The average negative log-likelihood over the N rows.
    """
    shifted = _shifted_logits(X, W, B)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    true_class = np.argmax(Y, axis=1)
    picked = log_probs[np.arange(log_probs.shape[0]), true_class]

    return -1/X.shape[0]*np.sum(picked)

def calculate_gradient(X:np.ndarray,Y:np.ndarray,W:np.ndarray,B:np.ndarray,num_classes,mu = 0.01):
    """Gradient of the mean cross-entropy loss with respect to ``W`` and ``B``.

    For softmax regression the gradient is ``X.T @ (P - Y) / N`` for the
    weights and the column mean of ``(P - Y)`` for the bias, where ``P`` is the
    predicted probability matrix. Every class column receives a gradient: the
    true class is pushed up and all other classes are pushed down in
    proportion to their predicted probability.

    Args:
        X: Feature matrix of shape (N, f).
        Y: One-hot label matrix of shape (N, c).
        W: Weight matrix of shape (f, c).
        B: Bias row of shape (1, c).
        num_classes: Number of classes ``c``; used to shape the bias gradient.
        mu: L2 regularization strength. Accepted for interface compatibility
            but currently unused: no regularization term is added.

    Returns:
        ``(dW, dB)`` with shapes (f, c) and (1, num_classes).
    """
    residual = softmax(X, W, B) - Y   # (N, c): predicted minus target
    scale = 1 / X.shape[0]            # average over the batch

    dW = scale * (X.T @ residual)     # (f, N) @ (N, c) -> (f, c)
    dB = scale * np.sum(residual, axis=0)

    return  dW, dB.reshape(1, num_classes)
          
def logistic_regression(X:np.ndarray,Y:np.ndarray,num_classes,lr=0.0001,max_itr=100,batch_size=15):
    """Train a softmax (multinomial logistic) regression with mini-batch gradient descent.

    Weights and bias start at zero. For ``max_itr`` passes over the data the
    batches are visited in their original order (no shuffling); after each
    parameter update the loss on that same batch is printed.

    Args:
        X: Feature matrix of shape (N, f).
        Y: One-hot label matrix of shape (N, num_classes).
        num_classes: Number of output classes.
        lr: Learning rate (step size).
        max_itr: Number of full passes over the data.
        batch_size: Number of rows per mini-batch.

    Returns:
        ``(W, B)``: the learned weights of shape (f, num_classes) and bias of
        shape (1, num_classes).
    """
    _, num_features = X.shape
    W = np.zeros((num_features, num_classes))
    B = np.zeros((1, num_classes))

    for itr in range(max_itr):
        print(f"itr{itr+1}:")
        print("=============================================================================")

        for batch_X, batch_Y in create_batches(X, Y, batch_size):
            dW, dB = calculate_gradient(batch_X,batch_Y,W,B,num_classes)
            # In-place descent step on the parameters.
            W -= lr * dW
            B -= lr * dB

            # Loss is evaluated on the current batch with the updated parameters.
            print(f"The loss = {loss(batch_X,batch_Y,W,B)}")

        print("=============================================================================")

    return W,B

In [None]:
# Train on the bigram features for 10 passes with a batch size of 1.
# Because logistic_regression prints the loss after every batch, this prints
# one line per training example per pass. The returned (W, B) pair is not
# assigned to a variable here.
logistic_regression(X,Y,num_classes,lr=0.01,max_itr=10,batch_size=1)

#### Comparison with scikit-learn