## Assignment 1

In [1]:
import numpy as np
from datasets import load_dataset,concatenate_datasets

### Loading The Data

In [2]:
# Load the 'sst' dataset; the cells below use its 'train', 'validation' and 'test' splits.
# NOTE(review): trust_remote_code=True presumably lets the dataset's loading
# script run locally — confirm the source is trusted before re-running.
dataset = load_dataset('sst', trust_remote_code=True)

In [3]:
# Peek at one raw test example: 'label' is a continuous score (0.625 here)
# and 'tokens' is a single '|'-separated string.
dataset['test'][10]

{'sentence': 'What really surprises about Wisegirls is its low-key quality and genuine tenderness .',
 'label': 0.625,
 'tokens': 'What|really|surprises|about|Wisegirls|is|its|low-key|quality|and|genuine|tenderness|.',
 'tree': '25|24|22|21|21|19|16|15|15|17|14|14|20|18|16|17|18|19|20|23|22|23|24|25|0'}

### Preprocessing

In [4]:
def map_class(value):
    """Convert a continuous sentiment score into one of five class indices.

    The cut points 0.2, 0.4, 0.6 and 0.8 split the score axis into five
    half-open intervals ``[lo, hi)``, so a score exactly equal to a cut point
    falls into the higher class (e.g. 0.2 -> 1).

    Args:
        value: A scalar score or an array-like of scores.

    Returns:
        The bin index (0-4) as a NumPy integer, or an integer array when
        ``value`` is array-like.
    """
    cut_points = [0.2, 0.4, 0.6, 0.8]
    # np.digitize defaults to right=False: each bin includes its lower edge.
    return np.digitize(value, cut_points)

In [5]:
def _relabel(example):
    """Return the replacement 'label' column value: the 5-way class index of the score."""
    return {'label': map_class(example['label'])}


# Apply the score -> class mapping to every split.
# The mapped label is still stored as a float (3.0 in the sample shown below),
# which is why later cells cast it to an integer.
dataset_mapped = dataset.map(_relabel)

In [6]:
# Training pool: the 'train' and 'validation' splits merged into one dataset
train_data = concatenate_datasets(
    [dataset_mapped[split_name] for split_name in ('train', 'validation')]
)
# Labels are cast to int32 so they can be used directly as class indices
train_labels = np.array(train_data['label'], dtype=np.int32)

# Evaluation set: the 'test' split, with labels cast the same way
test_data = dataset_mapped['test']
test_labels = np.array(test_data['label'], dtype=np.int32)

In [7]:
# Same test example as inspected before preprocessing: its label is now the
# class index 3.0 instead of the raw score 0.625.
x = test_data[10]
x

{'sentence': 'What really surprises about Wisegirls is its low-key quality and genuine tenderness .',
 'label': 3.0,
 'tokens': 'What|really|surprises|about|Wisegirls|is|its|low-key|quality|and|genuine|tenderness|.',
 'tree': '25|24|22|21|21|19|16|15|15|17|14|14|20|18|16|17|18|19|20|23|22|23|24|25|0'}

### Part 1: Naïve Bayes 

#### Algorithm Implementation

In [8]:
def train_naive_bayes(D,num_classes):
    """Fit a multinomial Naive Bayes model with add-one (Laplace) smoothing.

    Args:
        D: Sized iterable of examples. Each example is indexed with
            ``"label"`` (a value that ``int()`` turns into a class index in
            ``range(num_classes)``) and ``"tokens"`` (one string whose tokens
            are separated by ``"|"``).
        num_classes: Number of distinct classes.

    Returns:
        A tuple ``(log_prior, log_likelihood, vocab)`` where

        * ``log_prior`` is a float array of length ``num_classes`` with the log
          of each class's share of the documents,
        * ``log_likelihood`` is an object array holding one
          ``{word: log P(word | class)}`` dict per class, each covering the
          whole vocabulary,
        * ``vocab`` is the set of every token seen in ``D``.
    """
    total_docs = len(D)
    docs_per_class = np.zeros(num_classes)
    token_counts = np.array([{} for _ in range(num_classes)])  # one count dict per class
    vocab = set()

    # Single pass over the data: tally documents per class and tokens per class.
    for example in D:
        label = int(example["label"])
        docs_per_class[label] += 1
        counts = token_counts[label]
        for token in example["tokens"].split("|"):
            counts[token] = counts.get(token, 0) + 1
            vocab.add(token)

    # A class with no documents has prior 0, so its log prior is -inf.
    log_prior = np.log(docs_per_class / total_docs)

    vocab_size = len(vocab)
    tokens_per_class = np.array([sum(counts.values()) for counts in token_counts])

    # Add-one smoothing: every vocabulary word gets a pseudo-count of 1, so the
    # denominator grows by the vocabulary size and unseen (word, class) pairs
    # still receive a finite log likelihood.
    smoothed = []
    for cls, counts in enumerate(token_counts):
        denominator = tokens_per_class[cls] + vocab_size
        smoothed.append(
            {token: np.log((counts.get(token, 0) + 1) / denominator) for token in vocab}
        )
    log_likelihood = np.array(smoothed)

    return log_prior, log_likelihood, vocab

def test_naive_bayes(test_doc,log_prior, log_likelihood,num_classes,vocab):
    sum_lg = np.zeros(num_classes)
    for c in range(num_classes):
        sum_lg[c] = log_prior[c]
        for word in test_doc.split():
            if word in vocab:
                sum_lg[c] += log_likelihood[c][word]
                
    return np.argmax(sum_lg)

In [9]:
# Five classes: the indices 0-4 produced by map_class
num_classes = 5

# Fit the hand-written Naive Bayes model on the merged training pool
log_prior, log_likelihood, vocab = train_naive_bayes(D=train_data, num_classes=num_classes)

In [10]:
# Classify every test sentence with the hand-written model.
# NOTE(review): the model was trained on the "|"-separated "tokens" field, while
# test_naive_bayes whitespace-splits the "sentence" text passed here — confirm
# both produce the same tokens for this dataset.
predictions = np.array([
    test_naive_bayes(doc["sentence"], log_prior, log_likelihood, num_classes, vocab)
    for doc in test_data
])

# Number of predictions that agree with the gold class indices
correct_predictions = (predictions == test_labels).sum()

# Accuracy = fraction of test examples classified correctly
my_accuracy = correct_predictions / len(test_labels)

In [11]:
# Test-set accuracy of the hand-written Naive Bayes classifier
print(my_accuracy)

0.3995475113122172


#### Comparison with scikit-learn

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


# Reference model: bag-of-words counts fed into scikit-learn's multinomial Naive Bayes.
# NOTE(review): CountVectorizer tokenizes the raw sentences itself, so its
# vocabulary presumably differs from the '|'-split one used above — check its
# defaults before reading much into small accuracy differences.
pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])

# Train on the raw training sentences, then predict the test sentences
# (fit returns the fitted pipeline, so the two calls can be chained)
y_pred = pipeline.fit(train_data["sentence"], train_labels).predict(test_data["sentence"])

# Report overall accuracy and the per-class precision / recall / F1 breakdown
sk_accuracy = accuracy_score(test_labels, y_pred)
sk_report = classification_report(test_labels, y_pred)
print("Accuracy:", sk_accuracy)
print("Classification Report:\n", sk_report)

Accuracy: 0.40497737556561086
Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.09      0.15       279
           1       0.46      0.65      0.54       633
           2       0.21      0.09      0.13       389
           3       0.36      0.65      0.46       510
           4       0.58      0.23      0.33       399

    accuracy                           0.40      2210
   macro avg       0.40      0.34      0.32      2210
weighted avg       0.41      0.40      0.36      2210



### Part 2: Logistic Regression