# Text Classification

In [67]:
import pandas as pd
import numpy as np

### Loading sentences and labels


In [68]:
sentences = pd.read_csv("stanfordSentimentTreebank/datasetSentences.txt", sep="\t")
labels = pd.read_csv("stanfordSentimentTreebank/sentiment_labels.txt", sep="|")


In [69]:
data = sentences.merge(labels, left_on="sentence_index", right_on="phrase ids", how="left")
data = data[["sentence", "sentiment values"]]

In [70]:
data.head()

Unnamed: 0,sentence,sentiment values
0,The Rock is destined to be the 21st Century 's...,0.5
1,The gorgeously elaborate continuation of `` Th...,0.44444
2,Effective but too-tepid biopic,0.5
3,If you sometimes like to go to the movies to h...,0.42708
4,"Emerges as something rare , an issue movie tha...",0.375


### Converting Scores to labels

In [71]:
def map_sentiment(score):
    if 0 <= score <= 0.2:
        return 0  
    elif 0.2 < score <= 0.4:
        return 1
    elif 0.4 < score <= 0.6:
        return 2
    elif 0.6 < score <= 0.8:
        return 3
    elif 0.8 < score <= 1.0:
        return 4  

In [72]:
data["label"] = data["sentiment values"].apply(map_sentiment)

In [73]:
data.head()

Unnamed: 0,sentence,sentiment values,label
0,The Rock is destined to be the 21st Century 's...,0.5,2
1,The gorgeously elaborate continuation of `` Th...,0.44444,2
2,Effective but too-tepid biopic,0.5,2
3,If you sometimes like to go to the movies to h...,0.42708,2
4,"Emerges as something rare , an issue movie tha...",0.375,1


## Implemenation of Naiive Bayes

In [74]:
class NaiveBayesClassifier:
    def __init__(self, alpha=1.0):
        """
        Initialize the Naive Bayes Classifier.
        alpha: Smoothing parameter for Laplace smoothing.
        """
        self.alpha = alpha  # Laplace smoothing
        self.class_priors = None  # Prior probabilities P(Class)
        self.word_probs = None  # Likelihood P(Word | Class)
        self.vocab = None  # Vocabulary
    
    def fit(self, X, y):
        """
        Train the classifier using text data.
        X: List of text samples (sentences).
        y: Corresponding class labels.
        """
        # Create vocabulary
        all_words = set(word for text in X for word in text.split())
        self.vocab = {word: i for i, word in enumerate(all_words)}
        V = len(self.vocab)  # Vocabulary size
        
        # Get unique class labels
        classes = np.unique(y)
        num_classes = len(classes)
        
        # Initialize probability tables
        self.class_priors = np.zeros(num_classes)
        word_counts = np.zeros((num_classes, V))  # Word frequency per class
        class_counts = np.zeros(num_classes)  # Total words per class
        
        # Compute class priors and word counts
        for i, cls in enumerate(classes):
            class_indices = np.where(y == cls)[0]
            self.class_priors[i] = len(class_indices) / len(y)  # P(Class)
            
            for idx in class_indices:
                words = X[idx].split()
                for word in words:
                    if word in self.vocab:
                        word_index = self.vocab[word]
                        word_counts[i, word_index] += 1
                        class_counts[i] += 1
        
        # Apply Laplace Smoothing: P(Word | Class) = (word_count + alpha) / (total_words + alpha * V)
        self.word_probs = (word_counts + self.alpha) / (class_counts[:, None] + self.alpha * V)
    
    def predict(self, X):
        """
        Predict the class of new text samples.
        X: List of text samples (sentences).
        """
        predictions = []
        log_class_priors = np.log(self.class_priors)
        
        for text in X:
            words = text.split()
            log_probs = log_class_priors.copy()  # Initialize with log priors
            
            for word in words:
                if word in self.vocab:
                    word_index = self.vocab[word]
                    log_probs += np.log(self.word_probs[:, word_index])
            
            predictions.append(np.argmax(log_probs))  # Choose class with highest log probability
        
        return np.array(predictions)

In [75]:
shuffled_indices = np.random.permutation(len(data))  # Get shuffled indices
data = data.iloc[shuffled_indices].reset_index(drop=True)  # Shuffle data
train_size = int(0.8 * len(data))
X_train, y_train = data["sentence"][:train_size], np.array(data["label"][:train_size])
X_test, y_test = data["sentence"][train_size:], np.array(data["label"][train_size:])

In [76]:
nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)

# Predict on test data
y_pred = nb.predict(X_test)

# Evaluate accuracy
accuracy = np.mean(y_pred == y_test)
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")


Test Accuracy: 0.5116
Training samples: 9484, Test samples: 2371


### Comparison with built in

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Create a pipeline with CountVectorizer + MultinomialNB
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Train the model
model.fit(X_train, y_train)

# Predict using the trained model
y_pred_sklearn = model.predict(X_test)

# Evaluate accuracy
accuracy_sklearn = accuracy_score(y_test, y_pred_sklearn)
print(f"Scikit-Learn Naïve Bayes Accuracy: {accuracy_sklearn:.4f}")

 

Scikit-Learn Naïve Bayes Accuracy: 0.5049
