<a href="https://colab.research.google.com/github/HungYangChang/ECSE-551-Mini-project2/blob/main/miniproject2_supplemental.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mini-Project 2 Supplemental Results

In [None]:
# Import relevant modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import text
from sklearn import model_selection
from sklearn import svm
import time
import nltk

# Load and Prepare Data

In [None]:
# Fetch the labelled training set and the unlabelled test set
url = "https://raw.githubusercontent.com/jonarsenault/ecse551data/master/train.csv"
train_data = pd.read_csv(url)

url = "https://raw.githubusercontent.com/jonarsenault/ecse551data/master/test.csv"
test_data = pd.read_csv(url)

# Parameters
number_of_samples = None  # None means "use every training row"
stop_words = text.ENGLISH_STOP_WORDS
np.random.seed(10)

# The rows are always drawn through `sample`, so the training set is shuffled
# even when every row is kept
number_of_samples = (
    len(train_data) if number_of_samples is None else number_of_samples
)
train_data = train_data.sample(number_of_samples).reset_index(drop=True)

# Model inputs (post text) and targets (subreddit label)
X, y = train_data["body"], train_data["subreddit"]

X_test = test_data["body"]

In [None]:
# Parameters

# Extra stop words added on top of the base `stop_words` set
more_stop_words = [
    "u", "just", "think", "https", "www", "don't", "like", "need",
    "it", "you're", "use", "reddit", "thing", "I'm", "things", "good",
    "really", "want", "maybe", "imgur", "com", "don", "actually", "that",
    "make", "lot", "different", "doing", "that", "better", "going", "great",
]

# Same words, in the same order, minus the web/URL-specific tokens
_web_only_stop_words = ("https", "www", "reddit", "imgur", "com")
fewer_stop_words = [
    word for word in more_stop_words if word not in _web_only_stop_words
]

# Final stop-word sets: base set plus each custom list
all_stop_words = stop_words.union(more_stop_words)
almost_all_stop_words = stop_words.union(fewer_stop_words)

# Define Naive Bayes class

In [None]:

class NaiveBayes:
    """Naive Bayes classifier with additive smoothing.

    For every class k, `fit` estimates
        theta_kj = (count_kj + alpha) / sum_j (count_kj + alpha)
    where count_kj is the sum of feature j over the training rows of class k.
    `predict` picks the class that maximises
        log P(y=k) + sum_j x_j * log(theta_kj) + sum_j (1 - x_j) * log(1 - theta_kj)

    The class exposes `fit`, `predict`, `score`, `get_params` and `set_params`
    so it can be used as the last step of a scikit-learn `Pipeline`.
    """

    def __init__(self, alpha=0.01, prior="learn"):
        """Store the hyper-parameters.

        alpha: smoothing constant added to every per-class feature total.
        prior: "learn" estimates class probabilities from the training labels;
               "uniform" gives every class the same probability.
        """
        self.alpha = alpha
        self.prior = prior

    @staticmethod
    def _to_dense(X):
        """Return X as a dense NumPy array (accepts sparse matrices or array-likes)."""
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)

    def fit(self, X, y):
        """Obtain naive bayes parameters from training data.

        X: feature matrix (sparse or dense), one row per sample.
        y: class labels, one per row of X (Series, list or array).

        Sets `class_probabilities` (Series indexed by sorted label) and
        `parameters` (DataFrame, one row of theta values per label).
        Returns self. Raises ValueError if `prior` is not "learn" or "uniform".
        """
        if self.prior not in ("learn", "uniform"):
            raise ValueError(
                f"prior must be 'learn' or 'uniform', got {self.prior!r}"
            )

        X = self._to_dense(X)
        y_numpy = np.asarray(y)

        # Number of samples per class, in sorted label order
        class_counts = pd.Series(y_numpy).value_counts().sort_index()
        num_labels = len(class_counts)

        if self.prior == "learn":
            # Learn the class probabilities from the training data
            self.class_probabilities = class_counts / len(y_numpy)
        else:
            # Assume a uniform prior
            self.class_probabilities = pd.Series(
                np.repeat(1 / num_labels, num_labels), index=class_counts.index
            )

        # Per-class feature totals: row i sums the samples labelled with class i
        features_count = np.empty((num_labels, X.shape[1]))
        for i, label in enumerate(self.class_probabilities.index):
            features_count[i, :] = X[y_numpy == label].sum(axis=0)

        # Additive smoothing, then normalise each class row to sum to one
        smoothed_numerator = features_count + self.alpha
        smoothed_denominator = smoothed_numerator.sum(axis=1, keepdims=True)

        self.parameters = pd.DataFrame(
            smoothed_numerator / smoothed_denominator,
            index=self.class_probabilities.index,
        )

        return self

    def predict(self, X):
        """Predict the class of every row of X.

        X: feature matrix (sparse or dense) with the same columns used in `fit`.
        Returns a list with one predicted label per row.
        """
        X = self._to_dense(X)

        theta = self.parameters.to_numpy()  # (classes, features)
        log_prior = np.log(self.class_probabilities.to_numpy())

        # Work in log space: multiplying the per-feature probabilities directly
        # would underflow for any realistic vocabulary size.
        scores = (
            log_prior
            + X @ np.log(theta).T
            + (1 - X) @ np.log(1 - theta).T
        )

        delta = pd.DataFrame(scores, columns=self.class_probabilities.index)

        return delta.idxmax(axis=1).to_list()

    def score(self, X, y):
        """Compute accuracy of naive bayes model on (X, y)."""
        y_pred = self.predict(X)

        # Compare as arrays so a plain list of labels is matched element by
        # element rather than as a single list-to-list equality
        matches = np.asarray(y) == np.asarray(y_pred)

        return np.count_nonzero(matches) / len(y_pred)

    def get_params(self, deep=True):
        """Getter for parameters (`deep` is accepted but not used)."""
        return {"alpha": self.alpha, "prior": self.prior}

    def set_params(self, **parameters):
        """Setter for parameters; returns self so calls can be chained."""
        for parameter, value in parameters.items():
            setattr(self, parameter, value)

        return self


# Choosing parameters and preprocessing methods for Multinomial Naive Bayes model

In [None]:
# Define tokenizer classes for lemmatization and stemming

from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 

class LemmaTokenizer_1:
    """Callable tokenizer: lemmatizes every token of a document as a verb."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Split `doc` with `word_tokenize` and return each token's verb lemma."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token, pos="v"))
        return lemmas

class LemmaTokenizer_2:
    """Callable tokenizer: verb-lemmatizes the purely alphabetic tokens of a document."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Split `doc` with `word_tokenize`, drop tokens failing `isalpha()`,
        and return the verb lemma of each remaining token."""
        lemmas = []
        for token in word_tokenize(doc):
            if not token.isalpha():
                continue
            lemmas.append(self.wnl.lemmatize(token, pos="v"))
        return lemmas

class StemTokenizer:
    """Callable tokenizer: Porter-stems the purely alphabetic tokens of a document."""

    def __init__(self):
        # The attribute keeps the name `wnl` although it holds a PorterStemmer
        self.wnl = PorterStemmer()

    def __call__(self, doc):
        """Split `doc` with `word_tokenize`, drop tokens failing `isalpha()`,
        and return the stem of each remaining token."""
        stems = []
        for token in word_tokenize(doc):
            if not token.isalpha():
                continue
            stems.append(self.wnl.stem(token))
        return stems

## No preprocessing

In [None]:
## Test with no preprocessing

t_start = time.time()

vectorizer = CountVectorizer()
naive_bayes_model = NaiveBayes(alpha=0.01, prior="uniform")

# Raw token counts are fed straight into the classifier
pipe = Pipeline([("vect", vectorizer), ("classify", naive_bayes_model)])

# cv is spelled out so the "5-fold" figure printed below does not depend on
# the installed scikit-learn's default number of folds
cross_val_accuracy = model_selection.cross_val_score(pipe, X, y, cv=5, n_jobs=-1)

t_end = time.time()

print(f"The 5-fold cross-validation accuracy is: {np.mean(cross_val_accuracy):.5f}")
print(f"Run time: {t_end-t_start: .3f} seconds")


The 5-fold cross-validation accuracy is: 0.89449
Run time:  119.945 seconds


## Normalization

In [None]:
## Test: normalize

t_start = time.time()

vectorizer = CountVectorizer()
normalizer = Normalizer()
naive_bayes_model = NaiveBayes(alpha=0.01, prior="uniform")

# Token counts -> per-sample normalisation -> classifier
pipe = Pipeline(
    [("vect", vectorizer), ("norm", normalizer), ("classify", naive_bayes_model)]
)

# cv is spelled out so the "5-fold" figure printed below does not depend on
# the installed scikit-learn's default number of folds
cross_val_accuracy = model_selection.cross_val_score(pipe, X, y, cv=5, n_jobs=-1)

t_end = time.time()

print(f"The 5-fold cross-validation accuracy is: {np.mean(cross_val_accuracy):.5f}")
print(f"Run time: {t_end-t_start: .3f} seconds")

The 5-fold cross-validation accuracy is: 0.90563
Run time:  100.661 seconds


## Remove stop words

In [None]:
##Test: remove stop-words

t_start = time.time()

# `all_stop_words` is a frozenset; CountVectorizer documents `stop_words` as
# 'english', a list, or None, so hand it a list
vectorizer = CountVectorizer(stop_words=list(all_stop_words))
normalizer = Normalizer()
naive_bayes_model = NaiveBayes(alpha=0.01, prior="uniform")

# Token counts (stop words removed) -> per-sample normalisation -> classifier
pipe = Pipeline(
    [("vect", vectorizer), ("norm", normalizer), ("classify", naive_bayes_model)]
)

# cv is spelled out so the "5-fold" figure printed below does not depend on
# the installed scikit-learn's default number of folds
cross_val_accuracy = model_selection.cross_val_score(pipe, X, y, cv=5, n_jobs=-1)

t_end = time.time()

print(f"The 5-fold cross-validation accuracy is: {np.mean(cross_val_accuracy):.5f}")
print(f"Run time: {t_end-t_start: .3f} seconds")

The 5-fold cross-validation accuracy is: 0.90658
Run time:  100.708 seconds


## Lemmatization

In [None]:
##Test: lemmatization that works on colab

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


class LemmaTokenizer:
    """Callable tokenizer: verb-lemmatizes the purely alphabetic tokens of a document.

    Defined here but not used by the experiment in this cell.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t, pos="v") for t in word_tokenize(doc) if t.isalpha()]


# NLTK resources used below by word_tokenize, pos_tag and the WordNet lemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from nltk.corpus import wordnet
from nltk.tokenize.treebank import TreebankWordDetokenizer


def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    # The word is tagged on its own, so the tag does not use sentence context
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    # Any other tag falls back to NOUN
    return tag_dict.get(tag, wordnet.NOUN)


lemmatizer = WordNetLemmatizer()
detokenizer = TreebankWordDetokenizer()


def lemmatize_document(doc):
    """Lower-case `doc`, lemmatize each token using its POS tag, and re-join the tokens."""
    lemmas = [
        lemmatizer.lemmatize(w, get_wordnet_pos(w))
        for w in nltk.word_tokenize(doc.lower())
    ]
    return detokenizer.detokenize(lemmas)


# Start timing here so the NLTK downloads above are not counted in the run time
t_start = time.time()

# Build a lemmatized copy of the documents; X itself is left untouched
X1 = X.map(lemmatize_document)
print(X1[0])
# No isalpha() filter is applied in this test (the LemmaTokenizer class above is
# not used), so the printed document still contains non-alphabetic tokens.
# The next test adds an explicit isalpha() filter.

vectorizer = CountVectorizer()
normalizer = Normalizer()
naive_bayes_model = NaiveBayes(alpha=0.01, prior="uniform")

pipe = Pipeline(
    [("vect", vectorizer), ("norm", normalizer), ("classify", naive_bayes_model)]
)

# cv is spelled out so the "5-fold" figure printed below does not depend on
# the installed scikit-learn's default number of folds
cross_val_accuracy = model_selection.cross_val_score(pipe, X1, y, cv=5, n_jobs=-1)

t_end = time.time()

print(f"The 5-fold cross-validation accuracy is: {np.mean(cross_val_accuracy):.5f}")
print(f"Run time: {t_end-t_start: .3f} seconds")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
**the follow be the deal highlight: ** * **price: ** $40b nvidia stock and cash * **accretion: ** immediately accretive to nvidia non-gaap gross margin and eps * **cambridge investment: ** create “ world-class ” ai research and education center for healthcare life science robotics and self-driving car . also build an arm/nvidia-based ai supercomputer for research * **softbank ownership: ** will keep 10% stake in new entity **the follow be the operating highlight: ** * **arm operating structure: ** arm will operate a an nvidia division * **arm locality: ** arm will continue to be 

## Lemmatization and remove non-alphabetic characters

In [None]:
#Tests: lemmatization with isalpha that works on colab

t_start = time.time()

# Uses `lemmatizer`, `get_wordnet_pos` and `TreebankWordDetokenizer` from the
# previous lemmatization cell
detokenizer = TreebankWordDetokenizer()


def lemmatize_alpha_document(doc):
    """Lower-case `doc`, lemmatize each token using its POS tag, keep only the
    alphabetic lemmas, and re-join them."""
    lemmas = [
        lemmatizer.lemmatize(w, get_wordnet_pos(w))
        for w in nltk.word_tokenize(doc.lower())
    ]
    alphabetic = [w for w in lemmas if w.isalpha()]  # Remove non-alphabetic words
    return detokenizer.detokenize(alphabetic)


# Build a lemmatized copy of the documents; X itself is left untouched
X1 = X.map(lemmatize_alpha_document)

vectorizer = CountVectorizer()
normalizer = Normalizer()
naive_bayes_model = NaiveBayes(alpha=0.01, prior="uniform")

pipe = Pipeline(
    [("vect", vectorizer), ("norm", normalizer), ("classify", naive_bayes_model)]
)

# cv is spelled out so the "5-fold" figure printed below does not depend on
# the installed scikit-learn's default number of folds
cross_val_accuracy = model_selection.cross_val_score(pipe, X1, y, cv=5, n_jobs=-1)

t_end = time.time()

print(f"The 5-fold cross-validation accuracy is: {np.mean(cross_val_accuracy):.5f}")
print(f"Run time: {t_end-t_start: .3f} seconds")

The 5-fold cross-validation accuracy is: 0.89864
Run time:  242.344 seconds


## N-gram ranges (1,2), (1,3) and (1,4) with max_features=40000

In [None]:
##Tests: ngram (1,2)

t_start = time.time()

# Unigrams and bigrams, vocabulary capped at 40000 features
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=40000)
normalizer = Normalizer()
naive_bayes_model = NaiveBayes(alpha=0.01, prior="uniform")

pipe = Pipeline(
    [("vect", vectorizer), ("norm", normalizer), ("classify", naive_bayes_model)]
)

# cv is spelled out so the "5-fold" figure printed below does not depend on
# the installed scikit-learn's default number of folds
cross_val_accuracy = model_selection.cross_val_score(pipe, X, y, cv=5, n_jobs=-1)

t_end = time.time()

print(f"The 5-fold cross-validation accuracy is: {np.mean(cross_val_accuracy):.5f}")
print(f"Run time: {t_end-t_start: .3f} seconds")



The 5-fold cross-validation accuracy is: 0.88836
Run time:  127.162 seconds


In [None]:
##Tests: ngram (1,3)

t_start = time.time()

# Unigrams through trigrams, vocabulary capped at 40000 features
vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=40000)
normalizer = Normalizer()
naive_bayes_model = NaiveBayes(alpha=0.01, prior="uniform")

pipe = Pipeline(
    [("vect", vectorizer), ("norm", normalizer), ("classify", naive_bayes_model)]
)

# cv is spelled out so the "5-fold" figure printed below does not depend on
# the installed scikit-learn's default number of folds
cross_val_accuracy = model_selection.cross_val_score(pipe, X, y, cv=5, n_jobs=-1)

t_end = time.time()

print(f"The 5-fold cross-validation accuracy is: {np.mean(cross_val_accuracy):.5f}")
print(f"Run time: {t_end-t_start: .3f} seconds")

The 5-fold cross-validation accuracy is: 0.87265
Run time:  154.061 seconds


In [None]:
##Tests: ngram (1,4)

t_start = time.time()

# Unigrams through 4-grams, vocabulary capped at 40000 features
vectorizer = CountVectorizer(ngram_range=(1, 4), max_features=40000)
normalizer = Normalizer()
naive_bayes_model = NaiveBayes(alpha=0.01, prior="uniform")

pipe = Pipeline(
    [("vect", vectorizer), ("norm", normalizer), ("classify", naive_bayes_model)]
)

# cv is spelled out so the "5-fold" figure printed below does not depend on
# the installed scikit-learn's default number of folds
cross_val_accuracy = model_selection.cross_val_score(pipe, X, y, cv=5, n_jobs=-1)

t_end = time.time()

print(f"The 5-fold cross-validation accuracy is: {np.mean(cross_val_accuracy):.5f}")
print(f"Run time: {t_end-t_start: .3f} seconds")

The 5-fold cross-validation accuracy is: 0.86539
Run time:  166.375 seconds
