In [1]:
import os
import nltk
import nltk.corpus
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn import datasets
import string
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist
from nltk.util import bigrams, trigrams, ngrams
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


In [2]:
df = pd.read_csv ("train.csv")
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         60000 non-null  int64 
 1   comment    60000 non-null  object
 2   subreddit  60000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


(60000, 3)

In [3]:
# Looking at the text
df.head(10)

Unnamed: 0,id,comment,subreddit
0,0,I think prestige points should not expire ever...,leagueoflegends
1,1,Whats going to happen with them if they will b...,europe
2,2,Anecdotal evidence is anecdotal. Clearly by “e...,gameofthrones
3,3,"Look dude, with all due respect, your music is...",Music
4,4,Hope he gets the doomhammer back!,wow
5,5,Trading for coaches has happened before,nfl
6,6,"Considering what the kid has already seen, did...",movies
7,7,Nah clearly it's Tom Bombadil,movies
8,8,Time to go play some Elite Dangerous in VR I t...,Music
9,9,https://i.imgur.com/DUdy0KL.jpg,funny


In [4]:
def clean_text(df):
    all_reviews = list()
    lines = df["comment"].values.tolist()
    for text in lines:
        text = text.lower()
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
        stop_words = set(stopwords.words("english"))
        stop_words.discard("not")
        PS = PorterStemmer()
#         words = [w for w in words if not w in stop_words]
        words = [PS.stem(w) for w in words if not w in stop_words]
        words = ' '.join(words)
        all_reviews.append(words)
    return all_reviews

all_reviews = clean_text(df)

In [5]:
token = RegexpTokenizer(r'[a-z]+')
cv = TfidfVectorizer(ngram_range = (1,1), tokenizer = token.tokenize)
text_counts_m = cv.fit(all_reviews)
text_counts = text_counts_m.transform(all_reviews)



In [6]:
text_counts.shape

(60000, 41857)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(text_counts, df['subreddit'], test_size=0.2, random_state=42)

# Naive Bayes from scratch first try 1:

In [49]:
# Naive Bayes1:
class NaiveBayes:

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors =  np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y==c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + posterior
            posteriors.append(posterior)
            
        # return class with highest posterior probability
        return self._classes[np.argmax(posteriors)]
            

    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(- (x-mean)**2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

In [50]:
# # Naive Bayes1:
X = df['comment']
y = df['subreddit']
nb = NaiveBayes()
nb.fit(X, y)
predictions = nb.predict(X_test)


def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


print("Naive Bayes classification accuracy", accuracy(y_test, predictions))

ValueError: not enough values to unpack (expected 2, got 1)

# Naive Bayes from scratch second try 2:

In [39]:

# X = df['comment']
# y = df['subreddit']
# class BernoulliNB(object):
#     def __init__(self, alpha=1.0):
#         self.alpha = alpha

#     def fit(self, X, y):
#         count_sample = X.shape[0]
#         # group by class
#         separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]
#         # class prior
#         self.class_log_prior_ = [np.log(len(i) / count_sample) for i in separated]
#         # count of each word
#         count = np.array([np.array(i).sum(axis=0) for i in separated]) + self.alpha

#         smoothing = 2 * self.alpha
#         # number of documents in each class + smoothing
#         n_doc = np.array([len(i) + smoothing for i in separated])
#         print(n_doc)

In [40]:
# # Naive Bayes2:
# nb = BernoulliNB(alpha=1).fit(text_counts, df['subreddit'])

# Naive Bayes sklearn

In [41]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.5


# Test

In [31]:
df_test = pd.read_csv ("test.csv")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       20000 non-null  int64 
 1   comment  20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 234.4+ KB


In [32]:
cleaned_test = clean_text(df_test)

In [33]:
cleaned_test_vect = text_counts_m.transform(cleaned_test)

In [34]:
y_pred = clf.predict(cleaned_test_vect)

# Submission

In [35]:
submission = zip(list(range(len(y_pred))), y_pred)
test_df = pd.DataFrame(submission, columns=['Id','Category'])
test_df.to_csv('submission.csv', index = False, header=True)