In [1]:
import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
df = pd.read_csv ("train.csv")
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         60000 non-null  int64 
 1   comment    60000 non-null  object
 2   subreddit  60000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


(60000, 3)

In [3]:
df.head()

Unnamed: 0,id,comment,subreddit
0,0,I think prestige points should not expire ever...,leagueoflegends
1,1,Whats going to happen with them if they will b...,europe
2,2,Anecdotal evidence is anecdotal. Clearly by “e...,gameofthrones
3,3,"Look dude, with all due respect, your music is...",Music
4,4,Hope he gets the doomhammer back!,wow


# Cleaning

In [4]:
token = RegexpTokenizer('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
cv = CountVectorizer(lowercase=True, stop_words='english', ngram_range = (1,1), tokenizer = token.tokenize)
text_counts = cv.fit_transform(df['comment'])

In [5]:
text_counts.shape

(60000, 3267)

# Model

In [6]:
X_train, X_test, y_train, y_test = train_test_split(text_counts, df['subreddit'], test_size=0.2, random_state=1)

In [7]:
class NaiveBayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)
        
        # init mean, var, priors
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._vae = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)
        
        for c in self._classes:
            X_c = X[c==y]
            self._mean[c,:]=X_c.mean(axis=0)
            self._var[c,:]=X_c.mean(axis=0)
            self._priors[c]=X_c.shape[0] / float(n_samples) # The frequently of how class c is happening
            
            
            
            
    def predict(self, X): # For multiple samples
        y_pred = [self._predict(x) for x in X]
        return y_pred
        
    def _predict(self, x):  # For single samples
        posteriors = []
        
        for idx, c in enumerate(self.classes):
            prior = np.log(self.priors[idx])
            class_conditional = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + class_conditional
            posteriors.append(posterior)
            
            
        return self._classes[np.argmax(posteriors)]
            
# Probability density function     
    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
# Exponential function
        numerator = np.exp(- (x-mean)**2 / (2 * var))  
        denominator = np.sqr(2* np.pi * var)
        return numerator / denominator
    
        
        

# Prediction

In [8]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [9]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)
print("Naive Bayes classification accuracy", accuracy(y_test, predictions))

Naive Bayes classification accuracy 0.04708333333333333


# Test Dataset

In [10]:
df_test = pd.read_csv ("test.csv")
df_test.info()
df_test.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       20000 non-null  int64 
 1   comment  20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 234.4+ KB


(20000, 2)

In [11]:
token_test = RegexpTokenizer('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
cv_test = CountVectorizer(lowercase=True, stop_words='english', ngram_range = (1,1), tokenizer = token_test.tokenize)
text_counts_test = cv_test.fit_transform(df_test['comment'])

In [12]:
text_counts_test.shape

(20000, 956)

In [13]:
# X_train_test, X_test_test, y_train_test, y_test_test = train_test_split(text_counts_test, df_test['subreddit'], test_size=0.2, random_state=1)

In [14]:
# predicted_test = clf.predict(X_test_test)
# a = df['subreddit'][:20000]

In [15]:
# predicted = clf.predict(text_counts_test)

In [16]:
# clf.fit(text_counts_test)
# predicted_test = clf.predict(X_test)