In [1]:
import os
import nltk
import nltk.corpus
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist
from nltk.util import bigrams, trigrams, ngrams
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
df = pd.read_csv ("train.csv")
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         60000 non-null  int64 
 1   comment    60000 non-null  object
 2   subreddit  60000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


(60000, 3)

In [17]:
df.head

<bound method NDFrame.head of           id                                            comment  \
0          0  I think prestige points should not expire ever...   
1          1  Whats going to happen with them if they will b...   
2          2  Anecdotal evidence is anecdotal. Clearly by “e...   
3          3  Look dude, with all due respect, your music is...   
4          4                  Hope he gets the doomhammer back!   
...      ...                                                ...   
59995  59995                       Yo this guy Luka pretty good   
59996  59996                      Unplug these things right now   
59997  59997  Well said. Do you think they’ll resonate with ...   
59998  59998  So we can impeach a president for lying? Pleas...   
59999  59999  Too broad dude, get ready for the shit my pant...   

             subreddit  
0      leagueoflegends  
1               europe  
2        gameofthrones  
3                Music  
4                  wow  
...            

In [3]:
def clean_text(df):
    all_reviews = list()
    lines = df["comment"].values.tolist()
    for text in lines:
        text = text.lower()
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub('', text)
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        tokens = word_tokenize(text)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
        stop_words = set(stopwords.words("english"))
        stop_words.discard("not")
        PS = PorterStemmer()
#         words = [w for w in words if not w in stop_words]
        words = [PS.stem(w) for w in words if not w in stop_words]
        words = ' '.join(words)
        all_reviews.append(words)
    return all_reviews

all_reviews = clean_text(df)
all_reviews[0:2]

['think prestig point not expir ever skin buy avail set durat exempl year releas anoth skin vault old one make also limitededit skin also pleas love god nt rereleas skin need grind prestig shop would suck everyon grind',
 'what go happen refus asilum appeal']

In [6]:
c = all_comments
filtered_sentence = [] 
freq_count_limit = FreqDist()
lemmatizer=WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

for i in c:
    comment_tokens = word_tokenize(i)
    
    for words in comment_tokens:
        if words not in stop_words: 
            filtered_sentence.append(words) 
        
            limit_words = lemmatizer.lemmatize(words)
#     for word in root_words:
            freq_count_limit[limit_words.lower()]+=1
freq_count_limit

FreqDist({'nt': 12955, 'like': 9662, 'get': 7121, 'one': 5971, 'peopl': 5884, 'would': 5745, 'think': 4914, 'go': 4786, 'time': 4431, 'make': 4394, ...})

In [46]:
token = RegexpTokenizer(r'[a-z]+')
cv = CountVectorizer(ngram_range = (1,1), tokenizer = token.tokenize)
text_counts2 = cv.fit_transform(all_reviews)

In [47]:
text_counts2.shape

(60000, 41857)

In [48]:
X_train, X_test, y_train, y_test = train_test_split(
    text_counts2, df['subreddit'], test_size=0.2, random_state=1)

In [52]:
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.47358333333333336


# TEST Dataset

In [20]:
df_test = pd.read_csv ("test.csv")
df_test.info()
df_test.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       20000 non-null  int64 
 1   comment  20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 234.4+ KB


(20000, 2)

In [21]:
df_test.head

<bound method NDFrame.head of           id                                            comment
0          0                          Holy shit a shot counter.
1          1  It doesn't matter that it isn't hard to rememb...
2          2             I find it funny that this is downvoted
3          3  They are really getting ridicoulous with all t...
4          4                            He's Eden's best friend
...      ...                                                ...
19995  19995  These officials are almost as incompetent as o...
19996  19996  honestly the Patriot act really fucked our com...
19997  19997  My friend is now looking online for a thanos c...
19998  19998  I really liked Thor Ragnarok and both Guardian...
19999  19999                      last info changes everything.

[20000 rows x 2 columns]>

In [22]:
all_reviews_test = clean_text(df_test)

In [26]:
c = all_reviews_test
filtered_sentence = [] 
freq_count_limit = FreqDist()
lemmatizer=WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

for i in c:
    comment_tokens = word_tokenize(i)
    for words in comment_tokens:
        if words not in stop_words: 
            filtered_sentence.append(words) 
        
            limit_words = lemmatizer.lemmatize(words)
#     for word in root_words:
            freq_count_limit[limit_words.lower()]+=1
freq_count_limit

FreqDist({'nt': 4269, 'like': 3125, 'get': 2306, 'one': 2024, 'peopl': 1982, 'would': 1928, 'time': 1535, 'think': 1500, 'go': 1470, 'make': 1401, ...})

In [35]:
token_test = RegexpTokenizer(r'[a-z]+')
cv_test = CountVectorizer(ngram_range = (1,1), tokenizer = token.tokenize)
text_counts_test = cv.fit_transform(all_reviews_test)

In [36]:
X_train_test, X_test_test, y_train_test, y_test_test = train_test_split(
    text_counts_test, df_test['id'], test_size=0.2, random_state=1)

In [37]:
clf.predict(X_test_test)
predicted_test= clf.predict(X_test_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test_test, predicted_test))

ValueError: dimension mismatch