In [1]:
import os
import nltk
import nltk.corpus
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.naive_bayes import GaussianNB
import string
from nltk.tokenize import word_tokenize 
from nltk.probability import FreqDist
from nltk.util import bigrams, trigrams, ngrams
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction import FeatureHasher

In [2]:
df = pd.read_csv ("train.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         60000 non-null  int64 
 1   comment    60000 non-null  object
 2   subreddit  60000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [3]:
df.head()

Unnamed: 0,id,comment,subreddit
0,0,I think prestige points should not expire ever...,leagueoflegends
1,1,Whats going to happen with them if they will b...,europe
2,2,Anecdotal evidence is anecdotal. Clearly by “e...,gameofthrones
3,3,"Look dude, with all due respect, your music is...",Music
4,4,Hope he gets the doomhammer back!,wow


In [4]:
def clean_text(df):
    
    all_comments = list()
    lines = df["comment"].values.tolist()
    for text in lines:
        text = text.lower()
        
        pattern = re.compile('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
        text = pattern.sub("", text)
        
        emoji = re.compile("["
                           u"\U0001F600-\U0001FFFF"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        text = emoji.sub(r'', text)
        
        text = re.sub(r"i'm", "i am", text)
        text = re.sub(r"he's", "he is", text)
        text = re.sub(r"she's", "she is", text)
        text = re.sub(r"that's", "that is", text)        
        text = re.sub(r"what's", "what is", text)
        text = re.sub(r"where's", "where is", text) 
        text = re.sub(r"\'ll", " will", text)  
        text = re.sub(r"\'ve", " have", text)  
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"won't", "will not", text)
        text = re.sub(r"don't", "do not", text)
        text = re.sub(r"did't", "did not", text)
        text = re.sub(r"can't", "can not", text)
        text = re.sub(r"it's", "it is", text)
        text = re.sub(r"couldn't", "could not", text)
        text = re.sub(r"have't", "have not", text)
        text = re.sub(r"nt", "not", text)
    
        
        text = re.sub(r"[,.\"!@#$%^&*(){}?/;`~:<>+=-]", "", text)
        
        tokens = word_tokenize(text)
        
        table = str.maketrans('', '', string.punctuation)
        
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]
        
        stop_words = set(stopwords.words("english"))
        stop_words.discard("not")
        
        words = [w for w in words if not w in stop_words]
        words = ' '.join(words)
        
        all_comments.append(words)
    return all_comments

all_comments = clean_text(df)
all_comments[0:2]

['think prestige poinots not expire ever skins buy available set duration exemple year release another skin vault old one making also limitededition skin also please love god not rerelease skins need grind prestige shop would suck everyone grinded',
 'whats going happen refused asilum appeal']

In [5]:
vect = TfidfVectorizer(max_features=30000, strip_accents='ascii')
vect.fit(all_comments)
vocabulaire = vect.get_feature_names()

In [6]:
bag_of_words = vect.transform(all_comments)
bag_of_words.shape

(60000, 30000)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(bag_of_words, df['subreddit'], test_size = 0.1, random_state = 42)

In [8]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
predicted= clf.predict(X_test)
print("MultinomialNB Accuracy:",metrics.accuracy_score(y_test, predicted))

MultinomialNB Accuracy: 0.5125


# TEST

In [9]:
df_test = pd.read_csv ("test.csv")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       20000 non-null  int64 
 1   comment  20000 non-null  object
dtypes: int64(1), object(1)
memory usage: 234.4+ KB


In [10]:
df_test.head()

Unnamed: 0,id,comment
0,0,Holy shit a shot counter.
1,1,It doesn't matter that it isn't hard to rememb...
2,2,I find it funny that this is downvoted
3,3,They are really getting ridicoulous with all t...
4,4,He's Eden's best friend


In [11]:
cleaned_test = clean_text(df_test)
cleaned_test[0:2]

['holy shit shot counoter',
 'nt matter nt hard remember fractions nt complicated right poinot everyone trying make fractions unnecessarily harder using metric system metric wrenches drill bits stuff go millimeters mm bigger size previous one need able counot undeniably easier fractions']

In [12]:
cleaned_test_vect = vect.transform(cleaned_test)

In [13]:
y_pred = clf.predict(cleaned_test_vect)

# Submission

In [14]:
submission = zip(list(range(len(y_pred))), y_pred)
test_df = pd.DataFrame(submission, columns=['Id','Category'])
test_df.to_csv('submission.csv', index = False, header=True)

# Random Forest

In [15]:
# X, y = make_classification(n_samples=1000, n_features=4,n_informative=2, n_redundant=0,random_state=0, shuffle=False)
clf = RandomForestClassifier(max_depth=1, random_state=0)
clf.fit(bag_of_words, df['subreddit'])

a = clf.predict(cleaned_test_vect)

In [16]:
clf.score(bag_of_words, df['subreddit'])

0.15006666666666665

# Logistic Regression

In [23]:
clf_lr = LogisticRegression(max_iter=100)
clf_lr.fit(bag_of_words, df['subreddit'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [24]:
b = clf_lr.predict(X_test)
clf_lr.score(bag_of_words, df['subreddit'])

0.6965666666666667