In [1]:
# import ml packages

import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.model_selection as model_selection
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer

from sklearn.feature_extraction.text import (CountVectorizer, HashingVectorizer, TfidfVectorizer)
from imblearn.over_sampling import SMOTE

# from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC


from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [2]:
# import dataset
train_df = pd.read_csv("../AskReddit Dataset/AskReddit Dataset/train.csv")
test_df = pd.read_csv("../AskReddit Dataset/AskReddit Dataset/test.csv")
train_df.head()

Unnamed: 0,qid,question_text,target
0,a3dee568776c08512c89,What is the role of Lua in Civ4?,0
1,bdb84f519e7b46e7b7bb,What are important chapters in Kannada for 10 ...,0
2,29c88db470e2eb5c97ad,Do musicians get royalties from YouTube?,0
3,3387d99bf2c3227ae8f1,What is the difference between Scaling Social ...,0
4,e79fa5038f765d0f2e7e,Why do elevators go super slow right before th...,0


In [3]:
# see value count order of target
train_df["target"].value_counts()

0    612656
1     40405
Name: target, dtype: int64

In [5]:
# create a preprocessing class
class Preprocessor:
    def __init__(self, df) -> None:
        self.df = df

    # convert all charecters to lower case
    def convertToLower(self):
        self.df["question_text"] = self.df["question_text"].apply(lambda x: x.lower())
        return self.df

    # remove stop words
    def removeStopWords(self):
        stop = stopwords.words("english")
        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: " ".join([word for word in x.split() if word not in stop])
        )
        return self.df

    # remove punctuation
    def removePunctuation(self):
        self.df["question_text"] = self.df["question_text"].str.replace("[^\w\s]", "")
        return self.df

    # remove numbers
    def removeNumbers(self):
        self.df["question_text"] = self.df["question_text"].str.replace("[0-9]", "")
        return self.df

    # remove whitespaces
    def removeWhitespaces(self):
        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: " ".join(x.split())
        )
        return self.df

    # remove urls
    def removeURLs(self):
        self.df["question_text"] = self.df["question_text"].str.replace(
            "https?://\S+|www\.\S+", ""
        )
        return self.df

    # snowball stemmer algorithm
    def snowballstemmer(self):
        stemmer = SnowballStemmer()

        def stem_words(text):
            return " ".join([stemmer.stem(word) for word in text.split()])

        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: stem_words(x)
        )
        return self.df

    # port stemmer algorithm
    def porterstemmer(self):
        stemmer = PorterStemmer()

        def stem_words(text):
            return " ".join([stemmer.stem(word) for word in text.split()])

        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: stem_words(x)
        )
        return self.df

    # lemmatizing
    def lemmatize(self):
        from nltk.stem import WordNetLemmatizer

        lemmatizer = WordNetLemmatizer()

        def lemmatize_words(text):
            return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: lemmatize_words(x)
        )
        return self.df

    # remove id and index columns
    def removeUnwantedCols(self, col):
        print(self.df.shape)
        self.df = self.df.drop(col, axis=1)
        return self.df

    # word tokenization using nltk
    def wordTokenization(self):
        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: nltk.word_tokenize(x)
        )
        return self.df

    def preprocess(self):
        self.df = self.convertToLower()
        # self.df = self.removeStopWords()
        self.df = self.removePunctuation()
        # self.df = self.removeNumbers()
        # self.df = self.removeURLs()
        # self.df = self.removeWhitespaces()
        # self.df = self.snowballstemmer()
        # self.df = self.porterstemmer()
        # self.df = self.lemmatize()
        self.df = self.wordTokenization()
        self.df = self.removeUnwantedCols(["qid"])
        return self.df

In [6]:
preproccesor = Preprocessor(train_df)
preprocessed_df = preproccesor.preprocess()
preprocessed_df.head()

  self.df["question_text"] = self.df["question_text"].str.replace("[^\w\s]", "")


(653061, 3)


Unnamed: 0,question_text,target
0,"[what, is, the, role, of, lua, in, civ4]",0
1,"[what, are, important, chapters, in, kannada, ...",0
2,"[do, musicians, get, royalties, from, youtube]",0
3,"[what, is, the, difference, between, scaling, ...",0
4,"[why, do, elevators, go, super, slow, right, b...",0


In [7]:
testPreprocessor = Preprocessor(test_df)
preprocessed_test_df = testPreprocessor.preprocess()
preprocessed_test_df.head()

  self.df["question_text"] = self.df["question_text"].str.replace("[^\w\s]", "")


(653061, 2)


Unnamed: 0,question_text
0,"[why, is, my, fish, tank, so, cloudy]"
1,"[are, aap, supportersleaders, hypocrites]"
2,"[can, you, still, get, a, ticket, if, you, shu..."
3,"[why, should, any, liberal, or, caring, person..."
4,"[how, can, i, know, who, got, into, my, pc, us..."


In [9]:
from gensim.models import word2vec
feature_size = 256
context_size = 5
min_word = 1

word_vec= word2vec.Word2Vec(preprocessed_df['question_text'], vector_size=feature_size, window=context_size, min_count=min_word, epochs=50, seed=42)

In [10]:
word_vec_unpack = [(word, idx) for word, idx in word_vec.wv.key_to_index.items()]

tokens, indexes = zip(*word_vec_unpack)

word_vec_df = pd.DataFrame(word_vec.wv.vectors[indexes, :], index=tokens)

word_vec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
the,0.157692,-1.962667,-0.021549,-1.304941,1.056042,-1.409629,1.633958,-0.239701,0.867131,1.081947,...,0.443057,1.110562,0.749143,0.64544,1.455143,-2.426267,-0.507591,0.627361,-0.553571,1.617401
what,-0.950905,-2.366135,-2.126145,0.122318,0.181524,-0.63559,2.056228,-0.545641,0.922501,1.454594,...,0.557625,1.733625,0.204131,0.319043,1.435537,-3.006604,-2.469166,0.142097,-0.286942,-0.000472
is,1.101763,-0.75533,-1.075515,0.272517,0.150855,-1.116584,-1.450344,1.098596,-2.823574,-0.606317,...,-2.363034,0.068018,0.527653,0.706851,2.12226,-1.562261,1.167268,-0.187608,-0.840015,-0.606366
a,-2.574324,-2.177396,-0.519794,0.873067,0.255089,0.362654,1.874123,-0.172567,0.138581,0.649002,...,0.283732,1.828638,0.355169,2.224137,-0.019704,-1.720095,-0.491752,-0.623703,0.114877,0.037144
to,-2.112787,-0.805857,-0.152196,-0.386873,1.179263,0.769944,0.988461,0.70504,-1.08002,0.141607,...,0.764955,-0.625984,3.905757,0.82879,0.657934,0.955136,1.083874,-0.787815,0.980592,1.571381


In [12]:
tokenized_array = np.array(preprocessed_df['question_text'])

model_array = np.array([word_vec_df.loc[doc].mean(axis=0) for doc in tokenized_array])

In [13]:
model_df = pd.DataFrame(model_array)
model_df["target"] = preprocessed_df["target"]

model_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,target
0,-0.046192,-0.871497,-0.419549,0.051906,0.202806,-0.606084,0.209398,0.071741,-0.685661,0.107178,...,0.569011,-0.120733,0.453698,0.941927,-1.131483,0.181631,0.11558,-0.154066,0.445269,0
1,-0.54081,-0.317784,-0.07423,0.189299,0.14913,-0.549548,0.549128,-0.092174,-0.336563,0.383196,...,0.214367,-0.543109,0.588254,0.154767,-0.51936,-0.71166,0.557392,0.435211,-0.386071,0
2,0.020519,-0.015671,-0.977683,-0.270581,0.91102,-0.025638,-0.023462,-0.125082,0.529428,0.042831,...,0.695944,-0.397305,-0.82924,0.231845,0.14183,-0.037864,0.282534,-0.18462,-0.592754,0
3,-0.0262,-0.217017,-0.387425,-0.095369,-0.404747,0.234138,0.114563,-0.325986,-0.273905,-0.341776,...,0.536481,-0.146647,0.674276,0.196746,-1.400912,-0.4784,-0.205022,-0.207162,0.522226,0
4,-0.688531,-0.207328,-1.43181,0.250045,-0.192462,0.053853,0.323942,-0.575614,0.449814,0.424915,...,0.452518,0.409565,-0.202967,0.703024,0.25234,-0.531481,0.274658,-0.384855,0.521392,0


In [14]:
y = model_df["target"]
X = model_df.drop(["target"], axis=1)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [22]:
X_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
631016,-0.770446,-0.005377,0.021556,-0.448783,-0.643888,-0.400822,0.187452,-0.155320,-0.161874,-0.046565,...,-1.043934,-0.551572,0.966854,-0.549450,0.039330,-0.641822,-0.431646,-0.466421,0.088508,0.680383
34150,0.109974,-1.061241,-0.366217,0.335971,0.501445,0.184031,-0.191152,0.425920,0.007570,-0.007113,...,-0.294545,1.052991,0.172658,0.261178,0.637437,-1.659547,-0.579092,0.054213,-0.285278,0.097135
213502,0.032301,-0.620044,-0.483492,0.136974,-0.117304,-0.220404,0.369797,-0.102097,-0.545628,0.184739,...,-0.432680,0.469448,0.624652,-0.255495,-0.272271,-1.260071,-0.109352,-0.140307,-0.486599,0.628950
96871,0.009917,-0.453236,-0.382034,0.411499,0.095838,0.207947,0.613289,-0.203824,0.438343,0.785015,...,0.085962,0.274461,0.590008,-0.164559,-0.649510,-0.053432,0.290500,-0.539831,-0.450243,0.464656
225411,0.169577,-0.299672,-0.398752,-0.113930,0.273852,0.407381,0.451407,-0.440717,0.185153,0.370212,...,0.178180,0.096549,0.754463,0.181592,0.159482,-0.016935,-1.160714,-0.146408,0.324497,0.039184
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
292379,-0.027913,-0.311410,-0.480998,0.234448,0.746714,-0.095122,1.005669,0.281209,0.738140,0.460017,...,0.751699,-0.056500,0.830714,0.009903,-0.141313,0.041298,-0.305492,-0.291452,0.370704,0.289716
422345,-0.341777,-0.504992,-0.792512,1.702541,-0.135418,0.688778,0.622097,-0.061926,0.344063,0.783900,...,0.322626,0.465585,0.752223,-0.258982,0.443352,0.277452,-0.683634,0.008056,-0.234258,0.098046
582487,-0.983406,-0.589381,-1.216957,0.423686,0.911840,-0.518576,1.078491,0.144351,1.373897,0.533781,...,0.623604,-0.246031,1.252538,0.242172,-1.768718,1.016639,-0.595163,-1.390513,0.596418,0.005012
501996,0.145803,-0.167092,0.104261,0.009808,-0.245619,-0.445027,-0.419308,0.297254,0.211868,0.208344,...,-0.855199,-0.060997,-0.103108,0.209435,0.044648,-1.325326,-0.226904,-0.142551,0.276801,-0.181245


In [20]:
log = LogisticRegression(solver='lbfgs', class_weight={1:4})
log.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(class_weight={1: 4})

In [30]:
y_hat = log.predict(X_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [32]:
X_test[np.isnan(X_test)] = np.median(X_test[~np.isnan(X_test)])
np.isnan(X_test).any()

0      True
1      True
2      True
3      True
4      True
       ... 
251    True
252    True
253    True
254    True
255    True
Length: 256, dtype: bool

In [33]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
r = RandomForestClassifier()
r.fit(X_train,y_train)
pred=r.predict(X_test)
pred = pred.astype(int)
pred

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').