In [15]:
# import ml packages

import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.model_selection as model_selection
from sklearn.metrics import confusion_matrix


from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer

from sklearn.feature_extraction.text import (CountVectorizer, HashingVectorizer, TfidfVectorizer)
from imblearn.over_sampling import SMOTE

# from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, TruncatedSVD


from hyperopt import hp, tpe, fmin, Trials, STATUS_OK
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import scale, normalize
from sklearn.model_selection import cross_val_score

In [2]:
# import dataset
train_df = pd.read_csv("../AskReddit Dataset/AskReddit Dataset/train.csv")
test_df = pd.read_csv("../AskReddit Dataset/AskReddit Dataset/test.csv")
train_df.head()

Unnamed: 0,qid,question_text,target
0,a3dee568776c08512c89,What is the role of Lua in Civ4?,0
1,bdb84f519e7b46e7b7bb,What are important chapters in Kannada for 10 ...,0
2,29c88db470e2eb5c97ad,Do musicians get royalties from YouTube?,0
3,3387d99bf2c3227ae8f1,What is the difference between Scaling Social ...,0
4,e79fa5038f765d0f2e7e,Why do elevators go super slow right before th...,0


In [3]:
# choose elements from df where target = 1
pd.set_option("display.max_colwidth", 100)
df_1 = train_df[train_df["target"] == 1]
df_1["question_text"]

16                                                     What stupid things do Indians do when in your country?
31                             Can I sue my parents for giving birth to me when I did not want them to do so?
32                          What are your views about sexual relationship between a widow mother and her son?
33        You became an atheist, and after 2 years you fall and break your back. You are left paralyzed fr...
90                                    Why aren't we protesting for government control instead of gun control?
                                                         ...                                                 
652967              What is a liberal's understanding of the difference between pollution and climate change?
653021    Do unattractive or average-looking men ever get a girlfriend who actually loves them or do they ...
653029                                                                   How can I grab my aunties boobs! :p?
653034    

In [4]:
# see value count order of target
train_df["target"].value_counts()

0    612656
1     40405
Name: target, dtype: int64

In [5]:
# percentage of troll questions in the dataset
train_df["target"].value_counts(normalize=True)

0    0.93813
1    0.06187
Name: target, dtype: float64

In [8]:
# create a preprocessing class
class Preprocessor:
    def __init__(self, df) -> None:
        self.df = df

    # convert all charecters to lower case
    def convertToLower(self):
        self.df["question_text"] = self.df["question_text"].apply(lambda x: x.lower())
        return self.df

    # remove stop words
    def removeStopWords(self):
        stop = stopwords.words("english")
        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: " ".join([word for word in x.split() if word not in stop])
        )
        return self.df

    # remove punctuation
    def removePunctuation(self):
        self.df["question_text"] = self.df["question_text"].str.replace("[^\w\s]", "")
        return self.df

    # remove numbers
    def removeNumbers(self):
        self.df["question_text"] = self.df["question_text"].str.replace("[0-9]", "")
        return self.df

    # remove whitespaces
    def removeWhitespaces(self):
        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: " ".join(x.split())
        )
        return self.df

    # remove urls
    def removeURLs(self):
        self.df["question_text"] = self.df["question_text"].str.replace(
            "https?://\S+|www\.\S+", ""
        )
        return self.df

    # snowball stemmer algorithm
    def snowballstemmer(self):
        stemmer = SnowballStemmer()

        def stem_words(text):
            return " ".join([stemmer.stem(word) for word in text.split()])

        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: stem_words(x)
        )
        return self.df

    # port stemmer algorithm
    def porterstemmer(self):
        stemmer = PorterStemmer()

        def stem_words(text):
            return " ".join([stemmer.stem(word) for word in text.split()])

        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: stem_words(x)
        )
        return self.df

    # lemmatizing
    def lemmatize(self):
        from nltk.stem import WordNetLemmatizer

        lemmatizer = WordNetLemmatizer()

        def lemmatize_words(text):
            return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: lemmatize_words(x)
        )
        return self.df

    # remove id and index columns
    def removeUnwantedCols(self, col):
        print(self.df.shape)
        self.df = self.df.drop(col, axis=1)
        return self.df

    # word tokenization using nltk
    def wordTokenization(self):
        self.df["question_text"] = self.df["question_text"].apply(
            lambda x: nltk.word_tokenize(x)
        )
        return self.df
        

    def preprocess(self):
        self.df = self.convertToLower()
        # self.df = self.removeStopWords()
        # self.df = self.removePunctuation()
        # self.df = self.removeNumbers()
        # self.df = self.removeURLs()
        # self.df = self.removeWhitespaces()
        # self.df = self.snowballstemmer()
        # self.df = self.porterstemmer()
        # self.df = self.lemmatize()
        # self.df = self.wordTokenization()
        self.df = self.removeUnwantedCols(["qid"])
        return self.df

In [9]:
preproccesor = Preprocessor(train_df)
preprocessed_df = preproccesor.preprocess()
preprocessed_df.head()

(653061, 3)


Unnamed: 0,question_text,target
0,what is the role of lua in civ4?,0
1,what are important chapters in kannada for 10 icse 2018?,0
2,do musicians get royalties from youtube?,0
3,what is the difference between scaling social enterprises and social franchising?,0
4,why do elevators go super slow right before the doors open?,0


In [10]:
# get shape of preprocessed_df
preprocessed_df.shape

(653061, 2)

In [12]:
# create a get train and test data class
from nltk.tokenize import RegexpTokenizer

class TrainTestData:
    def __init__(self, trainDf, testDf) -> None:
        self.trainDf = trainDf
        self.testDf = testDf

    def doSmote(self):
        sm = SMOTE()
        self.X, self.Y = sm.fit_resample(self.X, self.Y)
        return self.trainData, self.testData

    def doDecomposition(self):
        lsa = TruncatedSVD(n_components=2)
        lsa.fit(self.X)
        self.trainData = lsa.transform(self.X)
        self.testData = lsa.transform(self.testData)
        

    def get_X(self, minDocumentCount):

        # concatinate trainDf and testDf
        # self.resampling()
        self.appendDf = pd.concat(
            [self.trainDf["question_text"], self.testDf["question_text"]], axis=0
        )

        token = RegexpTokenizer(r'[a-zA-Z0-9]+')
        vectorizer = CountVectorizer()
        #vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1,3),tokenizer=token.tokenize)
        # lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize
        vectorizer.fit(self.appendDf)

        self.trainData = vectorizer.transform(self.trainDf["question_text"])
        print(self.trainData.shape)

        self.testData = vectorizer.transform(self.testDf["question_text"])
        print(self.testData.shape)
        self.X = self.trainData

        # self.doDecomposition() 
        return self.X

    def resampling(self):
        from sklearn.utils import resample
        zero_data = self.trainDf[self.trainDf['target'] == 0]
        one_data = self.trainDf[self.trainDf['target'] == 1]
        self.trainDf = pd.concat([resample(zero_data, replace=True, n_samples=len(one_data)*6), one_data])
        return self.trainDf

    def get_Y(self):
        # self.resampling()
        self.Y = self.trainDf["target"]
        return self.Y

    def testTrainSplit(self):
        # self.doSmote()
        (
            self.X_train,
            self.X_test,
            self.Y_train,
            self.Y_test,
        ) = model_selection.train_test_split(
            self.X, self.Y, test_size=0.2, random_state=0
        )
        return self.X_train, self.X_test, self.Y_train, self.Y_test

    def get_X_test(self):
        return self.testData

In [13]:
testPreprocessor = Preprocessor(test_df)
preprocessed_test_df = testPreprocessor.preprocess()
preprocessed_test_df.head()

getTTData = TrainTestData(preprocessed_df, preprocessed_test_df)
X = getTTData.get_X(1)
y = getTTData.get_Y()
X_train, X_test, Y_train, Y_test = getTTData.testTrainSplit()

(653061, 2)
(653061, 195000)
(653061, 195000)


In [16]:
models = {
'logistic_regression' : LogisticRegression,
'knn' : KNeighborsClassifier,
'svc' : SVC}

In [17]:
def search_space(model):
    model = model.lower()
    space = {}
    if model == 'knn':
        space = {
            'n_neighbors': hp.choice('n_neighbors', range(1,100)),
            'scale': hp.choice('scale', [0, 1]),
            'normalize': hp.choice('normalize', [0, 1]),
            }
    elif model == 'svc':
        space = {
            'C': hp.uniform('C', 0, 20),
            'kernel': hp.choice('kernel', ['linear', 'sigmoid', 'poly', 'rbf']),
            'gamma': hp.uniform('gamma', 0, 20),
            'scale': hp.choice('scale', [0, 1]),
            'normalize': hp.choice('normalize', [0, 1]),
            }
    elif model == 'logistic_regression':
        space = {
            'warm_start' : hp.choice('warm_start', [True, False]),
            'fit_intercept' : hp.choice('fit_intercept', [True, False]),
            'tol' : hp.uniform('tol', 0.00001, 0.0001),
            'C' : hp.uniform('C', 0.05, 3),
            'solver' : hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear']),
            'max_iter' : hp.choice('max_iter', range(100,1000)),
            'scale': hp.choice('scale', [0, 1]),
            'normalize': hp.choice('normalize', [0, 1]),
            'multi_class' : 'auto',
            'class_weight' : 'balanced'
                }
    space['model'] = model
    return space

In [18]:
def get_acc_status(clf,X_,y):
    acc = cross_val_score(clf, X_, y, cv=5).mean()
    return {'loss': -acc, 'status': STATUS_OK}

In [19]:
def obj_fnc(params):
    model = params.get('model').lower()
    X_ = scale_normalize(params,X[:])
    del params['model']
    clf = models[model](**params)
    return(get_acc_status(clf,X_,y))

In [22]:
hypopt_trials = Trials()
best_params = fmin(obj_fnc, search_space(model), algo=tpe.suggest,
max_evals=1000, trials= hypopt_trials)
print(best_params)
print(hypopt_trials.best_trial['result']['loss'])

NameError: name 'model' is not defined