In [8]:
# import machine learning libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from nltk.corpus import stopwords

import string

In [9]:
# read csv file using pandas
df = pd.read_csv('AskReddit Dataset/train.csv')
df.head()

Unnamed: 0,qid,question_text,target
0,a3dee568776c08512c89,What is the role of Lua in Civ4?,0
1,bdb84f519e7b46e7b7bb,What are important chapters in Kannada for 10 ICSE 2018?,0
2,29c88db470e2eb5c97ad,Do musicians get royalties from YouTube?,0
3,3387d99bf2c3227ae8f1,What is the difference between Scaling Social Enterprises and Social Franchising?,0
4,e79fa5038f765d0f2e7e,Why do elevators go super slow right before the doors open?,0


In [10]:
# choose elements from df where target = 1
pd.set_option('display.max_colwidth', 100)
df_1 = df[df['target'] == 1]
df_1["question_text"]

16                                                     What stupid things do Indians do when in your country?
31                             Can I sue my parents for giving birth to me when I did not want them to do so?
32                          What are your views about sexual relationship between a widow mother and her son?
33        You became an atheist, and after 2 years you fall and break your back. You are left paralyzed fr...
90                                    Why aren't we protesting for government control instead of gun control?
                                                         ...                                                 
652967              What is a liberal's understanding of the difference between pollution and climate change?
653021    Do unattractive or average-looking men ever get a girlfriend who actually loves them or do they ...
653029                                                                   How can I grab my aunties boobs! :p?
653034    

In [11]:
# see value count order of target
df['target'].value_counts()

0    612656
1     40405
Name: target, dtype: int64

In [12]:
# percentage of troll questions in the dataset
df['target'].value_counts(normalize=True)

0    0.93813
1    0.06187
Name: target, dtype: float64

In [23]:
# create a preprocessing class to perform preprocessing
class Preprocessor:
    
    def __init__(self,df) -> None:
        self.df = df
        # import nltk
        # nltk.download('stopwords')

    def removePunctuation(self):
        self.df['question_text'] = self.df['question_text'].apply(lambda x: x.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))))
        return self.df


    def removeStopWords(self):
        stop = stopwords.words('english')
        self.df['question_text'] = self.df['question_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))
        return self.df

    def removeNumbers(self):
        self.df['question_text'] = self.df['question_text'].apply(lambda x: ' '.join([word for word in x.split() if not word.isnumeric()]))
        return self.df

    def removeShortWords(self):
        self.df['question_text'] = self.df['question_text'].apply(lambda x: ' '.join([word for word in x.split() if len(word) > 2]))
        return self.df

    def removeLowOccuranceWords(self):
        freq = pd.Series(' '.join(self.df['question_text']).split()).value_counts()
        low_freq = list(freq.loc[freq<5].index)
        self.df['question_text'] = self.df['question_text'].apply(lambda x: " ".join(x for x in x.split() if x not in low_freq))

    
    def preprocess(self):
        self.removePunctuation()
        self.removeStopWords()
        self.removeNumbers()
        self.removeShortWords()
        return self.df

In [24]:
preprocessor = Preprocessor(df)
preprocessed_df = preprocessor.preprocess()
preprocessed_df.head()

Unnamed: 0,qid,question_text,target
0,a3dee568776c08512c89,What role Lua Civ4,0
1,bdb84f519e7b46e7b7bb,What important chapters Kannada ICSE,0
2,29c88db470e2eb5c97ad,musicians get royalties YouTube,0
3,3387d99bf2c3227ae8f1,What difference Scaling Social Enterprises Social Franchising,0
4,e79fa5038f765d0f2e7e,Why elevators super slow right doors open,0
