In [2]:
#import ml packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection as model_selection
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
import sklearn.preprocessing as preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import string

#import nltk packages
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

import xgboost as xgb

ModuleNotFoundError: No module named 'pandas'

In [2]:
#import dataset
train_df = pd.read_csv('../AskReddit Dataset/train.csv')
test_df = pd.read_csv('../AskReddit Dataset/test.csv')
train_df.head()

Unnamed: 0,qid,question_text,target
0,a3dee568776c08512c89,What is the role of Lua in Civ4?,0
1,bdb84f519e7b46e7b7bb,What are important chapters in Kannada for 10 ...,0
2,29c88db470e2eb5c97ad,Do musicians get royalties from YouTube?,0
3,3387d99bf2c3227ae8f1,What is the difference between Scaling Social ...,0
4,e79fa5038f765d0f2e7e,Why do elevators go super slow right before th...,0


In [3]:
# choose elements from df where target = 1
pd.set_option('display.max_colwidth', 100)
df_1 = train_df[train_df['target'] == 1]
df_1["question_text"]

16                                                     What stupid things do Indians do when in your country?
31                             Can I sue my parents for giving birth to me when I did not want them to do so?
32                          What are your views about sexual relationship between a widow mother and her son?
33        You became an atheist, and after 2 years you fall and break your back. You are left paralyzed fr...
90                                    Why aren't we protesting for government control instead of gun control?
                                                         ...                                                 
652967              What is a liberal's understanding of the difference between pollution and climate change?
653021    Do unattractive or average-looking men ever get a girlfriend who actually loves them or do they ...
653029                                                                   How can I grab my aunties boobs! :p?
653034    

In [4]:
# see value count order of target
train_df['target'].value_counts()

0    612656
1     40405
Name: target, dtype: int64

In [5]:
# percentage of troll questions in the dataset
train_df['target'].value_counts(normalize=True)

0    0.93813
1    0.06187
Name: target, dtype: float64

In [122]:
# create a preprocessing class to perform preprocessing
import re
class Preprocessor:
    
    def __init__(self,df) -> None:
        self.df = df
        # nltk.download('stopwords')

    def removePunctuation(self):
        self.df['question_text'] = self.df['question_text'].apply(lambda x: x.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))))
        return self.df


    def removeStopWords(self):
        stop = stopwords.words('english')
        self.df['question_text'] = self.df['question_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop]))
        return self.df

    def removeNumbers(self):
        self.df['question_text'] = self.df['question_text'].apply(lambda x: ' '.join([word for word in x.split() if not word.isnumeric()]))
        return self.df

    def removeShortWords(self):
        self.df['question_text'] = self.df['question_text'].apply(lambda x: ' '.join([word for word in x.split() if len(word) > 2]))
        return self.df

    def removeLowOccuranceWords(self):
        freq = pd.Series(' '.join(self.df['question_text']).split()).value_counts()
        low_freq = list(freq.loc[freq<5].index)
        self.df['question_text'] = self.df['question_text'].apply(lambda x: " ".join(x for x in x.split() if x not in low_freq))

    def removeUnwantedCols(self,col):
        self.df = self.df.drop(col,axis=1)
        return self.df

    def convertToLower(self):
        self.df['question_text'] = self.df['question_text'].apply(lambda x: x.lower())
        return self.df
    
    def removeNumbers2(self):
        self.df['question_text'] = self.df['question_text'].apply(lambda x: re.sub('W*dw*','',x))
        return self.df

    # stemmer algorithm
    def stemmer(self):
        from nltk.stem import PorterStemmer
        stemmer = PorterStemmer()
        def stem_words(text):
            return " ".join([stemmer.stem(word) for word in text.split()])  
        self.df["question_text"] = self.df["question_text"].apply(lambda x: stem_words(x))
        
    def lemmetizer(self):
        from nltk.stem import WordNetLemmatizer
        # nltk.download('wordnet')
        lemmatizer = WordNetLemmatizer()
        def lemmatize_words(text):
            return " ".join([lemmatizer.lemmatize(word) for word in text.split()])
        self.df["question_text"] = self.df["question_text"].apply(lambda text: lemmatize_words(text))
    
    def preprocess(self):
        self.removePunctuation()
        self.removeStopWords()
        self.removeNumbers()
        self.removeShortWords()
        self.convertToLower()
        self.removeUnwantedCols(['qid'])
        self.stemmer()
        # self.removeNumbers2()
        self.lemmetizer()
        return self.df

In [123]:
preprocessor = Preprocessor(train_df)
preprocessed_df = preprocessor.preprocess()
preprocessed_df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


Unnamed: 0,question_text,target
0,role lua civ4,0
1,import chapter kannada ic,0
2,musician get royalti youtub,0
3,differ scale social enterpris social franchis,0
4,elev super slow right door open,0


In [124]:
# do somewhat more preprocessing
# tokenize words using nltk
# nltk.download('punkt')
# preprocessed_df['question_text'] = preprocessed_df['question_text'].apply(lambda x: word_tokenize(x))

In [125]:
preprocessed_df.head()

Unnamed: 0,question_text,target
0,role lua civ4,0
1,import chapter kannada ic,0
2,musician get royalti youtub,0
3,differ scale social enterpris social franchis,0
4,elev super slow right door open,0


In [126]:

# create a count vectorizer object and fit transform df.question_text
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(preprocessed_df['question_text'])
# X_train_counts

In [127]:
# print(X_train_counts.shape)

In [128]:
# print(count_vect.get_feature_names_out())

In [129]:
# to convert the document term matrix to a dataframe -> Do only in AWS/Kaggle
#countvec_df = pd.DataFrame(X_train_counts.toarray(), columns=count_vect.get_feature_names())

In [130]:
train_df.shape

(653061, 3)

In [131]:
# create a get train and test data class
class TrainTestData:

    def __init__(self,df) -> None:
        self.df = df

    
    def get_X(self,minDocumentCount):
        count_vect = CountVectorizer()
        self.X = count_vect.fit_transform(self.df['question_text'])
        return self.X

    def get_Y(self):
        self.Y = self.df['target']
        return self.Y

    def testTrainSplit(self):
        self.X_train, self.X_test, self.Y_train, self.Y_test = model_selection.train_test_split(self.X, self.Y, test_size=0.2, random_state=0)
        return self.X_train, self.X_test, self.Y_train, self.Y_test

In [132]:
getTTData = TrainTestData(preprocessed_df)
X = getTTData.get_X(1)
y = getTTData.get_Y()

In [133]:
X.shape

(653061, 102118)

In [134]:
y.shape

(653061,)

In [135]:
y.value_counts()

0    612656
1     40405
Name: target, dtype: int64

In [136]:
X_train,X_test,Y_train,Y_test = getTTData.testTrainSplit()
Y_train.shape

(522448,)

In [137]:
X_train.shape

(522448, 102118)

# Perform Binary Classification using Logistic Regression

In [138]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=200)
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=200)

In [139]:
predictions = model.predict(X_test)

In [140]:
predictions

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

In [141]:
np.unique(predictions)

array([0, 1], dtype=int64)

In [142]:
y_actual = Y_test
cv_preds = model.predict(X_test)

from sklearn.metrics import f1_score
f1_score(y_actual, cv_preds, average='macro')

0.7333361299279836

In [143]:
from sklearn.metrics import fbeta_score
fbeta_score(y_actual, cv_preds, average='macro', beta=0.5)

0.7756365888446058

In [144]:
from sklearn.metrics import fbeta_score
fbeta_score(y_actual, cv_preds, average='micro', beta=0.5)

0.9497982589788152

In [145]:
# XGBModel = xgb.XGBClassifier()
# eval_set = [(X_train, Y_train), (X_test, Y_test)]     
   
# XGBModel.fit(X_train, Y_train, eval_metric="logloss", eval_set=eval_set)

# prediction = XGBModel.predict(X_test)
# np.unique(prediction)

# y_actual = Y_test
# cv_preds = prediction

# f1_score(y_actual, cv_preds, average='macro')

# fbeta_score(y_actual, cv_preds, average='macro', beta=0.5)

In [None]:
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
clf.fit(X_train, Y_train)

In [62]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
clf = make_pipeline(StandardScaler(with_mean=False), LinearSVC(random_state=0, tol=1e-5))
clf.fit(X_train, Y_train)



Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])

In [41]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=4, random_state=0)
clf.fit(X_train, Y_train)

RandomForestClassifier(max_depth=4, random_state=0)

In [42]:
np.unique(clf.predict(X_test))

array([0], dtype=int64)

In [57]:
from xgboost import XGBClassifier
model = XGBClassifier(eval_metric=make_scorer(fbeta_score, beta=0.5))
preds = model.fit(X_train, Y_train).predict(X_test)
np.unique(preds)



array([0, 1], dtype=int64)

In [58]:
fbeta_score(Y_test, preds, average='macro', beta=0.5)

0.7665068408284789

In [59]:
from sklearn.metrics import fbeta_score, make_scorer

In [60]:
fbeta_score(Y_test, preds, average='micro', beta=0.5)

0.9494384173091499

In [63]:
clf

Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('linearsvc', LinearSVC(random_state=0, tol=1e-05))])

In [64]:
clf.predict(X_test)

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [67]:
fbeta_score(Y_test, clf.predict(X_test), average='macro', beta=0.5)

0.6932471703797167

In [None]:
# function to do a + b
def add(a,b):
    return a+b

# function to do a - b
def subtract(a,b):
    return a-b

# function to do a * b
def multiply(a,b):
    return a*b