# Import Libraries

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
random = 14 #Fix Random_State

from nltk.stem import WordNetLemmatizer # Group Words with Same Form to Same Word
import nltk
## Remove the Comments if they are Not Installed in your Environment.
## nltk.download('stopwords',quiet=True)
## nltk.download('wordnet',quiet=True)
## nltk.download('omw-1.4',quiet=True)
from nltk.corpus import stopwords #Stopwords refers to Words that are going to be Ignored
import string
import operator
import functools

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score,classification_report,confusion_matrix,f1_score

from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')

# Import and Preprocess Datasets

1. Read `train.csv` and `test.csv` and combine them into one dataset `D`.
2. Force the target value into a binary value.
3. Perform text preprocessing: Lower Casing, Counting number of words etc.
4. Lemmatizing Strings.
5. From `D` perform train-test-split to create training and testing dataset. (Step 1 and 5 was done to rearrange the size of 2 datasets.)

In [None]:
## Dataset Paths
path = '../data/'
train_path = f'{path}train.csv'
test_path = f'{path}test.csv'
test_labels_path = f'{path}test_labels.csv'

## Reading Datasets
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
test_labels = pd.read_csv(test_labels_path)[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
test = pd.concat([test, test_labels],axis=1)
test = test.loc[test['toxic'] != -1] # Labels of value -1 indicates that it was not labelled and cannot be used to derive a test score.

D = pd.concat([train, test], axis=0)

## Create New Dependent Variable - Malignant for D 
def force_one(x):
    if x > 1:
        return 1
    else:
        return x

D['malignant'] = D['toxic'] + D['obscene'] + D['threat'] + D['insult'] + D['identity_hate']
D['malignant'] = D['malignant'].apply(lambda x: force_one(x))
D = D[['id','comment_text','malignant']] # Forced into binary target.

D.head()

## Text Pre-Processing - D

#### Make Strings to Lower Case
D['comment_text'] = D['comment_text'].str.lower()

#### Keep Track of String's Original Length
D['length'] = D['comment_text'].str.len()

#### Replace Email Address with 'email'
D['comment_text'] = D['comment_text'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','email')

#### Replace Website Address with 'website'
D['comment_text'] = D['comment_text'].str.replace(r'^http[s]{0,1}\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','website')

#### Replace Website Address with 'phonenumber'
D['comment_text'] = D['comment_text'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumber') # Note, might contain a random 10 digit number.

#### Replace Numbers with 'numbr'
D['comment_text'] = D['comment_text'].str.replace(r'\d+(\.\d+)?', 'numbr')

#### Special Punctuations are Replaced Explicitly.
D['comment_text'] = D['comment_text'].str.replace(r'!',' exclamationmark')
D['comment_text'] = D['comment_text'].str.replace(r'\?',' questionmark')
D['comment_text'] = D['comment_text'].str.replace(r'\.{1}',' periodmark')
D['comment_text'] = D['comment_text'].str.replace(r'\.{2,}',' ellipsismark')
D['comment_text'] = D['comment_text'].str.replace(r'£|\$', ' dollers')

## Removing Leftover Punctuations
def remove_punct(text):
    p_free="".join([i for i in text if i not in string.punctuation])
    return p_free
stop_words = set(stopwords.words('english'))

D['comment_text'] = D['comment_text'].apply(lambda x:remove_punct(x))

D['comment_text'] = D['comment_text'].apply(lambda x:remove_punct(x))
D['comment_text'] = D['comment_text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

lemmatizer = WordNetLemmatizer()
D['comment_text'] = D['comment_text'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))

X = D['comment_text']
y = D['malignant']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random)

Since having an unbalanced dataset might cause the problems such as insufficient learning about lesser target value etc., a balanced dataset was created via undersampling. Since it decreases the data size, it might not help, but it was tried.

In [3]:
# Prepare balanced dataset obtained via undersampling
nonmalig = X_train[y_train == 0]
nonmalig_y = y_train[y_train == 0]
num_nonmalig = len(nonmalig)
malig = X_train[y_train == 1]
malig_y = y_train[y_train == 1]
num_malig = len(malig)

#chosen_idx = np.random.choice(num_nonmalig,replace=False,size=num_malig)
chosen_idx_path = '../data/chosen_idx.csv' # chosen_idx.csv is a list of index that was randomly chosen in prior to effectively undersample the non-malignant data.
chosen_idx = pd.read_csv(chosen_idx_path)
chosen_idx = np.array(chosen_idx['0'])
chosen_nonmalig = nonmalig.iloc[chosen_idx]
chosen_nonmalig_y = np.array(y_train)[chosen_idx]

X_train_un = pd.concat([chosen_nonmalig, malig],axis=0)
y_train_un = np.concatenate((chosen_nonmalig_y, malig_y), axis=None)

undersampled_num = len(y_train_un)

mix_idx_path = f'{path}mix_idx.csv'
mix_idx = pd.read_csv(mix_idx_path)
mix_idx = np.array(mix_idx['0'])

X_train_un = X_train_un.iloc[mix_idx]
y_train_un = y_train_un[mix_idx]


Vertorizing list of words to train Decision Tree Model using `TfidfVectorizer`.

In [4]:
# Changing Words into Vector - Something Like One Hot Encoding
word_vectorize = TfidfVectorizer(max_features = 20000, stop_words='english')
X_train = word_vectorize.fit_transform(X_train)
X_test_copy = X_test.copy()
X_test = word_vectorize.transform(X_test)

word_vectorize = TfidfVectorizer(max_features = 20000, stop_words='english')
X_train_un = word_vectorize.fit_transform(X_train_un)
X_test_un = word_vectorize.transform(X_test_copy)

# Training Decision Tree Model for Baseline

## Using Original Unbalanced Dataset

In [None]:
# DT for Unbalanced Dataset

clf = DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.tree_.max_depth) # 3470 -> Therefore, we only have test the depth until 3500 to test regularization effect.

DT = DecisionTreeClassifier(random_state=random)

params = {
    'criterion': ["gini", "entropy", "log_loss"],
    'max_depth': [5, 50, 500, 1000, 2000, 3000, 3500, None],
    'min_samples_split': [2, 10, 20, 50, 100, 1000]
}

grid_search = GridSearchCV(estimator=DT,
                           param_grid=params,
                           cv=5,
                           n_jobs=-1,
                           verbose=10,
                           scoring="f1_macro")

grid_search.fit(X_train, y_train)

In [5]:
# results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
# results.to_csv('DTresults.csv') # Store the results later reference.

The best model parameter found by GridSearch for Decision Tree and Unbalanced Dataset is 
* criterion: "gini"
* max_depth = 50
* min_samples_split = 1000

The Final Test Score run on test set was:
* Accuracy: 0.94
* F1-Macro Score: 0.8156

In [None]:
#results using top parameters (unbalanced)
print(grid_search.best_params_)
clf = grid_search.best_estimator_
clf.fit(X_train, y_train)
y_pred_dt = clf.predict(X_test)
print('[DT] f1 macro score of Unbalanced Dataset is {}'.format(f1_score(y_test, y_pred_dt, average = 'macro')))
print(confusion_matrix(y_test,y_pred_dt))
print(classification_report(y_test,y_pred_dt))

In [None]:
cv = 10
skf = StratifiedKFold(n_splits=cv)
scores = []
for i, (train_index, test_index) in enumerate(skf.split(X_train,y_train)):
    X_t = X_train[train_index]
    X_v = X_train[test_index]
    y_t = y_train.iloc[train_index]
    y_v = y_train.iloc[test_index]
    clf = DT = DecisionTreeClassifier(criterion = 'gini', max_depth = 50, min_samples_split = 1000)
    clf.fit(X_t, y_t)
    y_pred = clf.predict(X_v)
    score = f1_score(y_v, y_pred, average = 'macro')
    scores.append(score)

In [None]:
dt_ori_mean = np.mean(scores)
dt_ori_std = np.std(scores)
dt_ori_ci = [dt_ori_mean - 3*dt_ori_std, dt_ori_mean + 3*dt_ori_std] ## Includes 99.7% numbers around est. mean.

In [None]:
print("Mean CV Score: {}".format(dt_ori_mean))
print("Standard Deviation CV Score: {}".format(dt_ori_std))
print("99.7% Confidence Interval: {}".format(dt_ori_ci))

## Using Undersampled Dataset

In [None]:
# DT for Balanced Dataset obtained via undersampling

DT_bal = DecisionTreeClassifier(random_state=87)

params = {
    'criterion': ["gini", "entropy", "log_loss"],
    'max_depth': [5, 50, 500, 1000, 2000, 3000, 3500, None],
    'min_samples_split': [2, 10, 20, 50, 100, 1000]
}

grid_search = GridSearchCV(estimator=DT_bal,
                           param_grid=params,
                           cv=5,
                           n_jobs=-1,
                           verbose=10,
                           scoring="f1_macro")

grid_search.fit(X_train_un, y_train_un)

In [33]:
# results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
# results.to_csv('DTUndersampledResults.csv')

The best model parameter found by GridSearch for Decision Tree and Undersampled Dataset is 
* criterion: "entropy"
* max_depth = 500
* min_samples_split = 1000,

The Final Test Score run on test set was:
* Accuracy: 0.83
* F1-Macro Score: 0.6909

In [None]:
#results using top parameters (undersampled balanced)
print(grid_search.best_params_)
clf = grid_search.best_estimator_
clf.fit(X_train_un, y_train_un)
y_bal_pred_dt = clf.predict(X_test)
print('[DT] f1 macro score of Undersampled Balanced Dataset is {}'.format(f1_score(y_test, y_bal_pred_dt, average = 'macro')))
print(confusion_matrix(y_test,y_bal_pred_dt))
print(classification_report(y_test,y_bal_pred_dt))

In [8]:
cv = 10
skf = StratifiedKFold(n_splits=cv)
scores = []
for i, (train_index, test_index) in enumerate(skf.split(X_train_un,y_train_un)):
    X_t = X_train_un[train_index]
    X_v = X_train_un[test_index]
    y_t = y_train_un[train_index]
    y_v = y_train_un[test_index]
    clf = DecisionTreeClassifier(criterion='entropy', max_depth=500, min_samples_split=1000)
    clf.fit(X_t, y_t)
    y_pred = clf.predict(X_v)
    score = f1_score(y_v, y_pred, average = 'macro')
    scores.append(score)

In [9]:
dt_un_mean = np.mean(scores)
dt_un_std = np.std(scores)
dt_un_ci = [dt_un_mean - 3*dt_un_std, dt_un_mean + 3*dt_un_std] ## Includes 99.7% numbers around est. mean.

In [None]:
print("Mean CV Score: {}".format(dt_un_mean))
print("Standard Deviation CV Score: {}".format(dt_un_std))
print("99.7% Confidence Interval: {}".format(dt_un_ci))