In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
random = 14 #Fix Random_State

from nltk.stem import WordNetLemmatizer #Group Words with Same Form to Same Word
import nltk
## Remove the Comments if they are Not Installed in your Environment.
## nltk.download('stopwords')
## nltk.download('wordnet')
## nltk.download('omw-1.4')
from nltk.corpus import stopwords #Stopwords refers to Words that are going to be Ignored
import string
import operator
import functools

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score,classification_report,confusion_matrix,f1_score

import warnings
warnings.filterwarnings('ignore')

# Import and Preprocess Datasets

1. Read `train.csv` and `test.csv` and combine them into one dataset `D`.
2. Force the target value into a binary value.
3. Perform text preprocessing: Lower Casing, Counting number of words etc.
4. Lemmatizing Strings.
5. From `D` perform train-test-split to create training and testing dataset. (Step 1 and 5 was done to rearrange the size of 2 datasets.)

In [2]:
## Dataset Paths
path = '../data/'
train_path = f'{path}train.csv'
test_path = f'{path}test.csv'
test_labels_path = f'{path}test_labels.csv'
train_aug_path = f'{path}train_augmented_synonym.csv'

## Reading Datasets
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
test_labels = pd.read_csv(test_labels_path)[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']]
test = pd.concat([test, test_labels],axis=1)
test = test.loc[test['toxic'] != -1]

D = pd.concat([train, test], axis=0)

## Create New Dependent Variable - Malignant for D (train and test set only; augmented train dataset is already in the correct format)
def force_one(x):
    if x > 1:
        return 1
    else:
        return x

D['malignant'] = D['toxic'] + D['obscene'] + D['threat'] + D['insult'] + D['identity_hate']
D['malignant'] = D['malignant'].apply(lambda x: force_one(x))
D = D[['id','comment_text','malignant']]

D.head()

## Text Pre-Processing - D

#### Make Strings to Lower Case
D['comment_text'] = D['comment_text'].str.lower()

#### Keep Track of String's Original Length
D['length'] = D['comment_text'].str.len()

#### Replace Email Address with 'email'
D['comment_text'] = D['comment_text'].str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','email')

#### Replace Website Address with 'website'
D['comment_text'] = D['comment_text'].str.replace(r'^http[s]{0,1}\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','website')

#### Replace Website Address with 'phonenumber'
D['comment_text'] = D['comment_text'].str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumber') # Note, might contain a random 10 digit number.

#### Replace Numbers with 'numbrs'
D['comment_text'] = D['comment_text'].str.replace(r'\d+(\.\d+)?', 'numbr')

#### Special Punctuations are Replaced Explicitly.
D['comment_text'] = D['comment_text'].str.replace(r'!',' exclamationmark')
D['comment_text'] = D['comment_text'].str.replace(r'\?',' questionmark')
D['comment_text'] = D['comment_text'].str.replace(r'\.{1}',' periodmark')
D['comment_text'] = D['comment_text'].str.replace(r'\.{2,}',' ellipsismark')
D['comment_text'] = D['comment_text'].str.replace(r'£|\$', ' dollers')

## Removing Leftover Punctuations
def remove_punct(text):
    p_free="".join([i for i in text if i not in string.punctuation])
    return p_free
stop_words = set(stopwords.words('english'))

D['comment_text'] = D['comment_text'].apply(lambda x:remove_punct(x))

D['comment_text'] = D['comment_text'].apply(lambda x:remove_punct(x))
D['comment_text'] = D['comment_text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

lemmatizer = WordNetLemmatizer()
D['comment_text'] = D['comment_text'].apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))

X = D['comment_text']
y = D['malignant']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random)

Since having an unbalanced dataset might cause the problems such as insufficient learning about lesser target value etc., a balanced dataset was created via undersampling. Since it decreases the data size, it might not help, but it was tried.

In [5]:
#Prepare balanced dataset obtained via undersampling

nonmalig = X_train[y_train == 0]
nonmalig_y = y_train[y_train == 0]
num_nonmalig = len(nonmalig)
malig = X_train[y_train == 1]
malig_y = y_train[y_train == 1]
num_malig = len(malig)

#chosen_idx = np.random.choice(num_nonmalig,replace=False,size=num_malig)
chosen_idx_path = 'data/chosen_idx.csv'
chosen_idx = pd.read_csv(chosen_idx_path)
chosen_idx = np.array(chosen_idx['0'])
chosen_nonmalig = nonmalig.iloc[chosen_idx]
chosen_nonmalig_y = np.array(y_train)[chosen_idx]

X_train_un = pd.concat([chosen_nonmalig, malig],axis=0)
y_train_un = np.concatenate((chosen_nonmalig_y, malig_y), axis=None)

undersampled_num = len(y_train_un)

mix_idx_path = f'{path}mix_idx.csv'
mix_idx = pd.read_csv(mix_idx_path)
mix_idx = np.array(mix_idx['0'])

X_train_un = X_train_un.iloc[mix_idx]
y_train_un = y_train_un[mix_idx]


Vertorizing list of words to train Decision Tree Model using `TfidfVectorizer`.

In [6]:
# Changing Words into Vector - Something Like One Hot Encoding
word_vectorize = TfidfVectorizer(max_features = 20000, stop_words='english')
X_train = word_vectorize.fit_transform(X_train)
X_test_copy = X_test.copy()
X_test = word_vectorize.transform(X_test)

word_vectorize = TfidfVectorizer(max_features = 20000, stop_words='english')
X_train_un = word_vectorize.fit_transform(X_train_un)
X_test_un = word_vectorize.transform(X_test_copy)

word_vectorize = TfidfVectorizer(max_features = 20000, stop_words='english')
X_train_aug = word_vectorize.fit_transform(train_aug)
X_test_aug = word_vectorize.transform(X_test_copy)


# Training K-Nearest Neighbors Model for Baseline

## Using Original Unbalanced Dataset

In [None]:
# KNN for Unbalanced Dataset

knn = KNeighborsClassifier()

params = {
    'n_neighbors': [10,50,100,200,300,400], 
    'weights' : ['uniform', 'distance'], 
    'metric' : ['euclidean','manhattan', 'cosine']}

grid_search  = GridSearchCV(estimator=knn,
                           cv=5,
                           param_grid=params,
                           n_jobs=-1,
                           verbose=10,
                           scoring="f1_macro")

grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)
print(grid_search.best_params_)

In [9]:
# results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
# results.to_csv('KNNresults.csv')

The best model parameter found by GridSearch for KNN and Unbalanced Dataset is 
* n_neighbors = 10
* weights = distance
* metric = cosine

The Final Test Score run on test set was:
* Accuracy: 0.93
* F1-Macro Score: 0.7728

In [None]:
#results using top parameters (unbalanced)
print(grid_search.best_params_)
clf = grid_search.best_estimator_
clf.fit(X_train, y_train)
y_pred_knn = clf.predict(X_test)
print('[KNN] f1 macro score of Unbalanced Dataset is {}'.format(f1_score(y_test, y_pred_knn, average = 'macro')))
print(confusion_matrix(y_test,y_pred_knn))
print(classification_report(y_test,y_pred_knn))

In [None]:
from sklearn.model_selection import StratifiedKFold

cv = 10
skf = StratifiedKFold(n_splits=cv)
scores = []
for i, (train_index, test_index) in enumerate(skf.split(X_train,y_train)):
    X_t = X_train[train_index]
    X_v = X_train[test_index]
    y_t = y_train.iloc[train_index]
    y_v = y_train.iloc[test_index]
    clf = KNeighborsClassifier(metric = 'cosine', n_neighbors = 10, weights = 'distance') #using best parameters
    clf.fit(X_t, y_t)
    y_pred = clf.predict(X_v)
    score = f1_score(y_v, y_pred, average = 'macro')
    scores.append(score)

knn_mean = np.mean(scores)
knn_std = np.std(scores)
knn_ci = [knn_mean - 3*knn_std, knn_mean + 3*knn_std] ## Includes 99.7% numbers around est. mean.

print("Mean CV Score: {}".format(knn_mean))
print("Standard Deviation CV Score: {}".format(knn_std))
print("99.7% Confidence Interval: {}".format(knn_ci))

## Using Undersampled Dataset

In [None]:
# KNN for Undersampled Balanced Dataset

knn = KNeighborsClassifier()

params = {
    'n_neighbors': [10,50,100,200,300,400], #in intervals of 10
    'weights' : ['uniform', 'distance'], #uniform equivalent to no weights
    'metric' : ['euclidean','manhattan', 'cosine']}

grid_search  = GridSearchCV(estimator=knn,
                           cv = 5,
                           param_grid=params,
                           verbose=10,
                           scoring="f1_macro")

grid_search.fit(X_train_un, y_train_un)
print(grid_search.best_estimator_)
print(grid_search.best_params_)

In [None]:
# results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
# results.to_csv('KNNUndersampledResults.csv')

The best model parameter found by GridSearch for KNN and Undersampled Dataset is 
* n_neighbors = 200
* weights = distance
* metric = euclidean

The Final Test Score run on test set was:
* Accuracy: 0.93
* F1-Macro Score: 0.7806

In [None]:
#results using top parameters (undersampled balanced)

print(grid_search.best_params_)
clf = grid_search.best_estimator_
clf.fit(X_train_un, y_train_un)
y_bal_pred_knn = clf.predict(X_test_un)
print('[DT] f1 macro score of Undersampled Balanced Dataset is {}'.format(f1_score(y_test, y_bal_pred_knn, average = 'macro')))
print(confusion_matrix(y_test,y_bal_pred_knn))
print(classification_report(y_test,y_bal_pred_knn))

In [None]:
cv = 10
skf = StratifiedKFold(n_splits=cv)
scores = []
for i, (train_index, test_index) in enumerate(skf.split(X_train_un,y_train_un)):
    X_t = X_train_un[train_index]
    X_v = X_train_un[test_index]
    y_t = y_train_un[train_index]
    y_v = y_train_un[test_index]
    clf = KNeighborsClassifier(metric = 'euclidean', n_neighbors = 200, weights = 'distance')
    clf.fit(X_t, y_t)
    y_pred = clf.predict(X_v)
    score = f1_score(y_v, y_pred, average = 'macro')
    scores.append(score)

In [None]:
knn_un_mean = np.mean(scores)
knn_un_std = np.std(scores)
knn_un_ci = [knn_un_mean - 3*knn_un_std, knn_un_mean + 3*knn_un_std] ## Includes 99.7% numbers around est. mean.

In [None]:
print("Mean CV Score: {}".format(knn_un_mean))
print("Standard Deviation CV Score: {}".format(knn_un_std))
print("99.7% Confidence Interval: {}".format(knn_un_ci))