In [3]:
import os
import pickle

import numpy as np
import sklearn
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.svm import SVC

## Preparing Data
The data is split into 60% training, 20% validation, and 20% testing sets.

In [None]:
# TODO: Move these two functions to a shared file for helper functions
def unPickleData(filename):
  """Read the pickle file at `filename` and return the object stored in it.

  Only use this on files you trust: unpickling can execute arbitrary code.
  """
  with open(filename, mode='rb') as pickle_file:
    return pickle.load(pickle_file)

def getDataPath(dirname,filename):
  """Return the absolute path of processed_data/<dirname>/<filename>.

  The processed_data directory is resolved two directory levels above the
  current working directory.
  """
  two_levels_up = os.path.join(os.getcwd(), os.pardir, os.pardir)
  base_dir = os.path.abspath(two_levels_up)
  return os.path.join(base_dir, "processed_data", dirname, filename)

# Load the pickled inputs (X) and labels (y) of the "comments" dataset.
features_path = getDataPath("comments", "X_comments.pkl")
labels_path = getDataPath("comments", "Y_comments.pkl")
X = unPickleData(features_path)
y = unPickleData(labels_path)

# First split: keep 60% for training and hold out 40%, stratified on the labels.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.4,
    random_state=15,
    stratify=y,
)
# Second split: halve the held-out 40% into validation and test (20% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=15,
    stratify=y_temp,
)

In [7]:
X = np.load("X_comment.npy")
y = np.load("y_comment.npy")

# 60/20/20 stratified split: hold out 40%, then halve it into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=15
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=15
)

# Length of a single training sample, i.e. the number of features per row.
number_features = X_train.shape[1]

# Remove both full arrays to save memory now that the splits exist.
del X, y

## Training

In [3]:
# Polynomial-kernel SVM with every other hyperparameter left at its default;
# verbose=True turns on LibSVM's training log.
model_poly = SVC(
    kernel="poly",
    verbose=True,
)
model_poly.fit(X_train, y_train)

[LibSVM]

SVC(kernel='poly', verbose=True)

In [5]:
# Serialize the fitted model to svm_model_poly.pkl in the working directory.
with open("svm_model_poly.pkl", mode='wb') as model_file:
    pickle.dump(model_poly, model_file)

In [5]:
# Restore the model from svm_model_poly.pkl in the working directory.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
with open("svm_model_poly.pkl", mode='rb') as model_file:
    model_poly = pickle.load(model_file)

In [None]:
# 5-fold learning curve of model_poly on the training split.
# `scoring` is not passed, so the estimator's default scorer is used.
# return_times=True adds fit and score times to the result; the score times
# (last element) are discarded.
train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
    model_poly,
    X_train,
    y_train,
    cv=5,
    return_times=True,
)

## Model Selection

In [None]:
# Candidate values for C: 100 log-spaced points between 1e-3 and 1e3.
grid = {"C": np.logspace(-3, 3, num=100)}

# NOTE(review): the 10-fold search is fitted on the validation split only, so
# with GridSearchCV's default refit the best estimator is retrained on X_val
# rather than X_train — confirm this is intended.
search = GridSearchCV(model_poly, grid, cv=10)
search.fit(X_val, y_val)

print("Tuned hpyerparameters :(best parameters) ", search.best_params_)
print("Accuracy :", search.best_score_)

# NOTE(review): despite the "_lin" suffix, the estimator tuned here is model_poly.
best_model_lin = search.best_estimator_

In [None]:
# Joint search over C, degree, gamma and coef0:
# 10 x 5 x 2 x 10 = 1000 candidates, each evaluated with 10-fold CV.
grid = {
    'C': np.logspace(-3, 3, num=10),
    'degree': list(range(1, 6)),
    'gamma': ['scale', 'auto'],
    'coef0': np.logspace(0, 1, num=10),
}

# NOTE(review): as written, the search (and the default refit of the best
# estimator) uses the validation split only — confirm this is intended.
search = GridSearchCV(model_poly, grid, cv=10)
search.fit(X_val, y_val)

print("Tuned hpyerparameters :(best parameters) ", search.best_params_)
print("Accuracy :", search.best_score_)

best_model_poly = search.best_estimator_

## Testing

### Excluding the validation set

In [4]:
# Evaluate model_poly on the held-out test split.
# To score the grid-searched model instead, predict with best_model_poly here.
y_pred = model_poly.predict(X_test)

# The metrics are imported by name instead of being reached through
# `sklearn.metrics`, which only resolves as an attribute once that submodule
# has been imported somewhere.
print(classification_report(y_test, y_pred))
print("Accuracy: ", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94      7501
           1       0.99      0.65      0.79      2618

    accuracy                           0.91     10119
   macro avg       0.94      0.82      0.86     10119
weighted avg       0.92      0.91      0.90     10119

Accuracy:  0.9076983891688902


### Including the validation set
Because the class "bot" is under-represented, especially in the testing data, the model achieves poor recall and F1-score.

In [8]:
# Pool the test and validation splits once and reuse the result, instead of
# rebuilding the concatenated label array for every metric call.
X_eval = np.concatenate((X_test, X_val))
y_eval = np.concatenate((y_test, y_val))

y_pred = model_poly.predict(X_eval)

# The metrics are imported by name instead of being reached through
# `sklearn.metrics`, which only resolves as an attribute once that submodule
# has been imported somewhere.
print(classification_report(y_eval, y_pred))
print("Accuracy: ", accuracy_score(y_eval, y_pred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     15001
           1       0.99      0.65      0.78      5237

    accuracy                           0.91     20238
   macro avg       0.94      0.82      0.86     20238
weighted avg       0.92      0.91      0.90     20238

Accuracy:  0.9068583852159304
