In [2]:
import os
import pickle
import numpy as np
import sklearn
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, learning_curve
import matplotlib.pyplot as plt

plt.style.use('seaborn')

## Preparing Data
60% training, 20% validation, and 20% testing

In [3]:
# TODO: Move these two functions to a shared file for helper functions
def unPickleData(filename):
  with open(filename,'rb') as f: arr = pickle.load(f)
  return arr

def getDataPath(dirname,filename):
    cwd = os.getcwd()
    return os.path.join(cwd, os.path.join("processed_data", dirname,filename))

X_data = unPickleData(getDataPath("comments","X_comments.pkl"))
y_data = unPickleData(getDataPath("comments","Y_comments.pkl"))

# reducing amount of features
X, y = [], []
Class0_max_count = 38000
for i in range(len(X_data)):
    if y_data[i] == 0: 
        if Class0_max_count > 0:
            # print(i)
            X.append(X_data[i])
            y.append(y_data[i])
            if i%1000 == 0: print(i, end=" ")
            Class0_max_count -= 1 
    else:
        X.append(X_data[i])
        y.append(y_data[i])


X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size = 0.4, random_state=15, stratify=y)
X_val,   X_test, y_val,   y_test = train_test_split(X_temp, y_temp, test_size = 0.5, random_state=15, stratify=y_temp)

0 1000 2000 3000 4000 6000 7000 8000 9000 10000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 24000 25000 26000 27000 28000 29000 30000 31000 32000 33000 34000 35000 36000 37000 38000 39000 40000 41000 42000 

## Training

In [4]:
model_lin = SVC(kernel="linear", verbose=True)
# model_lin.fit(X_train, y_train)


train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(model_lin, 
                                                                    X_train, 
                                                                    y_train, 
                                                                    scoring='accuracy', 
                                                                    cv=5, 
                                                                    return_times=True) 

[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [None]:
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.subplots(1, figsize=(10,10))
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#D22B2B", label="Cross-validation score")

plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#ffcdcc")

# plt.ylim([0, 1])
plt.xlim([0, train_sizes[-1]])
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Accuracy Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()

In [None]:
model_poly = SVC(kernel="poly", verbose=True)
# model_poly.fit(X_train,y_train)

train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(model_lin, 
                                                                    X_train, 
                                                                    y_train, 
                                                                    scoring='accuracy', 
                                                                    cv=5, 
                                                                    return_times=True) 

In [None]:
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.subplots(1, figsize=(10,10))
plt.plot(train_sizes, train_mean, '--', color="#111111",  label="Training score")
plt.plot(train_sizes, test_mean, color="#D22B2B", label="Cross-validation score")

plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="#DDDDDD")
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="#ffcdcc")

# plt.ylim([0, 1])
plt.xlim([0, train_sizes[-1]])
plt.title("Learning Curve")
plt.xlabel("Training Set Size"), plt.ylabel("Accuracy Score"), plt.legend(loc="best")
plt.tight_layout()
plt.show()

## Model Selection

In [None]:
grid = {"C":np.logspace(-3,3,100)}

search = GridSearchCV(model_lin,grid, cv=10)
search.fit(X_val,y_val)

print("Tuned hpyerparameters :(best parameters) ",search.best_params_)
print("Accuracy :",search.best_score_)

best_model_lin = search.best_estimator_

In [None]:
grid = {'C':np.logspace(-3,3,10), 'degree':[1,2,3,4,5], 'gamma':['scale', 'auto'], 'coef0':np.logspace(0,1,10)}

search = GridSearchCV(model_poly,grid,cv=10)
search.fit(X_val, y_val)

print("Tuned hpyerparameters :(best parameters) ",search.best_params_)
print("Accuracy :",search.best_score_)

best_model_poly = search.best_estimator_

## Testing

### Excluding the validation set

In [1]:
y_pred = best_model_lin.predict(X_test)

print(sklearn.metrics.classification_report(y_test,y_pred))
print("Accuracy: ", sklearn.metrics.accuracy_score(y_test, y_pred))

NameError: name 'best_model_lin' is not defined

In [None]:
y_pred = best_model_poly.predict(X_test)

print(sklearn.metrics.classification_report(y_test,y_pred))
print("Accuracy: ", sklearn.metrics.accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.99      0.92       229
           1       0.82      0.29      0.43        48

    accuracy                           0.87       277
   macro avg       0.85      0.64      0.68       277
weighted avg       0.86      0.87      0.84       277

Accuracy:  0.8664259927797834


### Including the validation set
Because of the lack of representation of the class "bot", especially in the testing data, the model reflects poor results when it comes to the recall and f-score.

In [None]:
y_pred = best_model_lin.predict(np.concatenate((X_test,X_val))
)
print(sklearn.metrics.classification_report(np.concatenate((y_test,y_val)),y_pred))
print("Accuracy: ", sklearn.metrics.accuracy_score(np.concatenate((y_test,y_val)), y_pred))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94       459
           1       0.93      0.43      0.58        94

    accuracy                           0.90       553
   macro avg       0.91      0.71      0.76       553
weighted avg       0.90      0.90      0.88       553

Accuracy:  0.8969258589511754


In [None]:
y_pred = best_model_poly.predict(np.concatenate((X_test,X_val))
)
print(sklearn.metrics.classification_report(np.concatenate((y_test,y_val)),y_pred))
print("Accuracy: ", sklearn.metrics.accuracy_score(np.concatenate((y_test,y_val)), y_pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       459
           1       0.92      0.50      0.65        94

    accuracy                           0.91       553
   macro avg       0.91      0.75      0.80       553
weighted avg       0.91      0.91      0.90       553

Accuracy:  0.9077757685352622
