In [1]:
import sklearn
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import numpy as np
import pickle
import os

Load the Data

In [2]:
def unPickleData(filename):
  """Open `filename` in binary mode and return the object unpickled from it.

  Note: `pickle.load` can execute arbitrary code, so only use this on
  files produced by a trusted source.
  """
  with open(filename, 'rb') as pickle_file:
    return pickle.load(pickle_file)

def getDataPath(dirname, filename):
  """Build the absolute path `<base>/processed_data/<dirname>/<filename>`.

  `<base>` is the directory two levels above the current working directory.
  """
  base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
  return os.path.join(base_dir, "processed_data", dirname, filename)

# Load the pickled inputs (X_posts.pkl) and targets (Y_posts.pkl) from processed_data/posts.
x_path = getDataPath("posts", "X_posts.pkl")
x_data = unPickleData(x_path)
y_path = getDataPath("posts", "Y_posts.pkl")
y_data = unPickleData(y_path)

In [3]:
# Build the working data set: keep at most Class0_max_count samples whose label
# is 0 (in their original order) and keep every sample with any other label.
X, y = [], []
Class0_max_count = 20000
for idx in range(len(x_data)):
    label = y_data[idx]
    is_class0 = label == 0
    if is_class0 and Class0_max_count <= 0:
        # Class-0 quota already used up: drop this sample.
        continue
    X.append(x_data[idx])
    y.append(label)
    if is_class0:
        # Progress marker, printed only for kept class-0 samples at multiples of 1000.
        if idx % 1000 == 0:
            print(idx, end=" ")
        Class0_max_count -= 1

# Report how many samples of each class ended up in the working set.
np_y = np.array(y)
for cls in (0, 1):
    print(f"class {cls}:", len(np_y[np_y == cls]))

0 2000 3000 4000 5000 6000 7000 8000 9000 11000 12000 13000 14000 15000 16000 17000 18000 19000 20000 21000 22000 23000 class 0: 20000
class 1: 9114


In [4]:
from sklearn.model_selection import train_test_split

# 60% of the samples go to training; the remaining 40% is split evenly into
# validation and test sets. Both splits use the same fixed seed for repeatability.
X_train, X_rem, y_train, y_rem = train_test_split(
    X,
    y,
    train_size=0.6,
    random_state=42,
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem,
    y_rem,
    test_size=0.5,
    random_state=42,
)

SVM Training & Hyperparameter Tuning

In [5]:
# Grid-search the regularisation strength C of a linear-kernel SVM with 5-fold
# cross-validation. Micro-F1 and accuracy are both recorded for every candidate;
# the best candidate is selected by micro-F1 (refit='f1_micro') and refit.
parameters = {'C':[0.001,0.01,0.1, 1]}
linearSVM = SVC(kernel="linear")
linearClassifier = GridSearchCV(linearSVM, parameters, scoring = ['f1_micro','accuracy'], cv=5, refit='f1_micro' , verbose = 10)
linearClassifier.fit(X_train, y_train)

bestLinearClassifier = linearClassifier.best_estimator_

# With multi-metric scoring, cv_results_ holds one 'mean_test_<metric>' array per
# metric. Read the accuracy array (not the f1_micro one) at best_index_ so the
# reported accuracy belongs to the selected candidate.
best_cv_accuracy = linearClassifier.cv_results_['mean_test_accuracy'][linearClassifier.best_index_]

print("For Linear SVM:")
print(f"Best Estimator: {bestLinearClassifier}")
print(f"F-measure = {linearClassifier.best_score_}")
print(f"Accuracy = {best_cv_accuracy}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5; 1/4] START C=0.001.....................................................
[CV 1/5; 1/4] END C=0.001; accuracy: (test=0.685) f1_micro: (test=0.685) total time=15.6min
[CV 2/5; 1/4] START C=0.001.....................................................
[CV 2/5; 1/4] END C=0.001; accuracy: (test=0.685) f1_micro: (test=0.685) total time=34.4min
[CV 3/5; 1/4] START C=0.001.....................................................
[CV 3/5; 1/4] END C=0.001; accuracy: (test=0.685) f1_micro: (test=0.685) total time=14.5min
[CV 4/5; 1/4] START C=0.001.....................................................
[CV 4/5; 1/4] END C=0.001; accuracy: (test=0.685) f1_micro: (test=0.685) total time=15.1min
[CV 5/5; 1/4] START C=0.001.....................................................
[CV 5/5; 1/4] END C=0.001; accuracy: (test=0.685) f1_micro: (test=0.685) total time=16.1min
[CV 1/5; 2/4] START C=0.01.................................................

Testing

In [6]:
# Evaluate the selected estimator on the held-out test split.
y_pred = bestLinearClassifier.predict(X_test)

# Compute every metric first, then print them in a fixed order.
test_report = sklearn.metrics.classification_report(y_test, y_pred)
test_accuracy = sklearn.metrics.accuracy_score(y_test, y_pred)
test_f1_micro = sklearn.metrics.f1_score(y_test, y_pred, average='micro')

print(test_report)
print("Accuracy: ", test_accuracy)
print(f"f-measure = {test_f1_micro}")

              precision    recall  f1-score   support

           0       0.82      0.94      0.87      3993
           1       0.80      0.55      0.65      1830

    accuracy                           0.81      5823
   macro avg       0.81      0.74      0.76      5823
weighted avg       0.81      0.81      0.80      5823

Accuracy:  0.8140133951571354
f-measure = 0.8140133951571356


Saving the model

In [7]:
def getSavedModelPath(dirname, filename):
  """Build the absolute path `<base>/savedModels/<dirname>/<filename>`.

  `<base>` is the directory two levels above the current working directory.
  """
  base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
  return os.path.join(base_dir, "savedModels", dirname, filename)

# Pickle the selected SVM to savedModels/posts/SVM.pkl.
model_path = getSavedModelPath("posts","SVM.pkl")
# Create the target folder first: open(..., 'wb') raises FileNotFoundError when
# the directory is missing, which would discard the trained model.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(bestLinearClassifier,f)

Learning Curve

In [8]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Learning curve of the selected SVM: accuracy under 10-fold cross-validation,
# evaluated at 50 training-set sizes spaced evenly from 1% to 100%.
train_size, train_score, test_scores = learning_curve(bestLinearClassifier, X_train, y_train, cv = 10, scoring='accuracy', n_jobs=1, train_sizes=np.linspace(0.01, 1, 50), verbose=1)
# Average over axis 1 to get one mean value per training-set size.
mean_train = np.mean(train_score , axis = 1)
mean_test = np.mean(test_scores , axis = 1)
# The plotted quantity is 1 - accuracy, i.e. the error rate, so the legend and
# y-axis are labelled as errors rather than scores.
plt.plot(train_size, 1-mean_train, label='Training error')
plt.plot(train_size, 1-mean_test, label='Cross-validation error')
plt.xlabel('Training Size')
plt.ylabel('Error (1 - accuracy)')
plt.title('Learning Curve')
plt.legend()
plt.show()

[learning_curve] Training set sizes: [  157   474   792  1110  1427  1745  2062  2380  2698  3015  3333  3651
  3968  4286  4604  4921  5239  5556  5874  6192  6509  6827  7145  7462
  7780  8097  8415  8733  9050  9368  9686 10003 10321 10638 10956 11274
 11591 11909 12227 12544 12862 13179 13497 13815 14132 14450 14768 15085
 15403 15721]


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
