In [4]:
##imports
import joblib

import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer


In [5]:
## Load the training and testing CSVs; the first CSV column is used as the row index.
trainingData, testingData = (
    pd.read_csv(csv_name, index_col=0) for csv_name in ("train.csv", "test.csv")
)

In [6]:
## Number of leading training rows used when fitting
## (the original note says a smaller number was used at first).
batch_size = 320 * 1000

In [7]:
## Turn the raw OCR text into TF-IDF features.
# Only the first `batch_size` training rows are used; English stop words are dropped.
train_subset = trainingData[:batch_size]
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(train_subset['OCR text'])

In [8]:
## Target labels for the same leading `batch_size` rows that X is built from.
train_rows = trainingData[:batch_size]
y = train_rows["Label"]

In [9]:
## Show the sparse TF-IDF matrix as "(row, column)  weight" entries.
print(str(X))

  (0, 1023804)	0.17356085231708732
  (0, 3902906)	0.11006352215199676
  (0, 1994521)	0.17368667202763452
  (0, 1105205)	0.10576162854433116
  (0, 486789)	0.2621773100694327
  (0, 613354)	0.2228816039914777
  (0, 615186)	0.20749094914032148
  (0, 3269928)	0.12909278246115696
  (0, 1565801)	0.10914941928698993
  (0, 2774797)	0.0990143902998192
  (0, 3522610)	0.10883672516232283
  (0, 2869639)	0.09994235132014762
  (0, 3269883)	0.22967632373744243
  (0, 1154145)	0.1358293341451583
  (0, 640366)	0.16160764306247738
  (0, 3387834)	0.07002938197734061
  (0, 1271309)	0.08842233560437013
  (0, 1111183)	0.11582977767520121
  (0, 3280571)	0.24496235065522273
  (0, 346899)	0.2119889565398721
  (0, 3387681)	0.08155898259578934
  (0, 2918596)	0.21840229149025575
  (0, 785441)	0.2460772720436303
  (0, 362384)	0.22574311148663473
  (0, 2169322)	0.2748617352756527
  :	:
  (319999, 1347754)	0.3541882361429565
  (319999, 2597553)	0.3541882361429565
  (319999, 378591)	0.2831840859748654
  (319999, 230298

In [None]:
## Inspect the learned vocabulary.
# Only the size and a short sample are printed: the column indices shown for X
# run into the millions, so dumping every term would flood the notebook output.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older versions.
_get_feature_names = (
    getattr(vectorizer, "get_feature_names_out", None)
    or vectorizer.get_feature_names
)
feature_names = _get_feature_names()
print("Vocabulary size:", len(feature_names))
print(list(feature_names[:20]))

In [None]:
## Dimensions of the TF-IDF matrix.
matrix_shape = X.shape
print(matrix_shape)

In [10]:
## Creating the pipeline for the SVM (a linear model trained with SGD)

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

# random_state pins SGD's data shuffling so repeated runs (and the
# cross-validation scores built on this pipeline) are reproducible.
# NOTE(review): max_iter caps the number of passes over the data, not the
# number of samples; 320000 equals batch_size - confirm this is intended.
svm_pipeline = Pipeline([
    ('svm', SGDClassifier(max_iter=320000, random_state=42)),
])

In [None]:
## Hyper-parameter grid for the "svm" pipeline step, searched exhaustively

import numpy as np
from sklearn.model_selection import GridSearchCV

# NOTE(review): later scikit-learn releases renamed loss "log" to "log_loss"
# and replaced penalty "none" with None - confirm against the installed version.
grid_params = dict(
    svm__loss=["hinge", "log", "squared_hinge", "modified_huber"],
    svm__alpha=[0.0001, 0.001, 0.01, 0.1],
    svm__penalty=["l2", "l1", "none"],
)

# verbose=10 logs every individual fit; cross-validation settings are left at their defaults.
clf = GridSearchCV(estimator=svm_pipeline, param_grid=grid_params, verbose=10)
clf.fit(X, y)

for caption, best_value in (
    ("Best Score: ", clf.best_score_),
    ("Best Params: ", clf.best_params_),
):
    print(caption, best_value)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l2 .............
[CV]  svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l2, score=0.723, total=  24.5s
[CV] svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l2 .............
[CV]  svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l2, score=0.724, total=  24.0s
[CV] svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l2 .............
[CV]  svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l2, score=0.725, total=  23.8s
[CV] svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l2 .............
[CV]  svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l2, score=0.725, total=  24.3s
[CV] svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l2 .............
[CV]  svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l2, score=0.726, total=  23.5s
[CV] svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l1 .............
[CV]  svm__alpha=0.0001, svm__loss=hinge, svm__penalty=l1, score=0.557, total=  47

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   24.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   48.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.0min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  2.8min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  3.6min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.2min remaining:    0.0s


In [None]:
## Fitting the final model - modify the tuning parameters here based on the GridSearchCV results
from sklearn.linear_model import SGDClassifier

# random_state pins SGD's data shuffling so the saved model can be reproduced.
clf = SGDClassifier(alpha=0.0001, loss='hinge', penalty='none', random_state=42)

clf.fit(X, y)

# Persist the fitted vectorizer next to the classifier: the classifier's
# feature columns are only meaningful with the exact vocabulary and IDF
# weights it was trained on, so the model file alone cannot score new text.
joblib.dump(clf, 'svm.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

In [None]:
## Testing data based on the fitted model

from tqdm import tqdm  # no longer used in this cell; import kept in case other cells rely on it

# Prediction and accuracy
from sklearn.metrics import accuracy_score

documents1 = testingData["OCR text"]
y_true = testingData["Label"]

# Reuse the vocabulary fitted on the training data (transform only, no re-fit).
# Named X_test rather than `input` so the builtin input() is not shadowed.
X_test = vectorizer.transform(documents1)
prediction1 = clf.predict(X_test)

# Side-by-side comparison with named columns. The previous call passed the
# predictions as the second positional argument of pd.DataFrame, which is
# `index`, so they became row labels instead of a column.
dataframe = pd.DataFrame({"actual": y_true, "predicted": prediction1})

# Vectorised replacement for the manual counting loop.
count = int((dataframe["actual"] == dataframe["predicted"]).sum())

print("Accuracy:", accuracy_score(y_true, prediction1))
print("Correct predictions:", count, "of", len(dataframe))
print(dataframe)