# PERFORMING OCR WITH SVMs

Rosa Karina Torres Calderon

In [4]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC 
from sklearn import metrics

Functions

In [12]:
def measure_performance(
    X,
    y,
    clf,
    show_accuracy=True,
    show_classification_report=True,
    show_confusion_matrix=True,
):
    """Print evaluation summaries for a classifier's predictions on ``X``.

    Predictions are obtained once via ``clf.predict(X)`` and compared with
    ``y``. Each section below is printed only when its flag is truthy.

    Parameters
    ----------
    X : features, passed unchanged to ``clf.predict``.
    y : true labels, compared against the predictions.
    clf : object exposing a ``predict`` method.
    show_accuracy : print ``metrics.accuracy_score`` with three decimals.
    show_classification_report : print ``metrics.classification_report``.
    show_confusion_matrix : print a ``pd.crosstab`` of true vs. predicted
        labels, including the ``All`` margins.

    Returns
    -------
    None. Everything is written to standard output.
    """
    predicted = clf.predict(X)

    if show_accuracy:
        accuracy = metrics.accuracy_score(y, predicted)
        print("Accuracy:{0:.3f}".format(accuracy), "\n")

    if show_classification_report:
        print("Classification report")
        report = metrics.classification_report(y, predicted)
        print(report, "\n")

    if show_confusion_matrix:
        print('Matriz de confusión')
        print('')
        # Rows are the true labels, columns the predicted ones.
        confusion = pd.crosstab(
            y,
            predicted,
            rownames=['True'],
            colnames=['Predicted'],
            margins=True,
        )
        print(confusion)

In [16]:
def count_values_classification(y_pred, y, size=None):
    """Count matching and non-matching predictions among the first ``size`` samples.

    Both inputs are converted to NumPy arrays and compared position by
    position, so any index carried by a pandas Series is ignored. This keeps
    the comparison correct when ``y_pred`` or ``y`` is a Series whose index
    does not start at 0 (for example a slice of a larger frame).

    Parameters
    ----------
    y_pred : array-like of predicted labels.
    y : array-like of true labels.
    size : int, optional
        Number of leading samples to compare. Defaults to ``len(y_pred)``.
        Values less than or equal to zero compare nothing.

    Returns
    -------
    list
        ``[n_equal, n_different]`` as plain Python ints.

    Raises
    ------
    IndexError
        If ``size`` exceeds the length of ``y_pred`` or ``y``.
    """
    predicted = np.asarray(y_pred)
    actual = np.asarray(y)

    if size is None:
        size = len(predicted)
    size = int(size)

    # An empty range compared nothing in the loop-based version; keep that.
    if size <= 0:
        return [0, 0]

    # Slicing would silently truncate, so reject an oversized request explicitly.
    if size > len(predicted) or size > len(actual):
        raise IndexError(
            "size ({0}) exceeds the number of available samples "
            "(y_pred: {1}, y: {2})".format(size, len(predicted), len(actual))
        )

    true_values = int(np.count_nonzero(actual[:size] == predicted[:size]))
    false_values = size - true_values
    return [true_values, false_values]

# Step 1- Reading the data set

In [5]:
# Read letterdata.csv into a DataFrame, then print its first five rows,
# a spacer line, and the frame's (rows, columns) shape.
letterdata = pd.read_csv("letterdata.csv")
print(
    letterdata.head(),
    " ",
    'Tamaño del data frame: ' + str(letterdata.shape),
    sep="\n",
)

  letter  xbox  ybox  width  height  onpix  xbar  ybar  x2bar  y2bar  xybar  \
0      T     2     8      3       5      1     8    13      0      6      6   
1      I     5    12      3       7      2    10     5      5      4     13   
2      D     4    11      6       8      6    10     6      2      6     10   
3      N     7    11      6       6      3     5     9      4      6      4   
4      G     2     1      3       1      1     8     6      6      6      6   

   x2ybar  xy2bar  xedge  xedgey  yedge  yedgex  
0      10       8      0       8      0       8  
1       3       9      2       8      4      10  
2       3       7      3       7      3       9  
3       4      10      6      10      2       8  
4       5       9      1       7      5      10  
 
Tamaño del data frame: (20000, 17)


# Step 2- Exploring and preparing the data

In [6]:
# Print the dtype of every column of `letterdata` to see which columns are
# numeric and which hold Python objects.
print(letterdata.dtypes)

letter    object
xbox       int64
ybox       int64
width      int64
height     int64
onpix      int64
xbar       int64
ybar       int64
x2bar      int64
y2bar      int64
xybar      int64
x2ybar     int64
xy2bar     int64
xedge      int64
xedgey     int64
yedge      int64
yedgex     int64
dtype: object


In [15]:
# value_counts() has one entry per distinct value of the `letter` column, so
# the printed shape tuple shows how many different letters the data contains.
print('Número de letras: ' + str(letterdata['letter'].value_counts().shape))

Número de letras: (26,)


In [7]:
# Separate the label column (position 0) from the remaining feature columns.
# Note: `letterdata_train` holds the features of *all* rows; the train/test
# division happens in a later step.
letterdata_target, letterdata_train = letterdata.iloc[:, 0], letterdata.iloc[:, 1:]

In [8]:
# Split the data set into train and test sets.
# The first TRAIN_SIZE rows are used for training and the remaining rows for
# testing. Rows are taken in file order; no shuffling is applied here.
TRAIN_SIZE = 16000

X_train = letterdata_train.iloc[:TRAIN_SIZE, :]
X_test = letterdata_train.iloc[TRAIN_SIZE:, :]
y_train = letterdata_target.iloc[:TRAIN_SIZE]
y_test = letterdata_target.iloc[TRAIN_SIZE:]

# Step 3- Training a Model on the data

In [9]:
# Fit a support vector classifier with a linear kernel on the training split.
# The fit call is the cell's last expression, so the fitted estimator's repr
# is displayed as the cell output.
model = SVC(kernel='linear')
model.fit(X=X_train, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

# Step 4- Evaluating model performance

In [13]:
# Print accuracy, classification report and confusion matrix for the model.
# NOTE(review): X_train / y_train are the same rows the model was fitted on,
# so these figures describe the fit to the training data, not held-out
# performance.
measure_performance(
    X_train,
    y_train,
    model,
    show_accuracy=True,
    show_classification_report=True,
    show_confusion_matrix=True,
)

Accuracy:0.875 

Classification report
              precision    recall  f1-score   support

           A       0.94      0.97      0.96       633
           B       0.81      0.89      0.85       630
           C       0.89      0.90      0.89       594
           D       0.81      0.91      0.86       638
           E       0.84      0.86      0.85       616
           F       0.84      0.90      0.87       622
           G       0.75      0.79      0.77       609
           H       0.71      0.69      0.70       583
           I       0.91      0.88      0.90       590
           J       0.90      0.89      0.89       599
           K       0.83      0.86      0.85       593
           L       0.93      0.89      0.91       604
           M       0.94      0.94      0.94       648
           N       0.96      0.93      0.94       617
           O       0.88      0.79      0.84       614
           P       0.95      0.88      0.91       635
           Q       0.88      0.82      0.8

In [15]:
# Predict the held-out rows and report how many predictions agree with the
# true labels (values[0]) and how many do not (values[1]).
y_pred = model.predict(X_test)
values = count_values_classification(y_pred, y_test, len(y_pred))
print(
    'True values: ' + str(values[0]),
    'False values: ' + str(values[1]),
    sep='\n',
)

True values: 3356
False values: 644


# Step 5- Improving model performance

In [17]:
# Train the model with an RBF kernel.
# gamma is pinned to 'auto' (1 / n_features): the default for `gamma` differs
# between scikit-learn releases, and leaving it implicit makes the fitted
# model - and therefore the scores below - depend on the installed version.
# NOTE: this rebinds `model`, replacing the linear-kernel classifier.
model = SVC(kernel='rbf', gamma='auto')
model.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [18]:
# Print accuracy, classification report and confusion matrix for the
# RBF-kernel model.
# NOTE(review): X_train / y_train are the same rows the model was fitted on,
# so these figures describe the fit to the training data, not held-out
# performance.
measure_performance(
    X_train,
    y_train,
    model,
    show_accuracy=True,
    show_classification_report=True,
    show_confusion_matrix=True,
)

Accuracy:0.994 

Classification report
              precision    recall  f1-score   support

           A       1.00      1.00      1.00       633
           B       0.98      0.99      0.99       630
           C       1.00      0.99      1.00       594
           D       0.99      1.00      0.99       638
           E       0.99      1.00      0.99       616
           F       0.99      1.00      0.99       622
           G       0.99      0.99      0.99       609
           H       0.99      0.98      0.99       583
           I       1.00      0.97      0.98       590
           J       0.97      0.99      0.98       599
           K       1.00      0.99      0.99       593
           L       1.00      1.00      1.00       604
           M       1.00      1.00      1.00       648
           N       1.00      0.99      1.00       617
           O       0.99      1.00      1.00       614
           P       1.00      0.99      0.99       635
           Q       0.99      1.00      1.0

In [19]:
# Predict the held-out rows with the current model and report how many
# predictions agree with the true labels (values[0]) and how many do not
# (values[1]).
y_pred = model.predict(X_test)
values = count_values_classification(y_pred, y_test, len(y_pred))
print(
    'True values: ' + str(values[0]),
    'False values: ' + str(values[1]),
    sep='\n',
)

True values: 3889
False values: 111
