In [43]:
# libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import scale

letters = pd.read_csv("letter-recognition.csv")

In [44]:
print("Dimensions: ", letters.shape, "\n")
print(letters.info())
letters.head()

Dimensions:  (20000, 17) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   letter  20000 non-null  object
 1   xbox    20000 non-null  int64 
 2   ybox    20000 non-null  int64 
 3   width   20000 non-null  int64 
 4   height  20000 non-null  int64 
 5   onpix   20000 non-null  int64 
 6   xbar    20000 non-null  int64 
 7   ybar    20000 non-null  int64 
 8   x2bar   20000 non-null  int64 
 9   y2bar   20000 non-null  int64 
 10  xybar   20000 non-null  int64 
 11  x2ybar  20000 non-null  int64 
 12  xy2bar  20000 non-null  int64 
 13  xedge   20000 non-null  int64 
 14  xedgey  20000 non-null  int64 
 15  yedge   20000 non-null  int64 
 16  yedgex  20000 non-null  int64 
dtypes: int64(16), object(1)
memory usage: 2.6+ MB
None


Unnamed: 0,letter,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [45]:
letters.columns = ['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar', 'ybar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge', 'xedgey', 'yedge', 'yedgex']
print(letters.columns)

Index(['letter', 'xbox', 'ybox', 'width', 'height', 'onpix', 'xbar', 'ybar',
       'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar', 'xedge', 'xedgey',
       'yedge', 'yedgex'],
      dtype='object')


In [46]:
order = list(np.sort(letters['letter'].unique()))
print(order)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [47]:
letter_means = letters.groupby('letter').mean()
letter_means.head()

Unnamed: 0_level_0,xbox,ybox,width,height,onpix,xbar,ybar,x2bar,y2bar,xybar,x2ybar,xy2bar,xedge,xedgey,yedge,yedgex
letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
A,3.337136,6.975919,5.12801,5.178707,2.991128,8.851711,3.631179,2.755387,2.043093,7.802281,2.338403,8.465146,2.771863,6.321926,2.875792,7.468948
B,3.98564,6.962141,5.088773,5.169713,4.596606,7.671018,7.062663,5.366841,5.571802,7.954308,5.506527,6.652742,3.117493,7.91906,6.612272,9.100522
C,4.03125,7.063859,4.701087,5.296196,2.775815,5.4375,7.627717,5.927989,7.177989,8.773098,7.494565,11.947011,1.991848,8.876359,4.080163,8.555707
D,4.023602,7.24472,5.170186,5.288199,4.026087,7.53913,6.806211,5.921739,6.508075,8.16646,5.111801,5.750311,3.365217,7.813665,3.971429,7.628571
E,3.727865,6.94401,4.75651,5.201823,3.679688,5.966146,7.352865,4.223958,7.585938,8.507812,6.242188,10.341146,2.127604,8.298177,6.022135,8.50651


In [48]:
round(letters.drop('letter', axis=1).mean(), 2)

xbox      4.02
ybox      7.04
width     5.12
height    5.37
onpix     3.51
xbar      6.90
ybar      7.50
x2bar     4.63
y2bar     5.18
xybar     8.28
x2ybar    6.45
xy2bar    7.93
xedge     3.05
xedgey    8.34
yedge     3.69
yedgex    7.80
dtype: float64

In [49]:
X = letters.drop("letter", axis = 1)
y = letters['letter']

In [50]:
X_scaled = scale(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.3, random_state = 101)

In [51]:
print(y_train)

10593    H
18903    N
6491     X
17136    V
19651    Z
        ..
5695     K
8006     P
17745    P
17931    U
13151    M
Name: letter, Length: 14000, dtype: object


In [52]:
import random
print(type(X_train))
print((y_train.to_numpy()))

<class 'numpy.ndarray'>
['H' 'N' 'X' ... 'P' 'U' 'M']


In [53]:
import math
def markov_samp(X_train, Y_train, k = 5, q = 1.2):

    # Init_sampl = np.concatenate((X_train, np.array()), axis = 0)
    Init_sampl = np.concatenate((np.vstack(X_train), np.vstack(Y_train.to_numpy())), axis = 1)
    Dtr = random.sample(list(Init_sampl), 8000)

    m = len(Dtr)
    print("M : " , m)
    # mplus =np.count_nonzero(np.array(Dtr[16]) == 'A')
    # mplus = Dtr[16].count('A')
    mneg = 0
    mplus = 0

    index = np.random.choice(len(Dtr), 1, replace=False)  
    Dtra = np.array(Dtr)
    print(type(Dtra))
    zt = Dtra[index][0]

    print("zt : ", zt)
    if m%2 == 0:
        if zt[16] == 'A':
            mplus += 1;
        else:
            mneg += 1
    samp = []

    model_linear = SVC(kernel='linear')#
    model_linear.fit(X_train, Y_train)#
    
    yzt = model_linear.predict(np.array(zt)[0:16].reshape(1, -1))
    fxy1 = 1
    if not yzt==zt[16]:
        fxy1 = 2 
    lzt = math.exp(0-fxy1)

    Pd = 0
    Pdd = 0
    
    while(mplus + mneg < m ):
        zstar = Dtra[np.random.choice(len(Dtr), 1, replace=False)][0]
        ystar = model_linear.predict(np.array(zstar)[ 0:16].reshape(1, -1)) #
        fxy = 1
        if not ystar==zstar[16]:
            fxy = 2 
        lzstar = math.exp(0-fxy)
        P = math.exp(lzt-lzstar)
        # print(zt)
        yt = zt[16]
        zt = zstar

        if P == 1:
            if zt[16] == yt:
                r = random.uniform(0.001, 1.0)
                if r <= P:
                    samp.append(zstar) #Pdash = e−y∗f0 /e−ytf0
            else:
                ct = 1 if yt=='A' else -1
                cst = 1 if ystar=='A' else -1
                Pd = math.exp(ct*fxy1-cst*fxy)
                r = random.uniform(0.001, 1.0)
                if r <= Pd:
                    samp.append(zstar) #P

        if len(samp) == k:
            Pdd = q*P
            samp.append(zstar) #Pdd
            
        ztp1 = zstar
        if yt == 'A':
            mplus += 1
        else:
            mneg += 1

        if P > 1 or Pd > 1 or Pdd > 1:
            samp.append(zstar)
    return samp


In [54]:
nsamp = np.array(markov_samp(X_train, y_train))
print(nsamp.shape)
X_train = nsamp[:, 0:16]
y_train = nsamp[:, 16]
print(X_train.shape)
print(y_train.shape)

M :  8000
<class 'numpy.ndarray'>
zt :  ['-1.0576982957256178' '-0.91860344630199' '-0.556881230443966'
 '-1.0491374149880448' '-0.6874761975053775' '0.5441304509324318'
 '-0.21521994542421233' '-0.23282340548628083' '0.34499438865456755'
 '0.6903798536167043' '-0.5526406630296021' '-0.9271511127708227'
 '-0.4484921035696958' '-0.21908162939303566' '-0.26947711190847695'
 '0.12291106712560491' 'D']
(14656, 17)
(14656, 16)
(14656,)


In [55]:
# print(y_train)
Y_train = []
for i in y_train:
    # print(i)
    Y_train.append(ord(i))
# print(Y_train)

In [56]:
print(X_test.shape)

(6000, 16)


In [57]:
# linear model

model_linear = SVC(kernel='linear')
model_linear.fit(X_train, Y_train)

# predict
y_pred = model_linear.predict(X_test)


In [58]:
# print(y_pred)
Y_test = []
for i in y_test:
    Y_test.append(ord(i))
# print(Y_test)

In [59]:
# accuracy
print("Accuracy linear kernel :", metrics.accuracy_score(y_true=Y_test, y_pred=y_pred), "\n")
print(metrics.confusion_matrix(y_true=Y_test, y_pred=y_pred))

Accuracy linear kernel : 0.8383333333333334 

[[197   0   0   0   0   0   0   0   0   1   1   1   0   0   1   0   0   2
    0   2   1   0   0   0   3   0]
 [  0 185   0   4   0   1   4   0   0   0   0   1   0   1   0   0   0  18
    2   0   0   1   0   1   0   0]
 [  5   0 188   0  13   0  10   3   0   0   5   1   0   0   3   0   0   0
    0   0   2   0   0   0   0   0]
 [  1  12   0 209   0   0   2   1   0   2   1   0   0   5   3   0   2   7
    0   1   2   0   0   0   0   0]
 [  0   0   3   0 200   1  12   0   0   0   1   6   0   0   0   0   0   1
    0   4   0   0   0   2   0   4]
 [  0   3   0   2   0 193   1   1   2   0   0   0   0   2   0   6   0   0
    4   5   1   0   1   0   4   0]
 [  0   0  13   5   8   2 168   0   0   0   4   2   1   0   3   0   7   0
    4   0   0   1   1   0   0   0]
 [  0   4   1   8   0   2   4 143   0   0   6   1   1   1   8   0   6  18
    0   2   6   5   0   0   1   0]
 [  0   0   1   3   0   8   1   0 180   5   0   1   0   0   1   0   1   0
    8   

In [60]:
non_linear_model = SVC(kernel='rbf')
non_linear_model.fit(X_train, Y_train)
y_pred = non_linear_model.predict(X_test)

In [61]:
print("Accuracy rbf kernel :", metrics.accuracy_score(y_true=Y_test, y_pred=y_pred), "\n")
print(metrics.confusion_matrix(y_true=Y_test, y_pred=y_pred))

Accuracy rbf kernel : 0.9111666666666667 

[[204   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   1
    0   0   0   0   0   0   3   0]
 [  0 202   0   7   1   0   1   0   0   0   0   0   0   0   0   0   0   6
    0   0   0   0   0   1   0   0]
 [  0   0 209   0   9   0   5   1   0   0   0   0   0   0   5   0   0   0
    0   0   1   0   0   0   0   0]
 [  0   3   0 227   0   0   1   2   0   2   0   0   1   4   1   0   0   5
    1   0   0   0   0   0   0   1]
 [  0   0   0   0 215   1  10   0   0   0   1   0   0   0   0   0   0   1
    0   1   0   0   0   2   0   3]
 [  0   1   0   1   0 208   1   0   1   0   0   0   0   1   0   4   0   0
    2   5   0   0   1   0   0   0]
 [  0   0   3   5   1   1 199   0   0   0   2   1   1   0   3   0   0   1
    0   0   0   0   2   0   0   0]
 [  0   7   1  10   0   1   4 163   0   0   4   0   2   0   5   0   5  13
    0   0   1   0   1   0   0   0]
 [  0   0   2   2   0   8   0   0 187   6   1   0   0   0   0   1   1   0
    3   0  

In [62]:
non_linear_model = SVC(kernel='poly')
non_linear_model.fit(X_train, Y_train)
y_pred = non_linear_model.predict(X_test)

In [63]:
print("Accuracy polynomial kernel:", metrics.accuracy_score(y_true=Y_test, y_pred=y_pred), "\n")
print(metrics.confusion_matrix(y_true=Y_test, y_pred=y_pred))

Accuracy polynomial kernel: 0.8526666666666667 

[[195   0   0   0   0   0   0   1   0   0   0   1   1   2   3   0   0   3
    0   0   1   0   0   1   1   0]
 [  0 186   0   4   1   0   1   2   0   0   0   0   0   0  19   0   0   3
    0   0   0   0   0   1   1   0]
 [  0   0 192   0  12   0   4   0   0   0   0   0   0   0  19   0   0   0
    0   0   0   0   0   3   0   0]
 [  1   5   0 213   0   0   0   1   0   1   0   0   0   4  15   0   0   7
    1   0   0   0   0   0   0   0]
 [  0   0   0   0 199   0   9   0   0   0   0   0   0   0   7   0   1   0
    0   1   0   0   0  15   0   2]
 [  0   1   0   3   0 200   0   2   0   0   0   0   1   1   4   3   0   0
    2   3   0   0   0   5   0   0]
 [  0   0   3   4   1   2 172   0   0   0   2   1   1   0  30   0   1   0
    0   0   0   0   2   0   0   0]
 [  0   3   1   8   1   1   3 119   0   1   4   1   1   0  57   1   2   8
    0   0   0   0   1   5   0   0]
 [  0   0   0   2   0   7   1   0 183   5   1   0   0   0   4   1   0   0
    6