# Download Data

In [3]:
!pip install kaggle



In [4]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"mannokat","key":"e8b5b50982a37478ca3aec18d281fed5"}'}

In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

!chmod 600 /root/.kaggle/kaggle.json

In [6]:
!kaggle datasets download -d crawford/emnist


Downloading emnist.zip to /content
 99% 1.23G/1.24G [00:14<00:00, 86.4MB/s]
100% 1.24G/1.24G [00:14<00:00, 90.6MB/s]


In [7]:
from zipfile import ZipFile
file_name = "emnist.zip"
with ZipFile(file_name,'r') as zip:
  zip.extractall()
  print('Done')

Done


# Libraries

In [8]:
import numpy as np
import pandas as pd

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory



In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import validation_curve
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Dataset

In [10]:
train_data = pd.read_csv("/content/emnist-letters-train.csv", header = None)
test_data = pd.read_csv("/content/emnist-letters-test.csv", header = None)
mapping = pd.read_csv("/content/emnist-letters-mapping.txt", sep = ' ', header = None)

# Model

In [11]:
train_data.rename(columns= {0: 'label'}, inplace = True)
test_data.rename(columns= {0: 'label'}, inplace = True)
train_data.columns

Index(['label',       1,       2,       3,       4,       5,       6,       7,
             8,       9,
       ...
           775,     776,     777,     778,     779,     780,     781,     782,
           783,     784],
      dtype='object', length=785)

In [12]:
train_data['label'].unique()

array([23,  7, 16, 15, 17, 13, 11, 22, 24, 10, 14, 18, 21, 26, 19,  5,  2,
       25,  9, 12,  1,  8,  4,  3, 20,  6])

In [13]:
map_dict = {}
asc_code = mapping[1].values
for i, v in enumerate(asc_code):
    map_dict[i] = chr(v)

map_dict

{0: 'A',
 1: 'B',
 2: 'C',
 3: 'D',
 4: 'E',
 5: 'F',
 6: 'G',
 7: 'H',
 8: 'I',
 9: 'J',
 10: 'K',
 11: 'L',
 12: 'M',
 13: 'N',
 14: 'O',
 15: 'P',
 16: 'Q',
 17: 'R',
 18: 'S',
 19: 'T',
 20: 'U',
 21: 'V',
 22: 'W',
 23: 'X',
 24: 'Y',
 25: 'Z'}

In [14]:
train_data

Unnamed: 0,label,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,16,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88795,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
88796,21,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
88797,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
88798,23,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
acceptable_range = np.arange(0, 36)
train_data = train_data[train_data["label"].isin(acceptable_range)]
test_data = test_data[test_data["label"].isin(acceptable_range)]
np.sort(train_data.label.unique())

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26])

In [16]:
train_data.isnull().sum().head(10)

label    0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
dtype: int64

In [17]:
train_data.reset_index(inplace = True)
train_data

Unnamed: 0,index,label,1,2,3,4,5,6,7,8,...,775,776,777,778,779,780,781,782,783,784
0,0,23,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,15,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,23,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88795,88795,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
88796,88796,21,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
88797,88797,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
88798,88798,23,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
test_data.describe()

Unnamed: 0,label,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
count,14800.0,14800.0,14800.0,14800.0,14800.0,14800.0,14800.0,14800.0,14800.0,14800.0,...,14800.0,14800.0,14800.0,14800.0,14800.0,14800.0,14800.0,14800.0,14800.0,14800.0
mean,9.756757,0.0,0.0,0.0,0.0,0.000878,0.024122,0.044662,0.018716,0.020676,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,5.344566,0.0,0.0,0.0,0.0,0.106859,1.413554,2.611465,1.1004,1.238657,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,19.0,0.0,0.0,0.0,0.0,13.0,108.0,208.0,106.0,126.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
print(train_data.columns)
print(test_data.columns)

Index(['index', 'label',       1,       2,       3,       4,       5,       6,
             7,       8,
       ...
           775,     776,     777,     778,     779,     780,     781,     782,
           783,     784],
      dtype='object', length=786)
Index(['label',       1,       2,       3,       4,       5,       6,       7,
             8,       9,
       ...
           775,     776,     777,     778,     779,     780,     781,     782,
           783,     784],
      dtype='object', length=785)


In [20]:
round(train_data.drop('label', axis=1).mean(), 2)

index    44399.5
1            0.0
2            0.0
3            0.0
4            0.0
          ...   
780          0.0
781          0.0
782          0.0
783          0.0
784          0.0
Length: 785, dtype: float64

In [21]:
y = train_data['label']

## Dropping the variable 'label' from X variable
X = train_data.drop(columns = 'label')

## Printing the size of data
print(train_data.shape)

(88800, 786)


In [22]:
X = X/255.0
test_data = test_data/255.0

print("X:", X.shape)
print("test_data:", test_data.shape)

X: (88800, 785)
test_data: (14800, 785)


In [25]:
from sklearn.preprocessing import scale
X_scaled = scale(X)

# train test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.2, train_size = 0.8 ,random_state = 42)

# Summary Graph & Accuracy

In [22]:
model_linear = SVC(kernel='linear')
model_linear.fit(X_train, y_train)

y_pred = model_linear.predict(X_test)

In [31]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [23]:
# confusion matrix and accuracy


# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

# cm
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

accuracy: 0.7149211711711712 

[[447   9   3  18  20   5  27  27   0   1  12   0  11  27  16   5  22  11
    3   4  12   1   7   1   2   2]
 [ 19 520   1  22   9   1  21  23   3   7   6   6   4   2   9   2  10   1
    3   3   1   0   3   3   1   9]
 [  5   0 611   1  44   6  14   1   1   2   2   8   1   3  11   2   9   2
    1   2   6   1   0   1   1   2]
 [ 13  53   3 426   3   2   9  16   3  31   7   6   0   2  36   7   1   0
    2   1   8   2   5   3   1   5]
 [ 16   6  86   2 475   4  10   0   1   0  14   1   0   1   2   2   2  23
    3   2   4   3   1   2   1   1]
 [  7   2   2   1   6 533   7   3  11  10   5   4   0   1   0  53  10  21
    1  22   0   1   1   0   4   2]
 [ 38  46  12   8   9  11 359   3   1  14   2   0   7   3   5   7  98   5
   21   7   4   2   5   4   4   4]
 [ 19  17   1  14   1   2   2 472  16   4  32  17   6  19   0   0   0   1
    0   4   3   1   2   3   3   0]
 [  1   2   0   1   0   2   5   0 465  25   0 160   0   0   0   1   5   3
    1   6   0   2   0  

In [24]:
# model
non_linear_model = SVC(kernel='rbf')

# fit
non_linear_model.fit(X_train, y_train)

# predict
y_pred = non_linear_model.predict(X_test)

# confusion matrix and accuracy

# accuracy
print("accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred), "\n")

# cm
print(metrics.confusion_matrix(y_true=y_test, y_pred=y_pred))

accuracy: 0.8404842342342342 

[[562   3   0   6   6   4   9   9   0   0   9   0   6  14  12   7  18   7
    0   1   4   0  13   0   1   2]
 [  7 593   0  12   5   0   7  12   1   1   1   3   0   4   8   4   8   2
    0   0   0   0  14   0   1   6]
 [  6   1 666   1  19   6   3   0   2   1   2   5   0   0   7   0   7   5
    1   0   1   0   3   1   0   0]
 [  5  14   0 535   1   0   1   3   1   8   4   7   0   2  35   3   2   0
    1   1   2   1  17   0   1   1]
 [  9   4  35   1 589   4   1   0   0   1   3   1   1   0   0   0   3   4
    0   2   0   0   2   0   0   2]
 [  3   1   0   1   1 593   2   2   5   4   0   5   0   0   0  35   8  11
    0  13   0   0  14   1   7   1]
 [ 26  20   4   4   4   1 441   1   0   5   1   0   2   1   4   2 118   2
   11   4   3   2  13   0   4   6]
 [  4  10   0   8   0   1   0 537   1   0  19  17   3  17   0   0   2   1
    0   3   2   1   9   2   2   0]
 [  1   0   0   1   0   4   2   0 482  17   0 165   0   0   0   0   0   3
    0   4   0   0   4  

In [None]:
# model with optimal hyperparameters

# model
model = SVC(C=10, gamma=0.001, kernel="rbf")

model.fit(X_train, y_train)
y_pred = model.predict(X_test)



In [32]:
# metrics
print("accuracy", metrics.accuracy_score(y_test, y_pred), "\n")
print(metrics.confusion_matrix(y_test, y_pred), "\n")

accuracy 0.893186936936937 

[[612   2   1   4   4   2  10   6   0   1   2   0   6   9   3   2  12   1
    0   3   2   0   9   0   2   2]
 [  5 576   1   7   1   1  13   5   0   3   1   1   3   1   2   2   1   1
    0   0   2   0  10   0   0   1]
 [  3   1 640   1  14   0   3   0   1   0   0   3   0   1   6   2   1   4
    1   0   1   0   5   2   0   0]
 [  5  13   0 608   0   0   2   2   0   7   0   3   0   3  17   4   3   1
    0   1   2   2   7   1   2   0]
 [  4   3  12   0 633   5   1   0   0   0   0   2   0   0   1   2   0   4
    0   1   0   0   1   0   0   7]
 [  2   2   0   1   0 613   3   2   1   1   1   0   0   2   0  17   3   3
    1  18   0   0   6   0   0   0]
 [ 15  14   1   1   2   3 515   1   0   3   2   0   2   0   3   3  82   0
    3   1   1   2   8   0   3   4]
 [  6   5   0   2   0   1   1 619   2   0   7   7   7  15   0   0   0   0
    0   1   4   1   3   1   1   0]
 [  0   0   0   1   1   1   1   2 516  14   1 132   0   0   0   0   2   0
    0   2   0   0   2   0