In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory,
# then print them one per line.
input_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
# NOTE(review): `scipy.ndimage.interpolation` is a deprecated namespace in
# recent SciPy releases; `from scipy.ndimage import shift` is the supported path.
from scipy.ndimage.interpolation import shift
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.model_selection import GroupShuffleSplit

#importing Classifiers
from sklearn.svm import SVC

#F Score and other metrics
from sklearn import metrics
from sklearn.metrics import f1_score, confusion_matrix

# Exploring the dataset

In [None]:
# Directory holding the competition CSV files. Keeping the location in a
# single constant means the notebook can be pointed at another copy of the
# data by editing one line.
DATA_DIR = "/kaggle/input/digit-recognizer"

# Training frame (contains the 'label' column) and the frame to predict on.
digits = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_digits = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [None]:
# Display the first rows of the training frame to check how it was parsed.
digits.head()

In [None]:
# Display the first rows of the test frame to compare its columns with the training frame.
test_digits.head()

In [None]:
# Print a summary of the training frame (row count, column dtypes, memory usage).
digits.info()

In [None]:
# Print a summary of the test frame (row count, column dtypes, memory usage).
test_digits.info()

In [None]:
# Look for rows of the training dataset that exactly repeat an earlier row.
# `duplicated()` flags every repeat after the first occurrence.
train_duplicate_mask = digits.duplicated()
duplicate_rows = digits.loc[train_duplicate_mask]
print(duplicate_rows)

In [None]:
# Look for rows of the test dataset that exactly repeat an earlier row.
# `duplicated()` flags every repeat after the first occurrence.
test_duplicate_mask = test_digits.duplicated()
duplicate_rows = test_digits.loc[test_duplicate_mask]
print(duplicate_rows)

In [None]:
# Count how many training rows carry each value of the 'label' column,
# to see how balanced the classes are.
target_counts = digits.loc[:, 'label'].value_counts()
print(target_counts)

# Visualising how images are stored in the dataset

In [None]:
# Take the first row without its leading column and view the remaining
# values as a 28x28 grid.
some_digit = digits.iloc[0:1, 1:].to_numpy()
some_digit_image = np.reshape(some_digit, (28, 28))

# Render the grid as an image without axis ticks.
plt.imshow(some_digit_image, cmap="binary")
plt.axis("off")
plt.show()

# The first column of the same row, printed for comparison with the picture.
first_value = digits.iat[0, 0]
print(f"The first value is {first_value}")

# Splitting the training dataset into Training and Validation set

In [None]:
# Separate the target column from the remaining (feature) columns
y = digits['label']
X = digits.drop('label', axis=1)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical between runs
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four pieces
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

# Training the model 

In [None]:
# Fit an RBF-kernel SVM on the training split and score it on the held-out
# validation split.
svc = SVC(C=4, kernel='rbf')
svc.fit(X_train, y_train)
pred_best = svc.predict(X_val)

accuracy = metrics.accuracy_score(y_val, pred_best)
# NOTE: for single-label multiclass problems the micro-averaged F1 is
# mathematically equal to accuracy, so these two numbers will match.
f_score = f1_score(y_val, pred_best, average='micro')

# The scores are computed on (X_val, y_val), so they are reported as
# validation scores rather than test scores.
print("Accuracy on validation set:", accuracy)
print("F Score on the validation set:", f_score)

# Error Analysis ( Confusion matrix )

In [None]:
# Tabulate validation labels against the model's predictions for them.
conf_mx = confusion_matrix(y_true=y_val, y_pred=pred_best)

In [None]:
# Heat-map of the raw confusion counts (brighter cell = larger count).
# scikit-learn's confusion_matrix puts true labels on the rows and predicted
# labels on the columns, so the axes are labelled to make that explicit.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title("Confusion matrix (counts)")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.matshow(conf_mx, cmap=plt.cm.gray)

In [None]:
# Divide every row of the confusion matrix by that row's total so each row
# holds proportions instead of raw counts; keepdims keeps the totals as a
# column so the division broadcasts row by row.
row_sums = np.sum(conf_mx, axis=1, keepdims=True)
norm_conf_mx = np.divide(conf_mx, row_sums)

In [None]:
# Plot only the mistakes: blank out the diagonal (correct predictions) so the
# off-diagonal error rates are not drowned out by it.
# Work on a copy so `norm_conf_mx` keeps holding the full normalised matrix
# for any other cell that reads it.
error_mx = norm_conf_mx.copy()
np.fill_diagonal(error_mx, 0)

# scikit-learn's confusion_matrix puts true labels on the rows and predicted
# labels on the columns.
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title("Normalised confusion matrix (errors only)")
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.matshow(error_mx, cmap=plt.cm.gray)

By looking at the bright spots in the normalised confusion matrix, we can identify which digits get misclassified most often. For example, 5 and 6 get confused by the classifier, and the same can be said about 4 and 9. There could be several reasons for this, and a lack of training data may be one of them (5 has the fewest training examples). Let's create some additional training examples by shifting the existing images.

# Creating new training examples (data augmentation)

In [None]:
# One-pixel translations applied to every image, as (dx, dy) pairs.
SHIFTS = ((1, 0), (-1, 0), (0, 1), (0, -1))


def shift_image(image, dx, dy, shape=(28, 28)):
    """Return a flattened copy of `image` translated by (dx, dy) pixels.

    Parameters
    ----------
    image : array-like
        Flat pixel vector that can be reshaped to `shape`.
    dx, dy : int
        Offset along the column axis (dx) and the row axis (dy).
    shape : tuple of int, optional
        2-D shape the flat vector is viewed as before shifting
        (defaults to 28x28).

    Returns
    -------
    numpy.ndarray
        The shifted image flattened back to one dimension. Pixels that move
        in from outside the frame are filled with 0.
    """
    shifted_image = shift(image.reshape(shape), [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape(-1)


# Always start from the untouched `digits` frame rather than from `X`/`y`:
# this cell reassigns `X` and `y`, so reading them here would augment the
# already-augmented data every time the cell is re-run.
pixel_columns = digits.columns.drop('label')
X_new = digits[pixel_columns].to_numpy()
y_new = digits['label'].to_numpy()

# One block of shifted copies per direction, each in the original row order.
shifted_blocks = [
    np.array([shift_image(image, dx, dy) for image in X_new])
    for dx, dy in SHIFTS
]

# Layout: original rows first, then the shifted blocks one after another.
# The labels repeat in the same order for every block.
X_augmented = np.concatenate([X_new] + shifted_blocks)
y_augmented = np.tile(y_new, len(SHIFTS) + 1)

assert X_augmented.shape[0] == y_augmented.shape[0] == (len(SHIFTS) + 1) * len(digits)

# Keep the original pixel column names so a model fitted on `X` sees the same
# feature names as `test_digits` at prediction time.
X = pd.DataFrame(X_augmented, columns=pixel_columns)
y = y_augmented

# Splitting the augmented dataset into training and validation sets

In [None]:
# Split the dataset into a train and validation set.
#
# The augmentation cell stacks the original rows first and then appends the
# shifted copies block by block in the same row order, so rows i,
# i + len(digits), i + 2 * len(digits), ... all come from the same image.
# A plain random split would put shifted copies of validation images into the
# training set and inflate the validation score, so every copy of an image is
# kept on the same side of the split.
groups = np.arange(len(X)) % len(digits)
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(splitter.split(X, y, groups=groups))

y_all = np.asarray(y)
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y_all[train_idx], y_all[val_idx]

# No source image may appear on both sides of the split.
assert not np.intersect1d(groups[train_idx], groups[val_idx]).size

# Print the shapes of the train and validation sets
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

# Fitting the model on the new dataset 

In [None]:
# Refit the same RBF-kernel SVM on the current training split and score it
# on the held-out validation split.
svc = SVC(C=4, kernel='rbf')
svc.fit(X_train, y_train)
pred_best = svc.predict(X_val)

accuracy = metrics.accuracy_score(y_val, pred_best)
# NOTE: for single-label multiclass problems the micro-averaged F1 is
# mathematically equal to accuracy, so these two numbers will match.
f_score = f1_score(y_val, pred_best, average='micro')

# The scores are computed on (X_val, y_val), so they are reported as
# validation scores rather than test scores.
print("Accuracy on validation set:", accuracy)
print("F Score on the validation set:", f_score)

In [None]:
# Predict a label for every row of the test frame with the fitted model;
# the bare `y_test` on the last line makes the notebook display the result.
y_test = svc.predict(test_digits)
y_test

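If a feature-name warning is shown above, it can be ignored: it only means that the column names the model saw during fitting differ from the column names of `test_digits`. As long as the pixel columns are in the same order as in the training data, the predictions are unaffected.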

In [None]:
# Number of predictions produced (one per test row).
len(y_test)

In [None]:
# Build the submission table: ImageId is the test row's index shifted to
# start at 1, Label is the model's prediction for that row.
image_ids = test_digits.index + 1
output = pd.DataFrame({'ImageId': image_ids, 'Label': y_test})

# Write the table without the DataFrame index and confirm on stdout.
output.to_csv('digit_identifier.csv', index=False)
print("Your submission was successfully saved")