In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the python Docker image: https://github.com/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory, then print
# them one per line.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import os 
import cv2
import random

In [None]:
# Locations of the competition CSV files; these names are read by the loading cell.
train_path = '/input/digit-recognizer/train.csv'
test_path = '/input/digit-recognizer/test.csv'
submission_path = '/input/digit-recognizer/sample_submission.csv'

In [None]:
# Read the training, test and sample-submission CSVs into DataFrames in one pass,
# in that order.
train_df, test_df, submissions = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, submission_path)
)

In [None]:
# Display the first rows of the training frame as the cell output.
train_df.head()

## The data is in pandas DataFrame format, so let's create a function that takes a DataFrame and plots random images from the dataset.

In [None]:
def show_sample_images(dataframe):
    """Plot up to nine randomly chosen images from ``dataframe`` in a 3x3 grid.

    Every row is treated as one flattened image and reshaped to 28x28 before
    being drawn. When the frame has a ``label`` column, that column is excluded
    from the pixel data and its value is shown as the subplot title.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One image per row. Apart from the optional ``label`` column, each row
        must hold exactly 784 values so it can be reshaped to 28x28.
        The frame passed in is not modified.

    Returns
    -------
    None
        The images are drawn on the current matplotlib figure.
    """
    has_labels = 'label' in dataframe.columns
    labels = dataframe['label'] if has_labels else None
    # `drop` returns a new frame, so the caller's data is left untouched.
    pixels = dataframe.drop(columns='label') if has_labels else dataframe

    # Sample row positions from the frame that was actually passed in (not from
    # any global frame), and never ask for more rows than it contains.
    sample_size = min(9, len(pixels))
    sampled_positions = random.sample(range(len(pixels)), sample_size)

    for slot, position in enumerate(sampled_positions, start=1):
        # Positional lookup keeps this working for frames whose index is not
        # the default 0..n-1 range (e.g. a shuffled train/test split).
        img = pixels.iloc[position].to_numpy().reshape(28, 28)
        plt.subplot(3, 3, slot)
        plt.grid(False)
        plt.axis(False)
        plt.imshow(img)
        if has_labels:
            plt.title(f'Actual Class: {labels.iloc[position]}')


In [None]:
# Plot random samples from the training frame.
show_sample_images(train_df)

In [None]:
# Plot random samples from the test frame.
show_sample_images(test_df)

In [None]:
# Split data into train test set using sklearn's train_test_split 
from sklearn.model_selection import train_test_split

# Hold out 15% of the labelled rows for evaluation. The fixed random_state makes
# the shuffle reproducible, so downstream accuracy numbers are stable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop('label', axis=1),
    train_df['label'],
    test_size=0.15,
    shuffle=True,
    random_state=42,
)
# Show the resulting split sizes as the cell output.
len(X_train), len(X_test), len(y_train), len(y_test)

# Let's create a baseline model using a Support Vector Classifier


In [None]:
from sklearn.svm import SVC
# Baseline: an SVC with default settings, fitted on the training split.
model = SVC()
model.fit(X_train,y_train)

In [None]:
# Evaluate accuracy score
from sklearn.metrics import accuracy_score
# Score the baseline on the held-out split; `svc_score` is reused in the comparison table.
preds = model.predict(X_test)
svc_score = accuracy_score(y_test,preds)
svc_score

Hmm, 97.5% accuracy on the baseline model, great!!

# Let's try other methods that can improve training time and accuracy

In [None]:
from sklearn.decomposition import PCA

# A float n_components in (0, 1) tells PCA to keep as many components as are
# needed to explain that fraction of the variance - here 95%.
pca = PCA(n_components=0.95)

# PCA does not modify its input by default (copy=True), so the split frames are
# passed in directly; no defensive .copy() of the full frames is needed.
# Fit on the training split only, then apply the same projection to the held-out split.
X_train_decom = pca.fit_transform(X_train)
X_test_decom = pca.transform(X_test)

In [None]:
# Display (rows, components) of the PCA-reduced training data.
X_train_decom.shape

# Yay!! We managed to decrease the number of features to 154. Let's train a model on the new dataset

In [None]:
# Same default SVC as the baseline, but fitted on the PCA-reduced training data.
model_2 = SVC()
model_2.fit(X_train_decom, y_train)

In [None]:
# Score the PCA-based model on the PCA-reduced held-out split;
# `model_2_score` is reused in the comparison table.
model_2_preds = model_2.predict(X_test_decom)
model_2_score = accuracy_score(y_test, model_2_preds)
model_2_score

## Wow, we managed to get an accuracy of 97.9% with only 154 features in one third of the training time

# Let's try one more model where we perform preprocessing with an unsupervised technique, i.e. KMeans

In [None]:
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Separate copies of the split frames for the KMeans experiment.
X_train_KMeans = X_train.copy()
X_test_KMeans = X_test.copy()

In [None]:
# Two-step pipeline: KMeans acts as a transformer (each sample is re-expressed as
# its distances to the 99 cluster centres), and an SVC is trained on that output.
# KMeans initialisation is random, so random_state is fixed to make the fitted
# clusters - and therefore the pipeline's score - reproducible between runs.
pipeline = Pipeline(steps=[
    ('KMeans', KMeans(n_clusters=99, random_state=42)),
    ('SVC_clf', SVC())
])
pipeline.fit(X_train_KMeans, y_train)

In [None]:
# Score the KMeans+SVC pipeline on the held-out split (X_test_KMeans was made as a copy of this same X_test);
# `pipeline_score` is reused in the comparison table.
pipeline_preds = pipeline.predict(X_test)
pipeline_score = accuracy_score(y_test,pipeline_preds)
pipeline_score

In [None]:
# Collect the three held-out accuracies into one table: one row per model,
# a single score column named 0, best model first.
model_scores = {
    'svc_model': [svc_score],
    'PCA_SVC': [model_2_score],
    'KMeans_SVC': [pipeline_score]
}
accuracy_df = (
    pd.DataFrame.from_dict(model_scores)
    .transpose()
    .sort_values(by=0, ascending=False)
)

In [None]:
# Bar chart of the accuracy table, one bar per model.
accuracy_df.plot(kind='bar',legend=True)

From the graph above we can conclude that our PCA model performs best of all 3 models, so let's go with the PCA predictions for submission.

In [None]:
# Project the competition test set into the PCA space that model_2 was trained on.
# The result is stored back under the name `test_df`, which the prediction cell
# reads. Because this cell overwrites its own input, guard on the type: once the
# DataFrame has been replaced by the transformed array, re-running the cell must
# not push already-reduced data through `pca.transform` again (its feature count
# no longer matches what `pca` was fitted on).
if isinstance(test_df, pd.DataFrame):
    test_df = pca.transform(test_df)

In [None]:
# Predict with the PCA-based SVC; at this point `test_df` holds the PCA-transformed test data.
final_preds = model_2.predict(test_df)

In [None]:
# Display the first ten predictions as a quick sanity check.
final_preds[:10]

In [None]:
# Write the predictions into the 'Label' column of the frame loaded from the sample submission.
submissions['Label'] = final_preds

In [None]:
# Save the submission to the current working directory, without the index column.
submissions.to_csv("submission.csv",index=False)