<a href="https://colab.research.google.com/github/Mihir-Amit/Digit_Recognizer/blob/main/Digit_Recognizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries; the
# download loop below splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): this is a pre-signed link carrying X-Goog-Date=20240212T161532Z and
# X-Goog-Expires=259200, so it has presumably expired — regenerate it before re-running.
DATA_SOURCE_MAPPING = 'digit-recognizer:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3004%2F861823%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240212%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240212T161532Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2aff9b35713a16d23ad61d875652f543d01abff6fd67b4fbc573b822b1186f439ac3a13ead0379a30529df6af5c33ac27c8f5c83dbc02a6ca7be5e0d16ef69e8b142c1c6e741d5d6fc32144d19d37a9a5e11ed2287345ff91a337ea98135f189940124302d2f6626b6352d8a3d22fa34aac5ced06a0d16bf3ebac182944b795b04edf7bd3472ec884a86c77af46094a3041913718ed1da388623712e5e43af0d64fffa43cedb577fd93ff34c0150f0590fc89f9d4afde2475d3043e2abed41a13dbc34a1479c88618644c72d01934a8aa0c2ad339fbc27f6a23fa0a75b2911b21df88750b22c9511fa945705a2901f8a5be0e2fbf5a94310c2d6755998499fbd'

# Directory under which each data source is extracted (one sub-directory per source).
KAGGLE_INPUT_PATH='/kaggle/input'
# Working directory; created below and symlinked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into a temporary file and extract it
# under KAGGLE_INPUT_PATH/<directory>. A failed download is reported and skipped
# so the remaining sources are still attempted.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Each entry is "<directory>:<URL-encoded URL>"; split on the first colon only
    # so an unexpected ':' in the URL part cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # Content-Length is optional; without it the progress bar cannot be
            # scaled, so it stays empty and only the byte counter advances.
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                # Clamp to the bar width in case more bytes arrive than announced.
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # The tar branch re-opens the temp file by name through a second
            # handle, so buffered writes must reach the file first.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here would
                # rebind the imported module to a TarFile instance.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:

import pandas as pd
from tensorflow.keras.utils import to_categorical

import os
# Print the full path of every file found anywhere below /kaggle/input.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

In [None]:
# Load the train and test CSVs of the "digit-recognizer" source from /kaggle/input.
train_csv_path = '/kaggle/input/digit-recognizer/train.csv'
test_csv_path = '/kaggle/input/digit-recognizer/test.csv'
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [None]:
# Preview the first rows of the training frame: a `label` column followed by pixel0..pixel783.
train_data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Preview the first rows of the test frame: pixel columns only, no `label`.
test_data.head()

Unnamed: 0,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# (rows, columns) of the training frame; the columns are `label` plus the 784 pixel columns.
train_data.shape

(42000, 785)

In [None]:
# (rows, columns) of the test frame; one column fewer than train because there is no `label`.
test_data.shape

(28000, 784)

In [None]:
# Pull the digit labels out as a NumPy array (one integer class per training row).
# No placeholder is needed first: `.values` yields an ndarray, not a DataFrame.
train_labels = train_data['label'].values

In [None]:
# Re-display the training frame; reading the `label` column above did not modify it.
train_data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Keep only the pixel columns. `label` is dropped by name rather than by position,
# so if the column is ever already missing this fails loudly instead of silently
# discarding `pixel0`.
train_data1 = train_data.drop(columns=['label'])

In [None]:
def _scale_pixels(frame):
    """Return `frame` as float32 with 0-255 pixel intensities scaled into [0, 1].

    A frame whose maximum is already <= 1 is treated as previously scaled and is
    only cast, so re-running this cell does not divide by 255 a second time.
    """
    as_float = frame.astype('float32')
    # .max().max() is NaN for an empty frame, which compares False below.
    if as_float.max().max() > 1.0:
        return as_float / 255
    return as_float


train_data1 = _scale_pixels(train_data1)
test_data = _scale_pixels(test_data)

In [None]:
# One-hot encode straight from the label column rather than from `train_labels`
# itself, so re-running this cell never encodes an already one-hot array.
# num_classes is pinned to the ten digits so the width does not depend on which
# labels happen to be present.
train_labels = to_categorical(train_data['label'].values, num_classes=10)

In [None]:
# Inspect the first one-hot row; the 1 sits at index 1, matching label 1 of the first training row.
train_labels[0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [None]:
# Shape of the one-hot label matrix: one row per training sample, one column per digit class.
train_labels.shape

(42000, 10)

In [None]:
# Shape of the pixel-only training frame: the `label` column is gone, 784 pixel columns remain.
train_data1.shape

(42000, 784)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPooling2D

def create_cnn_model(input_shape=(28, 28, 1), num_classes=10):
    """Build and compile a small convolutional classifier.

    Architecture: two Conv2D (3x3, ReLU) + MaxPooling2D (2x2) stages with 32 and
    64 filters, a Flatten layer, two Dense(384, ReLU) layers and a softmax output.

    Args:
        input_shape: (height, width, channels) of a single input image.
            Defaults to (28, 28, 1), the value that used to be hard-coded.
        num_classes: Number of softmax output units. Defaults to 10.

    Returns:
        A Sequential model compiled with the 'adam' optimizer,
        'categorical_crossentropy' loss (labels must therefore be one-hot
        encoded) and the 'accuracy' metric.
    """
    model_cnn = Sequential([
        Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D(pool_size=(2, 2)),
        Conv2D(64, kernel_size=(3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(384, activation='relu'),
        Dense(384, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    model_cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model_cnn

In [None]:
# Integer digit labels, one per training row.
train_labels = train_data['label'].values

# Put the pixel columns into a new frame instead of overwriting `train_data`.
# Overwriting made this cell fail with a KeyError on a second run and changed
# what every earlier cell that reads `train_data` would see.
train_pixels = train_data.drop(columns=['label'])

# Reshape each 784-value row into a 28x28 single-channel image.
train_data_reshaped = train_pixels.values.reshape(-1, 28, 28, 1)

# Normalize the pixel values
train_data_reshaped = train_data_reshaped.astype('float32') / 255.0

# One-hot encode the labels, as required by the categorical cross-entropy loss.
train_labels = to_categorical(train_labels)

# Check the shape of train_data_reshaped and train_labels
print("Shape of train_data_reshaped:", train_data_reshaped.shape)
print("Shape of train_labels:", train_labels.shape)

Shape of train_data_reshaped: (42000, 28, 28, 1)
Shape of train_labels: (42000, 10)


In [None]:
# Train only when there is at least one sample; otherwise report the problem.
if train_data_reshaped.shape[0] != 0:
    model_cnn = create_cnn_model()
    # Fit for 10 epochs, holding out 25% of the samples for validation.
    history = model_cnn.fit(
        train_data_reshaped,
        train_labels,
        epochs=10,
        batch_size=128,
        validation_split=0.25,
    )

    # Report the validation accuracy recorded for the final epoch.
    validation_accuracy = history.history['val_accuracy'][-1]
    print(f"Validation Accuracy: {validation_accuracy * 100}")
else:
    print("Training data is empty. Please check data loading and preprocessing.")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [None]:
# Turn the test frame into 28x28 single-channel images, the input shape the
# CNN was built for, and run the trained model on them.
test_data_arr = test_data.to_numpy()
test_data_reshaped = test_data_arr.reshape(-1, 28, 28, 1)
predictions = model_cnn.predict(test_data_reshaped)

In [None]:
import numpy as np
# The predicted digit is the index of the largest softmax score in each row.
predicted_classes = predictions.argmax(axis=1)

In [None]:
# Assemble the submission table: a 1-based ImageId next to the predicted Label.
results = pd.Series(predicted_classes, name="Label")
# Derive the ids from the number of predictions instead of a hard-coded 28000,
# so a different test-set size cannot produce NaN-padded rows.
image_ids = pd.Series(range(1, len(results) + 1), name="ImageId")
submission = pd.concat([image_ids, results], axis=1)
submission.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame that was just written to submission.csv.
submission