<a href="https://colab.research.google.com/github/Jason-Oleana/written-spoken-digits-cnn-classification/blob/master/ml_assignment_Final_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Assignment

## Imports

In [0]:
import numpy
import pandas as pd
from sklearn.model_selection import train_test_split  
from scipy.stats import kurtosis

## Loading Data

In [41]:
# Mount Google Drive before reading from it. In document order this cell runs
# before the dedicated mount cell, so without this a fresh "Run All" fails on
# the first numpy.load. The notebook's own mount cell makes the same call.
from google.colab import drive
drive.mount('/content/drive')

# Directory on the mounted Drive that holds the three training arrays.
DATA_DIR = "/content/drive/My Drive/Data Science/machine learning"

# NOTE(review): allow_pickle=True makes numpy.load unpickle arbitrary Python
# objects, so only load files from a trusted source. spoken_train is loaded as
# a 1-D array of per-example arrays (see the printed shape), which presumably
# is why pickling is enabled; confirm whether the other two files need it.
written_train = numpy.load(DATA_DIR + "/written_train(1).npy", allow_pickle=True)
spoken_train = numpy.load(DATA_DIR + "/spoken_train(1).npy", allow_pickle=True)
match_train = numpy.load(DATA_DIR + "/match_train(1).npy", allow_pickle=True)

print("written train shape:", written_train.shape)
print("spoken train shape:", spoken_train.shape)
print("match train shape:", match_train.shape)

written train shape: (45000, 784)
spoken train shape: (45000,)
match train shape: (45000,)


In [3]:
# Mount Google Drive at /content/drive so the .npy files stored there can be read.
# NOTE(review): the data-loading cell appears earlier in the notebook than this
# cell, so on a fresh top-to-bottom run the drive is not yet mounted when the
# files are loaded.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [42]:
# Longest spoken example, measured along its first axis; 0 if there are no
# examples. Every example is zero-padded to this length further down.
largest_shape = max((clip.shape[0] for clip in spoken_train), default=0)

print(largest_shape)

93


In [43]:
# Zero-pad every spoken example at the end of its first axis so that all
# examples share the length `largest_shape`; the second axis is left untouched.
# Stacking the equally sized examples then gives one regular 3-D array.
new_spoken_train = numpy.array([
    numpy.pad(clip, ((0, largest_shape - clip.shape[0]), (0, 0)), mode='constant')
    for clip in spoken_train
])
new_spoken_train.shape

(45000, 93, 13)

In [9]:
# Number of values per spoken example once its two trailing axes are flattened.
# Derived from the padded array instead of hard-coding 93*13, so it stays
# correct if the padded length or the per-step feature count changes.
print(new_spoken_train.shape[1] * new_spoken_train.shape[2])

1209


In [0]:
# Flatten each padded spoken example into a single feature vector (one row per
# example). The row count comes from the array itself and -1 lets numpy infer
# the flattened width, replacing the hard-coded (45000, 1209).
spoken_train = numpy.reshape(new_spoken_train, (len(new_spoken_train), -1))

In [0]:
from sklearn.preprocessing import StandardScaler

# Standardise every flattened spoken feature column. The fitted scaler is kept
# in `scaler` so the same transformation can be re-applied to other data.
# NOTE(review): the scaler is fitted on all rows before the train/validation
# split further down, so validation rows contribute to the statistics.
scaler = StandardScaler()
spoken_train = scaler.fit_transform(spoken_train)

# Normalize pixel values to be between 0 and 1
- divide written_train by 255

In [0]:
# Written features: divide the raw values by 255 (see the heading above).
X_written = written_train / 255
# Spoken features and labels are used as prepared by the earlier cells.
X_spoken, y = spoken_train, match_train

## Label distribution

In [48]:
# Count matching (True) and non-matching (False) rows and report each class as
# a percentage of all rows, rounded to two decimals.
total, positives = len(y), sum(y)
negatives = total - positives

positive_share = round((positives/total)*100,2)
negative_share = round((negatives/total)*100,2)

print("number of total rows: {}".format(total))
print("number of positives: {} ({}%)".format(positives, positive_share))
print("number of negatives: {} ({}%)".format(negatives, negative_share))

number of total rows: 45000
number of positives: 4539 (10.09%)
number of negatives: 40461 (89.91%)


In [0]:
# Make sure both feature sets are 2-D (rows x features), as the oversampler in
# the next cell is given them in that form. The row count is taken from each
# array and -1 infers the width, replacing the hard-coded 45000/784/1209.
X_written = numpy.reshape(X_written, (len(X_written), -1))
X_spoken = numpy.reshape(X_spoken, (len(X_spoken), -1))

In [50]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Balance the two classes by randomly repeating rows of the minority class.
#
# The oversampler is run ONCE, on row indices, and the chosen indices are then
# applied to both feature sets. Resampling X_written and X_spoken in two
# separate calls keeps the written/spoken rows paired only as long as both
# calls happen to draw identical samples; indexing both arrays with one index
# vector guarantees the pairing.
#
# NOTE(review): this runs before the train/validation split below, so repeated
# rows can end up on both sides of that split.
# NOTE(review): the cell overwrites X_written / X_spoken but resamples against
# the original `y`, so it cannot be re-run without re-running the cells above.
oversampler = RandomOverSampler(random_state=42)
row_ids = numpy.arange(len(y)).reshape(-1, 1)
row_ids, y2 = oversampler.fit_resample(row_ids, y)
row_ids = row_ids.ravel()

X_written = X_written[row_ids]
X_spoken = X_spoken[row_ids]
y1 = y2  # kept because this cell previously defined y1; same labels as y2
print('Resampled dataset shape %s' % Counter(y2))



Resampled dataset shape Counter({False: 40461, True: 40461})


In [51]:

from numpy import argmax, array
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Demonstration of encoding the resampled labels: labels -> integer codes ->
# one-hot rows, followed by decoding the first row back to its label.
values = array(y2)
print(values)

# Labels to integer class codes.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)

# Integer codes to one-hot rows; the encoder is given a single-column 2-D array.
integer_encoded = integer_encoded.reshape(-1, 1)
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

# Round trip for the first example: position of the 1 -> original label.
first_code = argmax(onehot_encoded[0])
inverted = label_encoder.inverse_transform([first_code])
print(inverted)

[False False False ...  True  True  True]
[0 0 0 ... 1 1 1]
[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]
[False]


In [52]:
# Shapes after oversampling, one per line: written features, spoken features, labels.
print(X_written.shape, X_spoken.shape, y2.shape, sep="\n")

(80922, 784)
(80922, 1209)
(80922,)


# Step 1: split the data into training and validation sets:
- written train split: 80% train, 20% validation
<br>
- spoken train split: 80% train, 20% validation

The original data is imbalanced (about 10% positives); after the oversampling above both classes are equally represented. We still pass `stratify` so that the label distribution is the same in our train and validation datasets.

In [0]:
# Hold out 20% of the rows. Splitting all three arrays in one call applies the
# same row selection to the written features, spoken features and labels, and
# `stratify` keeps the label proportions equal on both sides.
# random_state is fixed so the split (and everything trained on it) is
# reproducible across kernel restarts.
# NOTE(review): the rows were oversampled before this split, so repeated
# positive rows can appear in both the training and the held-out part.
X_written_train, X_written_test, X_spoken_train, X_spoken_test, y_train, y_valid = train_test_split(
    X_written, X_spoken, y2,
    test_size=0.20,
    stratify=y2,
    random_state=42,
)

In [54]:
# Sanity check: shape of the held-out written features produced by the split above.
X_written_test.shape

(16185, 784)

In [0]:
# Restore the 2-D layout of every example so it matches the model inputs:
# written examples become 28x28, spoken examples become 93x13.
X_written_train, X_written_test = (
    numpy.reshape(part, (part.shape[0], 28, 28))
    for part in (X_written_train, X_written_test)
)
X_spoken_train, X_spoken_test = (
    numpy.reshape(part, (part.shape[0], 93, 13))
    for part in (X_spoken_train, X_spoken_test)
)

In [57]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
import keras
from keras.optimizers import Adam

# Two-input network that predicts whether a written example and a spoken
# example match. Each input goes through its own Conv1D -> MaxPooling1D ->
# Conv1D -> Flatten branch; the two flattened branches are concatenated and
# passed through two dense layers to a single sigmoid output.

# Written branch: input of shape (28, 28).
input1 = keras.layers.Input(shape=(28,28))
x1 = keras.layers.Conv1D(32, 2, activation='relu')(input1)
x1 = keras.layers.MaxPooling1D(2)(x1)
x2 = keras.layers.Conv1D(32, 2, activation='relu')(x1)
x3 = keras.layers.Flatten()(x2)

# Spoken branch: input of shape (93, 13).
# The intermediate tensors are deliberately NOT called y1/y2/y3: y1 and y2 hold
# the resampled labels from the oversampling cell, and overwriting them here
# breaks any label-using cell that is (re-)run after this one.
input2 = keras.layers.Input(shape=(93,13))
spoken_1 = keras.layers.Conv1D(32, 2, activation='relu')(input2)
spoken_1 = keras.layers.MaxPooling1D(2)(spoken_1)
spoken_2 = keras.layers.Conv1D(32, 2, activation='relu')(spoken_1)
spoken_3 = keras.layers.Flatten()(spoken_2)

# Join the two flattened branches into one feature vector.
concatenate = keras.layers.Concatenate()([x3, spoken_3])
Dense_1 = Dense(100, activation='relu')(concatenate)
Dense_2 = Dense(100, activation='relu')(Dense_1)
out = keras.layers.Dense(1, activation = "sigmoid")(Dense_2)

model = keras.models.Model(inputs=[input1, input2], outputs=out)
# NOTE(review): newer Keras releases name this optimizer argument
# `learning_rate`; `lr` is kept to match the Keras version this notebook ran on.
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=0.001), metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would add a stray "None" line to the output).
model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 28, 28)       0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, 93, 13)       0                                            
__________________________________________________________________________________________________
conv1d_25 (Conv1D)              (None, 27, 32)       1824        input_13[0][0]                   
__________________________________________________________________________________________________
conv1d_27 (Conv1D)              (None, 92, 32)       864         input_14[0][0]                   
____________________________________________________________________________________________

In [0]:
# Train on the paired written/spoken inputs. Keras holds back 20% of the
# training rows as validation data (validation_split); per-epoch metrics are
# collected in `history` and console logging is switched off (verbose=0).
history = model.fit(
    x=[X_written_train, X_spoken_train],
    y=y_train,
    batch_size=64,
    epochs=150,
    validation_split=0.20,
    verbose=0,
)