<a href="https://colab.research.google.com/github/Jason-Oleana/written-spoken-digits-cnn-classification/blob/master/ml_assignment_Final_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Assignment

## Imports

In [0]:
import numpy
import pandas as pd
from sklearn.model_selection import train_test_split  
from scipy.stats import kurtosis

## Loading Data

In [41]:
written_train = numpy.load("/content/drive/My Drive/Data Science/machine learning/written_train(1).npy", allow_pickle=True)
spoken_train = numpy.load("/content/drive/My Drive/Data Science/machine learning/spoken_train(1).npy", allow_pickle=True)
match_train = numpy.load("/content/drive/My Drive/Data Science/machine learning/match_train(1).npy", allow_pickle=True)

print("written train shape:", written_train.shape)
print("spoken train shape:", spoken_train.shape)
print("match train shape:", match_train.shape)

written train shape: (45000, 784)
spoken train shape: (45000,)
match train shape: (45000,)


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [42]:
largest_shape = 0
for i in spoken_train:
    shape = i.shape[0]
    if shape > largest_shape:
        largest_shape = shape

print(largest_shape)

93


In [43]:
new_spoken_train = []
for example in spoken_train:
    difference = largest_shape-example.shape[0]
    zero_pad = numpy.pad(example,((0,difference),(0,0)), mode='constant')
    new_spoken_train.append(zero_pad)

new_spoken_train = numpy.array(new_spoken_train)
new_spoken_train.shape

(45000, 93, 13)

In [9]:
print(93*13)

1209


In [0]:
spoken_train = new_spoken_train
spoken_train = numpy.reshape(spoken_train,(45000,1209))

In [0]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(spoken_train)
spoken_train = scaler.transform(spoken_train)

# Normalize pixel values to be between 0 and 1
- divide written_train by 255

In [0]:
X_written = written_train/255
X_spoken = spoken_train
y = match_train

## Label distribution

In [48]:
total = len(y)
positives = sum(y)
negatives = total - positives

print("number of total rows: {}".format(total))
print("number of positives: {} ({}%)".format(positives, round((positives/total)*100,2)))
print("number of negatives: {} ({}%)".format(negatives, round((negatives/total)*100,2)))

number of total rows: 45000
number of positives: 4539 (10.09%)
number of negatives: 40461 (89.91%)


In [0]:
X_written = numpy.reshape(X_written,(45000,784))
X_spoken = numpy.reshape(X_spoken,(45000,1209))

In [50]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_written, y1 = oversampler.fit_resample(X_written, y)
X_spoken, y2 = oversampler.fit_resample(X_spoken, y)
print('Resampled dataset shape %s' % Counter(y2))



Resampled dataset shape Counter({False: 40461, True: 40461})


In [51]:

from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# define example
values = array(y2)
print(values)
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)
# invert first example
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
print(inverted)

[False False False ...  True  True  True]
[0 0 0 ... 1 1 1]
[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [0. 1.]
 [0. 1.]]
[False]


In [52]:
print(X_written.shape)
print(X_spoken.shape)
print(y2.shape)

(80922, 784)
(80922, 1209)
(80922,)


# step 1: split data in training and validation:
- written train split: 80% train, 20% validation
<br>
- spoken train split: 80% train, 20% validation

Since the data is imbalanced, we use stratify to make sure the distribution of labels is the same in our train and validation datasets

In [0]:
X_written_train, X_written_test, X_spoken_train, X_spoken_test, y_train, y_valid = train_test_split(X_written, X_spoken, y2, test_size=0.20, stratify = y2)

In [54]:
X_written_test.shape

(16185, 784)

In [0]:
X_written_test = numpy.reshape(X_written_test,(X_written_test.shape[0],28,28))
X_written_train = numpy.reshape(X_written_train,(X_written_train.shape[0],28,28))
X_spoken_test = numpy.reshape(X_spoken_test,(X_spoken_test.shape[0],93,13))
X_spoken_train = numpy.reshape(X_spoken_train,(X_spoken_train.shape[0],93,13))

In [57]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout
#create model
import keras
from keras.optimizers import Adam

input1 = keras.layers.Input(shape=(28,28))
x1 = keras.layers.Conv1D(32, 2, activation='relu')(input1)
x1 = keras.layers.MaxPooling1D(2)(x1)
x2 = keras.layers.Conv1D(32, 2, activation='relu')(x1)
#x2 = keras.layers.MaxPooling1D(2)(x2)
x3 = keras.layers.Flatten()(x2)
input2 = keras.layers.Input(shape=(93,13))
y1 = keras.layers.Conv1D(32, 2, activation='relu')(input2)
y1 = keras.layers.MaxPooling1D(2)(y1)
y2 = keras.layers.Conv1D(32, 2, activation='relu')(y1)
#y2 = keras.layers.MaxPooling1D(2)(y2)
y3 = keras.layers.Flatten()(y2)
# Equivalent to subtracted = keras.layers.subtract([x1, x2])
concatenate = keras.layers.Concatenate()([x3, y3])
Dense_1 = Dense(100, activation='relu')(concatenate)
Dense_2 = Dense(100, activation='relu')(Dense_1)
out = keras.layers.Dense(1, activation = "sigmoid")(Dense_2)
model = keras.models.Model(inputs=[input1, input2], outputs=out)
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=0.001), metrics=['accuracy'])
print(model.summary())

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 28, 28)       0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           (None, 93, 13)       0                                            
__________________________________________________________________________________________________
conv1d_25 (Conv1D)              (None, 27, 32)       1824        input_13[0][0]                   
__________________________________________________________________________________________________
conv1d_27 (Conv1D)              (None, 92, 32)       864         input_14[0][0]                   
____________________________________________________________________________________________

In [0]:
history = model.fit([X_written_train, X_spoken_train], y_train,
                    epochs=150, validation_split=0.20, batch_size=64, verbose=0)

Check if the labels are indeed distributed equally
#### Train

In [0]:
total = len(y_train)
positives = sum(y_train)
negatives = total - positives

print("number of total rows: {}".format(total))
print("number of positives: {} ({}%)".format(positives, round((positives/total)*100,2)))
print("number of negatives: {} ({}%)".format(negatives, round((negatives/total)*100,2)))

number of total rows: 36000
number of positives: 3631 (10.09%)
number of negatives: 32369 (89.91%)


#### Validation

In [0]:
total = len(y_valid)
positives = sum(y_valid)
negatives = total - positives

print("number of total rows: {}".format(total))
print("number of positives: {} ({}%)".format(positives, round((positives/total)*100,2)))
print("number of negatives: {} ({}%)".format(negatives, round((negatives/total)*100,2)))

number of total rows: 9000
number of positives: 908 (10.09%)
number of negatives: 8092 (89.91%)


# Preprocessing / Feature Engineering

### Step 3: feature engineering for written data:
- written training data
<br>
- written validation data
<br>
Extract mean, min, max, std, count for written

In [0]:
def extract_summary_stats_written(X_written):
#.........................................
    #look at summaries list, mean, min, max, std are functions in function_list
    
    written_mean = []
    written_min = []
    written_max = []
    written_std = [] 
    written_kurtosis = []
    
    for element in X_written:
        written_mean.append(numpy.mean(element))
        written_min.append(numpy.min(element))
        written_max.append(numpy.max(element))
        written_std.append(numpy.std(element))
        written_kurtosis.append(kurtosis(element))
        
        
    return written_mean, written_min, written_max, written_std, written_kurtosis
#-----------------------------------------------------------------------------------------

In [0]:
X_train_written_preproc = extract_summary_stats_written(X_written_train)
X_valid_written_preproc = extract_summary_stats_written(X_written_valid)

### Step 3: feature engineering for spoken data:

- spoken training data 
<br>
- spoken validation data 
<br>
Extract mean, min, max, std for spoken

In [0]:
def extract_summary_stats_spoken(X_spoken):
#.........................................
    #look at summaries list, mean, min, max, std are functions in function_list
    spoken_mean = []
    spoken_min = []
    spoken_max = []
    spoken_std = []
    spoken_kurtosis = []
    for element in X_spoken:
        spoken_mean.append(numpy.mean(element))
        spoken_min.append(numpy.min(element))
        spoken_max.append(numpy.max(element))
        spoken_std.append(numpy.std(element))
        spoken_kurtosis.append(numpy.mean(kurtosis(element)))
    
    print(spoken_kurtosis[0])
    return spoken_mean, spoken_min, spoken_max, spoken_std, spoken_kurtosis

In [0]:
X_train_spoken_preproc = extract_summary_stats_spoken(X_spoken_train)
X_valid_spoken_preproc = extract_summary_stats_spoken(X_spoken_valid)

4.109109030524278
4.342230936714155


# Extract all summaries from written & spoken feature engineering data function

In [0]:
written_mean, written_min, written_max, written_std, written_kurtosis = X_train_written_preproc
written_val_mean, written_val_min, written_val_max, written_val_std, written_val_kurtosis = X_valid_written_preproc

spoken_mean, spoken_min, spoken_max, spoken_std, spoken_kurtosis = X_train_spoken_preproc
spoken_val_mean, spoken_val_min, spoken_val_max, spoken_val_std, spoken_val_kurtosis = X_valid_spoken_preproc

# Step 4: use summaries to Build a dataframe for:
- training set
<br>
- validation set

#### Training

In [0]:
X_train_final_cols = {'written_mean':pd.Series(written_mean),
                      'written_min':pd.Series(written_min),
                      'written_max':pd.Series(written_max),
                      'written_std':pd.Series(written_std),
                      'written_kurtosis':pd.Series(written_kurtosis),
                      'spoken_mean':pd.Series(spoken_mean),
                      'spoken_min':pd.Series(spoken_min),
                      'spoken_max':pd.Series(spoken_max), 
                      'spoken_std':pd.Series(spoken_std),
                      'spoken_kurtosis':pd.Series(spoken_kurtosis)}

X_train_final = pd.DataFrame(X_train_final_cols)
X_train_final.head(10)

Unnamed: 0,spoken_kurtosis,spoken_max,spoken_mean,spoken_min,spoken_std,written_kurtosis,written_max,written_mean,written_min,written_std
0,4.109109,3.746517,0.037989,-2.907489,0.640406,6.258335,1.0,0.908298,0.0,0.252658
1,3.147835,3.41047,0.142919,-2.815631,0.707371,1.328988,1.0,0.830687,0.0,0.334427
2,3.937611,3.16749,0.038646,-3.934161,0.699209,1.112804,1.0,0.824165,0.0,0.347787
3,3.697519,2.580729,-0.058139,-3.520332,0.759402,1.219977,1.0,0.830112,0.0,0.344895
4,2.118711,1.994407,-0.002166,-3.159704,0.6787,14.635369,1.0,0.945678,0.0,0.195685
5,2.371506,3.44273,-0.101005,-2.704327,0.718457,1.212908,1.0,0.829802,0.0,0.339582
6,3.966335,3.040929,0.095253,-2.696737,0.641992,3.244121,1.0,0.871734,0.0,0.29773
7,4.278932,1.964991,0.098736,-3.152634,0.533742,0.622869,1.0,0.807908,0.0,0.368778
8,3.266938,2.18106,0.122408,-2.323979,0.483376,2.016342,1.0,0.85027,0.0,0.320058
9,2.270484,3.599731,-0.048192,-3.328103,0.829805,4.899556,1.0,0.895038,0.0,0.270636


### Validation

In [0]:
X_val_final_cols = {'written_mean':pd.Series(written_val_mean),
                    'written_min':pd.Series(written_val_min),
                    'written_max':pd.Series(written_val_max),
                    'written_std':pd.Series(written_val_std),
                    'written_kurtosis':pd.Series(written_val_kurtosis),
                    'spoken_mean':pd.Series(spoken_val_mean),
                    'spoken_min':pd.Series(spoken_val_min),
                    'spoken_max':pd.Series(spoken_val_max), 
                    'spoken_std':pd.Series(spoken_val_std),
                    'spoken_kurtosis':pd.Series(spoken_val_kurtosis)}

X_val_final = pd.DataFrame(X_val_final_cols)
X_val_final.head(10)

Unnamed: 0,spoken_kurtosis,spoken_max,spoken_mean,spoken_min,spoken_std,written_kurtosis,written_max,written_mean,written_min,written_std
0,4.342231,1.83392,0.007023,-4.246489,0.685025,4.498108,1.0,0.892832,0.0,0.265955
1,2.926089,2.936393,-0.023044,-2.557086,0.635517,7.754924,1.0,0.917987,0.0,0.244437
2,3.105262,2.700852,0.023705,-2.503429,0.638709,3.330619,1.0,0.873865,0.0,0.294804
3,3.368004,2.241257,0.035596,-2.972858,0.611129,2.081774,1.0,0.851045,0.0,0.321656
4,2.564494,2.480719,0.003324,-2.887304,0.733821,6.162422,1.0,0.905072,0.0,0.256116
5,6.815722,6.294796,-0.260403,-7.697091,1.168023,1.861682,1.0,0.845928,0.0,0.329028
6,5.997747,2.323132,0.091725,-3.508085,0.635473,3.375232,1.0,0.875005,0.0,0.298929
7,3.961801,3.352081,0.08623,-2.514017,0.627304,10.907311,1.0,0.932923,0.0,0.229954
8,5.340738,2.643023,0.147902,-2.533336,0.502528,7.467469,1.0,0.919153,0.0,0.222012
9,3.798865,3.198134,0.118474,-2.424817,0.612269,6.462865,1.0,0.910324,0.0,0.257152


# Oversampling
we want to upsample our true examples to match our false examples.
<br>
therefore we used "imblearn.over_sampling" to get a new dataset consisting out of:
<br>
50% true and 50% false 

In [0]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

print('Original dataset shape %s' % Counter(y_train))

Original dataset shape Counter({False: 32369, True: 3631})


In [0]:
oversampler = RandomOverSampler(random_state=42)
X_train_final_resampled, y_train_resampled = oversampler.fit_resample(X_train_final, y_train)
print('Resampled dataset shape %s' % Counter(y_train_resampled))

Resampled dataset shape Counter({False: 32369, True: 32369})


#### pass valid input, resampled input and label back to:

- x_valid = X_val_final
- x_train_final = input
<br>
- y_train = label

In [0]:
X_train_final = X_train_final_resampled
X_valid = X_val_final
y_train = y_train_resampled

# Machine Learning

In [0]:
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import Normalizer, StandardScaler  
from sklearn.model_selection import GridSearchCV

In [0]:
random_seed = 1234

### Baseline

Always predict the most frequent class (`False`)

In [0]:
classifier = DummyClassifier(random_state=random_seed, strategy="most_frequent")
classifier.fit(X_train_final, y_train) 

y_pred = classifier.predict(X_val_final)
conf_matrix = confusion_matrix(y_valid, y_pred)
clf_report = classification_report(y_valid, y_pred)
accuracy = accuracy_score(y_valid, y_pred)

print(conf_matrix)
print(clf_report) 
print("accuracy: ", accuracy)

[[8092    0]
 [ 908    0]]
              precision    recall  f1-score   support

       False       0.90      1.00      0.95      8092
        True       0.00      0.00      0.00       908

    accuracy                           0.90      9000
   macro avg       0.45      0.50      0.47      9000
weighted avg       0.81      0.90      0.85      9000

accuracy:  0.8991111111111111


  _warn_prf(average, modifier, msg_start, len(result))


# Feature Scaling

In [0]:
scaler = Normalizer()  
scaler.fit(X_train_final)
X_train_final = scaler.transform(X_train_final)  
X_valid = scaler.transform(X_valid) 

# Train, Predict and Evaluate

### KNN

In [0]:
classifier = KNeighborsClassifier(n_neighbors=15)  
classifier.fit(X_train_final, y_train) 

y_pred = classifier.predict(X_val_final)
conf_matrix = confusion_matrix(y_valid, y_pred)
clf_report = classification_report(y_valid, y_pred)
accuracy = accuracy_score(y_valid, y_pred)

print(conf_matrix)
print(clf_report) 
print("accuracy: ", accuracy)

[[4394 3698]
 [ 489  419]]
              precision    recall  f1-score   support

       False       0.90      0.54      0.68      8092
        True       0.10      0.46      0.17       908

    accuracy                           0.53      9000
   macro avg       0.50      0.50      0.42      9000
weighted avg       0.82      0.53      0.63      9000

accuracy:  0.5347777777777778


### SVC 

In [0]:
classifier = SVC(gamma = 'auto', random_state=random_seed) 
classifier.fit(X_train_final, y_train) 

y_pred = classifier.predict(X_val_final)
conf_matrix = confusion_matrix(y_valid, y_pred)
clf_report = classification_report(y_valid, y_pred)
accuracy = accuracy_score(y_valid, y_pred)

print(conf_matrix)
print(clf_report) 
print("accuracy: ", accuracy)

KeyboardInterrupt: 

### Random Forest

In [0]:
classifier = RandomForestClassifier(n_estimators = 200, random_state=random_seed, n_jobs=-2)
classifier.fit(X_train_final, y_train) 

y_pred = classifier.predict(X_val_final)
conf_matrix = confusion_matrix(y_valid, y_pred)
clf_report = classification_report(y_valid, y_pred)
accuracy = accuracy_score(y_valid, y_pred)

print(conf_matrix)
print(clf_report) 
print("accuracy: ", accuracy)

[[8092    0]
 [ 908    0]]
              precision    recall  f1-score   support

       False       0.90      1.00      0.95      8092
        True       0.00      0.00      0.00       908

   micro avg       0.90      0.90      0.90      9000
   macro avg       0.45      0.50      0.47      9000
weighted avg       0.81      0.90      0.85      9000

accuracy:  0.8991111111111111


  'precision', 'predicted', average, warn_for)


### Logistic Regression

In [0]:
classifier = LogisticRegression(random_state=random_seed)
classifier.fit(X_train_final, y_train) 

y_pred = classifier.predict(X_val_final)
conf_matrix = confusion_matrix(y_valid, y_pred)
clf_report = classification_report(y_valid, y_pred)
accuracy = accuracy_score(y_valid, y_pred)

print(conf_matrix)
print(clf_report) 
print("accuracy: ", accuracy)

[[8092    0]
 [ 908    0]]
              precision    recall  f1-score   support

       False       0.90      1.00      0.95      8092
        True       0.00      0.00      0.00       908

   micro avg       0.90      0.90      0.90      9000
   macro avg       0.45      0.50      0.47      9000
weighted avg       0.81      0.90      0.85      9000

accuracy:  0.8991111111111111


  'precision', 'predicted', average, warn_for)


### MultiLayerPerceptron

In [0]:
classifier = MLPClassifier(random_state=random_seed)
classifier.fit(X_train_final, y_train) 

y_pred = classifier.predict(X_val_final)
conf_matrix = confusion_matrix(y_valid, y_pred)
clf_report = classification_report(y_valid, y_pred)
accuracy = accuracy_score(y_valid, y_pred)

print(conf_matrix)
print(clf_report) 
print("accuracy: ", accuracy)

[[8092    0]
 [ 908    0]]
              precision    recall  f1-score   support

       False       0.90      1.00      0.95      8092
        True       0.00      0.00      0.00       908

   micro avg       0.90      0.90      0.90      9000
   macro avg       0.45      0.50      0.47      9000
weighted avg       0.81      0.90      0.85      9000

accuracy:  0.8991111111111111


  'precision', 'predicted', average, warn_for)


# Gridsearch

all classifiers perform poorly.. RF performs slightly better... now try gridsearch over the hyperparameters

The following cell takes quite a long time (~1hr) to run, you can also skip this cell and read the results in the next cell.

In [0]:
classifier = RandomForestClassifier(n_estimators=200, random_state=random_seed, n_jobs=-2)

param_grid = {"max_depth": [5, 10, 15, 20, 25],
              "min_samples_split": [2, 5, 10, 25, 35, 50, 100, 250],
              "min_samples_leaf": [1, 5, 10, 35, 100]}

gridsearch = GridSearchCV(estimator=classifier,
                          param_grid=param_grid,
                          scoring=["accuracy", "roc_auc"],
                          cv=5, 
                          n_jobs=-2, 
                          verbose=5,
                          return_train_score=False,
                          refit=False)

gridsearch.fit(X_train_final, y_train) 

cv_results = pd.DataFrame(gridsearch.cv_results_).sort_values(by=["mean_test_accuracy","mean_test_roc_auc"], ascending=False)
cv_results.to_csv("gridsearch_results.csv")
cv_results[["params", "mean_test_roc_auc", "mean_test_accuracy"]]

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-2)]: Done  58 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-2)]: Done 148 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-2)]: Done 274 tasks      | elapsed: 15.8min


KeyboardInterrupt: 

In [0]:
cv_results = pd.read_csv("gridsearch_results.csv")

In [0]:
cv_results[["params", "mean_test_roc_auc", "mean_test_accuracy"]]

## Refit on Entire Dataset

- preprocess the test data with `extract_summary_stats` functions and repeat steps 2,3,4
- do the same for the entire training dataset, so that there is no train/val split anymore
- fit model with best parameters on the entire dataset (train + val) 
- predict on test set
- submit results

#### Load test data

In [0]:
X_test_written = numpy.load("written_test.npy", allow_pickle=True)
X_test_spoken = numpy.load("spoken_test.npy", allow_pickle=True)

#### Load train data

In [0]:
written_train_full = numpy.load("written_train.npy", allow_pickle=True)
spoken_train_full = numpy.load("spoken_train.npy", allow_pickle=True)
y_train_full = numpy.load("match_train.npy", allow_pickle=True)

#### step 2: Preprocess test data


In [0]:
X_test_written_full = extract_summary_stats_written(X_test_written)
X_test_spoken_full = extract_summary_stats_spoken(X_test_spoken)

0.24759229427010582


#### step 2: Preprocess train data

In [0]:
X_train_written_full = extract_summary_stats_written(written_train_full)
X_train_spoken_full = extract_summary_stats_spoken(spoken_train_full)

-0.0050469198705037546


#### step 3: extract all summaries for test data

In [0]:
written_test_mean, written_test_min, written_test_max, written_test_std, written_test_kurtosis = X_test_written_full
spoken_test_mean, spoken_test_min, spoken_test_max, spoken_test_std, spoken_test_kurtosis = X_test_spoken_full

#### step 3: extract all summaries for train data

In [0]:
written_mean_full, written_min_full, written_max_full, written_std_full, written_kurtosis_full = X_train_written_full
spoken_mean_full, spoken_min_full, spoken_max_full, spoken_std_full, spoken_kurtosis_full = X_train_spoken_full

#### step 4: create panda frame for test

In [0]:
X_test_final_cols = {'written_mean':pd.Series(written_test_mean),
                      'written_min':pd.Series(written_test_min),
                      'written_max':pd.Series(written_test_max),
                      'written_std':pd.Series(written_test_std),
                      'written_kurtosis':pd.Series(written_test_kurtosis),
                      'spoken_mean':pd.Series(spoken_test_mean),
                      'spoken_min':pd.Series(spoken_test_min),
                      'spoken_max':pd.Series(spoken_test_max), 
                      'spoken_std':pd.Series(spoken_test_std),
                      'spoken_kurtosis':pd.Series(spoken_test_kurtosis)}


X_test_final = pd.DataFrame(X_test_final_cols)

#### step 4: create panda frame for train

In [0]:
X_train_final_full = {'written_mean':pd.Series(written_mean_full),
                      'written_min':pd.Series(written_min_full),
                      'written_max':pd.Series(written_max_full),
                      'written_std':pd.Series(written_std_full),
                      'written_kurtosis':pd.Series(written_kurtosis_full),
                      'spoken_mean':pd.Series(spoken_mean_full),
                      'spoken_min':pd.Series(spoken_min_full),
                      'spoken_max':pd.Series(spoken_max_full), 
                      'spoken_std':pd.Series(spoken_std_full),
                      'spoken_kurtosis':pd.Series(spoken_kurtosis_full)}


X_train_full = pd.DataFrame(X_train_final_full)

In [0]:
from imblearn.over_sampling import RandomOverSampler
oversampler = RandomOverSampler(random_state=42)
X_train_final_resampled, y_train_resampled = oversampler.fit_resample(X_train_full, y_train_full)
print('Resampled dataset shape %s' % Counter(y_train_resampled))

Resampled dataset shape Counter({False: 40461, True: 40461})


#### pass full training data, training label & test data to below names:

In [0]:
X_test_full = X_test_final
X_train_full = X_train_final_resampled
y_train_full = y_train_resampled

#### Fit with best parameters on all training data

In [0]:
best_params = cv_results.best_params
classifier = RandomForestClassifier(**best_params, n_estimators=200, random_state=random_seed, n_jobs=-2)

classifier.fit(X_train_full, y_train_full)

y_pred = classifier.predict(X_test_full)
numpy.save("result", y_pred)