## STAT 441 Data Challenge 

Statistical Learning Course In-class Kaggle Competition – Fashion Image Classification (Computer Vision)     
• Built deep learning models in Python to classify fashion clothing images from the Fashion-MNIST dataset   
• Preprocessed the image data and increased its diversity with data augmentation techniques   
• Fit a convolutional neural network (CNN) with Keras and TensorFlow to predict image category labels   
• Applied dropout to reduce overfitting and achieved 94.5% image classification accuracy    

### Import Libraries

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectFromModel
import lightgbm as lgb
import seaborn as sns
import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torchvision import datasets, transforms
from tqdm import tqdm
%matplotlib inline

In [2]:
# Modelling Helpers :
from sklearn.preprocessing import Imputer , Normalizer , scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV , KFold , cross_val_score

#preprocessing :
from sklearn.preprocessing import MinMaxScaler , StandardScaler, Imputer, LabelEncoder

#evaluation metrics :

# Regression
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error 
# Classification
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  


# Deep Learning Libraries
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from keras.optimizers import Adam,SGD,Adagrad,Adadelta,RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau, LearningRateScheduler
from keras.utils import to_categorical

### Load dataset

In [3]:
# Show the current working directory; the data folder used below is resolved relative to it.
os.getcwd()

'C:\\Users\\qingy\\Desktop\\STAT 441\\Data Challenge 2'

In [4]:
# Data directory: the 'Data' subfolder of the current working directory.
path = os.path.join(os.getcwd(), 'Data')
path

'C:\\Users\\qingy\\Desktop\\STAT 441\\Data Challenge 2\\Data'

In [5]:
# Load the training CSV from the data directory and report (rows, columns).
train_csv = os.path.join(path, 'image_train_Kaggle.csv')
train = pd.read_csv(train_csv)
train.shape

(60000, 785)

In [6]:
# Load the test CSV from the data directory and report (rows, columns).
test_csv = os.path.join(path, 'image_test_Kaggle.csv')
test = pd.read_csv(test_csv)
test.shape

(10000, 785)

In [7]:
# Preview the first rows to confirm the layout: a label column followed by pixel columns.
train.head()

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,0,0,0,0,0,0,0,5,0,...,0,0,0,30,43,0,0,0,0,0
3,0,0,0,0,1,2,0,0,0,0,...,3,0,0,0,0,1,0,0,0,0
4,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Inspect the dtype pandas inferred for each column.
train.dtypes

label       int64
pixel1      int64
pixel2      int64
pixel3      int64
pixel4      int64
            ...  
pixel780    int64
pixel781    int64
pixel782    int64
pixel783    int64
pixel784    int64
Length: 785, dtype: object

In [9]:
# Summary statistics per column, cast to int (decimals truncated) to keep the wide table compact.
train.describe().astype(int)

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
count,60000,60000,60000,60000,60000,60000,60000,60000,60000,60000,...,60000,60000,60000,60000,60000,60000,60000,60000,60000,60000
mean,4,0,0,0,0,0,0,0,2,5,...,34,23,16,17,22,17,8,2,0,0
std,2,0,0,1,2,4,5,8,14,23,...,57,48,41,43,51,45,29,17,9,2
min,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25%,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50%,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
75%,7,0,0,0,0,0,0,0,0,0,...,58,9,0,0,0,0,0,0,0,0
max,9,16,36,226,164,227,230,224,255,254,...,255,255,255,255,255,255,255,255,255,170


Missing summary

In [10]:
# Per-column overview: dtype, whether any value is missing, and how many are missing.
null_mask = train.isnull()
MissingSummary = pd.concat(
    [train.dtypes, null_mask.any(axis=0), null_mask.sum(axis=0)],
    axis=1,  # place the three Series side by side as columns
)
MissingSummary.columns = ['FeatureType', 'IsMissing', 'MissingCnt']
MissingSummary

Unnamed: 0,FeatureType,IsMissing,MissingCnt
label,int64,False,0
pixel1,int64,False,0
pixel2,int64,False,0
pixel3,int64,False,0
pixel4,int64,False,0
...,...,...,...
pixel780,int64,False,0
pixel781,int64,False,0
pixel782,int64,False,0
pixel783,int64,False,0


In [11]:
# List the columns that contain at least one null value; an empty Index means there are none.
train.columns[train.isnull().any()]

Index([], dtype='object')

In [12]:
# Count rows per class to check that the classes are balanced; dropna=False also counts NaN labels.
train.label.value_counts(dropna=False)

9    6000
8    6000
7    6000
6    6000
5    6000
4    6000
3    6000
2    6000
1    6000
0    6000
Name: label, dtype: int64

## Prepare Data

In [13]:
# Keep working copies so the frames loaded from disk stay untouched.
train_data, test_data = train.copy(), test.copy()

In [14]:
# reshape function
from keras.utils import to_categorical
def prepare_data_array(train):
    """Convert a labelled pixel table into CNN-ready arrays.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a "label" column. Every other column is treated as a
        pixel intensity, and each row must hold 28 * 28 = 784 of them.
        The frame itself is not modified.

    Returns
    -------
    X_train : numpy.ndarray
        float32 array of shape (n_rows, 28, 28, 1) with values divided by 255.
    y_train : pandas.Series
        Copy of the "label" column.
    """
    y_train = train["label"].copy()
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    pixels = train.drop("label", axis=1).to_numpy()
    # astype() returns a new array, so the in-place scaling below never
    # touches the caller's DataFrame.
    X_train = pixels.reshape(pixels.shape[0], 28, 28, 1).astype('float32')
    X_train /= 255

    return X_train, y_train

In [15]:

def prepare_testdata_array(train):
    """Convert an unlabelled pixel table (with an "ID" column) into CNN-ready arrays.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain an "ID" column. Every other column is treated as a
        pixel intensity, and each row must hold 28 * 28 = 784 of them.
        The frame itself is not modified.

    Returns
    -------
    X_train : numpy.ndarray
        float32 array of shape (n_rows, 28, 28, 1) with values divided by 255.
    y_train : pandas.Series
        Copy of the "ID" column (row identifiers, not class labels).
    """
    y_train = train["ID"].copy()
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    pixels = train.drop("ID", axis=1).to_numpy()
    # astype() returns a new array, so the in-place scaling below never
    # touches the caller's DataFrame.
    X_train = pixels.reshape(pixels.shape[0], 28, 28, 1).astype('float32')
    X_train /= 255

    return X_train, y_train

### Train Test Split

In [16]:
def split(df):
    """Split ``df`` 80/20 into train and hold-out sets, stratified on "label".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "label" column, which is used as the stratification key.

    Returns
    -------
    (strat_train_set, strat_test_set) : tuple of pandas.DataFrame
        Rows of ``df`` selected for the training and hold-out parts. The
        split is reproducible because the random state is fixed at 824.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=824)
    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, test_index = next(splitter.split(df, df['label']))
    # The indices are positions, not index labels, so rows are taken with
    # iloc; loc would pick the wrong rows (or raise) on a non-default index.
    strat_train_set = df.iloc[train_index]
    strat_test_set = df.iloc[test_index]
    return strat_train_set, strat_test_set

In [17]:
train_data = train.copy()
# Split once and unpack both halves; indexing two separate split() calls
# repeated the same seeded shuffle twice for an identical result.
train1, test1 = split(train_data)

In [18]:
# Share of each class in the training split, to check the stratification.
train1["label"].value_counts() / len(train1)

9    0.1
8    0.1
7    0.1
6    0.1
5    0.1
4    0.1
3    0.1
2    0.1
1    0.1
0    0.1
Name: label, dtype: float64

In [19]:
# Share of each class in the hold-out split, to check the stratification.
test1["label"].value_counts() / len(test1)

7    0.1
6    0.1
5    0.1
4    0.1
3    0.1
2    0.1
9    0.1
1    0.1
8    0.1
0    0.1
Name: label, dtype: float64

# Model Fit - CNN

In [20]:
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

def build_model(X_train=None, y_train=None):
    """Build and compile the CNN classifier for 28x28 single-channel images.

    Architecture: three 3x3 'same'-padded ReLU convolutions (32, 64 and 128
    filters) interleaved with batch normalisation and dropout, a 2x2 max-pool
    after the second convolution, then Dense(1024) -> Dense(512) -> a 10-way
    softmax output.

    Parameters
    ----------
    X_train, y_train : ignored
        Not used by the function; kept (now optional) so existing calls of
        the form ``build_model(X, y)`` continue to work. The input shape is
        fixed at (28, 28, 1).

    Returns
    -------
    tf.keras.models.Sequential
        Model compiled with Adam (learning rate 0.001), the
        'sparse_categorical_crossentropy' loss and the 'accuracy' metric.
    """
    layers = tf.keras.layers

    model = tf.keras.models.Sequential([
        layers.Conv2D(32, (3, 3), padding='same', strides=1, activation='relu',
                      input_shape=(28, 28, 1)),
        layers.BatchNormalization(),
        layers.Dropout(0.25),
        layers.Conv2D(64, (3, 3), padding='same', strides=1, activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.2),
        layers.Conv2D(128, (3, 3), padding='same', strides=1, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.25),
        layers.Flatten(),
        layers.Dense(1024, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(512, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(10, activation='softmax'),
    ])

    # Use the tf.keras optimizer so it matches the tf.keras model, and the
    # `learning_rate` keyword: the old `lr` alias is deprecated and rejected
    # by newer Keras releases.
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model
    

In [21]:
# Re-display the shape of the full training frame.
train.shape

(60000, 785)

#### Using Train/Test Split data

In [22]:
# Turn both halves of the stratified split into image arrays plus labels.
X_train1, y_train1 = prepare_data_array(train1)
X_test1, y_test1 = prepare_data_array(test1)

### Input Augmentation

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Learning-rate schedule: 1e-3 * 0.9 ** x for the value x passed in by the callback.
reduce_lr = LearningRateScheduler(lambda x: 1e-3 * 0.9 ** x)

# Random transformations used to augment the training images.
augmentation_options = dict(
    rotation_range=8,          # random rotation, in degrees
    zoom_range=0.1,            # random zoom
    shear_range=0.3,           # shear angle (counter-clockwise, degrees)
    width_shift_range=0.08,    # horizontal shift, as a fraction of total width
    height_shift_range=0.08,   # vertical shift, as a fraction of total height
    # NOTE(review): this flips images top-to-bottom; confirm that upside-down
    # garments are intended rather than horizontal_flip.
    vertical_flip=True,
)
dataarg = ImageDataGenerator(**augmentation_options)

### Fit Model

In [23]:
# NOTE(review): per the Keras docs, ImageDataGenerator.fit() is only needed for the
# featurewise/ZCA options; kept here unchanged — confirm whether it can be dropped.
dataarg.fit(X_train1)
m8 = build_model(X_train1, y_train1)
reduce_lr = LearningRateScheduler(lambda x: 1e-3 * 0.9 ** x)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1, mode='auto')
# `epochs` and `verbose` are arguments of Model.fit(), not of
# ImageDataGenerator.flow(); passing them to flow() raises a TypeError.
m8.fit(
    dataarg.flow(X_train1, y_train1, batch_size=64),
    epochs=20,
    verbose=1,
    validation_data=(X_test1, y_test1),
    callbacks=[reduce_lr, early_stopping],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x19b94f20848>

## Predict

#### Predict test-set class probabilities with model `m8`

In [24]:
# Score the Kaggle test set: one row per image, one column per output unit of m8.
test_data = test.copy()
X_test, y_test = prepare_testdata_array(test_data)  # y_test holds the "ID" column here
label_test = pd.DataFrame(m8.predict(X_test))
# Put the image IDs in front of the prediction columns.
label_df = pd.concat([test_data['ID'], label_test], axis=1)

### Output

In [25]:
# Write the ID + prediction table to the data directory, without the index column.
submission_path = os.path.join(path, 'prob_test_Dec19test2.csv')
label_df.to_csv(submission_path, index=False)