# ANBW_v3
- To do
    - Improve model performance
    - Overfitting Troubleshooting
    - Reusable Pipeline Configuration
    - Apply appropriate garbage collection

## Setup

In [1]:
import os
# Set TensorFlow's C++ log threshold through the environment.
# NOTE(review): "3" is presumably the most restrictive level, and the variable
# is presumably only honoured if set before TensorFlow is imported - confirm
# against the TensorFlow docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"

In [2]:
import gc
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split

In [3]:
# Report whether TensorFlow lists at least one physical GPU device.
print(
    "CUDA avilable:",
    bool(tf.config.list_physical_devices("GPU")),
)

CUDA avilable: True


In [4]:
# Seed NumPy's and TensorFlow's global random generators with one fixed value.
SEED = 1234
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Load data

In [None]:
# Load the raw train/test tables used by the EDA cells below.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
# The submission template is not loaded at this stage.
# submission = pd.read_csv("./data/submission.csv")

In [None]:
img_save_path = "./img/v3/"

# Make sure the folder that receives the saved figures is present.
if os.path.isdir(img_save_path):
    print("Directory already exists: %s" % img_save_path)
else:
    os.makedirs(img_save_path)
    print("Directory Created: %s" % img_save_path)

## EDA

### Plot saving function

In [None]:
def save_plot(img_path: str) -> None:
    """Save the current matplotlib figure to ``img_path`` unless the file exists.

    An existing file is never overwritten, so re-running a cell keeps the
    image that was saved first.

    Args:
        img_path: Destination file path for the image.
    """
    if os.path.isfile(img_path):
        # The collision is with a file, not a directory, so say so.
        print("File already exists: %s" % img_path)
        return

    plt.savefig(img_path, facecolor="#eeeeee", bbox_inches="tight")
    print("successfully image saved: %s" % img_path)

### Train data

In [None]:
# Preview the first training rows, indexed by the "id" column.
train.set_index("id").head()

In [None]:
# Order the counts by digit so each count lines up with its label below;
# value_counts(sort=False) alone does not guarantee ascending-label order.
train_digit_cnt = train["digit"].value_counts(normalize=False, sort=False).sort_index().values
train_digit_label = sorted(train["digit"].unique())

In [None]:
# Bar chart of the number of training samples per digit.
plt.bar(train_digit_label, train_digit_cnt)
plt.xticks(train_digit_label)
plt.xlabel("digit")
plt.ylabel("count")
plt.title("Train data's digit distribution")

# Persist the figure next to the other v3 images.
img_path = img_save_path + "train_digit_distribution.png"
save_plot(img_path)

In [None]:
# (rows, columns) of the training table.
train.shape

### Test data

In [None]:
# Preview the first test rows, indexed by the "id" column.
test.set_index("id").head()

In [None]:
# (rows, columns) of the test table.
test.shape

### Data visualization

In [None]:
# Draw the first nine training samples on a 3x3 grid, titled with their labels.
plt.figure(figsize=(10, 10))
for idx in range(9):
    ax = plt.subplot(3, 3, idx + 1)
    # Pixel columns start at column "0"; each row holds one 28x28 image.
    img = train.loc[idx, "0":].values.reshape(28, 28).astype(int)
    ax.imshow(img)
    ax.set_title("Digit: %s, Letter: %s" % (train.loc[idx, "digit"], train.loc[idx, "letter"]))
    ax.axis("off")

img_path = img_save_path + "train_visualize.png"
save_plot(img_path)

In [None]:
# Drop every EDA-only name, then run a collection pass.
del train, test
del train_digit_cnt, train_digit_label
del ax, idx, img
del img_path, img_save_path
gc.collect()

## Data preprocessing

In [None]:
def train_generator(path: str):
    """
    Yield preprocessed training images together with one-hot digit labels.

    1. Read the CSV file with pandas
    2. Slice the pixel columns ("0" onward) and the "digit" label column
    3. Zero out faint non-zero pixels (values <= 20) and divide by 255
    4. Convert each image from grayscale to RGB
    5. Resize each image from (28, 28) to (300, 300) with bicubic interpolation
    6. One-hot encode the label over 10 classes and yield it with the image

    Args:
        path: Location of the CSV file. ``bytes`` is accepted as well, since
            ``tf.data.Dataset.from_generator`` hands string ``args`` to the
            generator as byte strings.

    Yields:
        ``(image, label)``: ``image`` is a float32 array of shape
        (300, 300, 3); ``label`` is the length-10 array returned by
        ``tf.keras.utils.to_categorical``.
    """
    if isinstance(path, bytes):
        path = path.decode("utf-8")

    raw_data = pd.read_csv(path)
    features = raw_data.loc[:, "0":].values
    # Explicit row count: reshape(-1, ...) cannot infer the size for zero rows.
    features = features.reshape(len(features), 28, 28, 1)
    # Suppress low-intensity noise before scaling.
    features = np.where((features <= 20) & (features != 0), 0., features)
    features = features / 255
    features = features.astype("float32")
    labels = raw_data["digit"]

    for feat, label in zip(features, labels):
        cvt_feat = cv2.cvtColor(feat, cv2.COLOR_GRAY2RGB)
        resized_feat = cv2.resize(cvt_feat, (300, 300), interpolation=cv2.INTER_CUBIC)
        label = tf.keras.utils.to_categorical(label, 10)

        yield resized_feat, label

In [None]:
def test_generator(path: str):
    """
    Yield preprocessed test images.

    1. Read the CSV file with pandas
    2. Slice the pixel columns ("0" onward)
    3. Zero out faint non-zero pixels (values <= 20) and divide by 255
    4. Convert each image from grayscale to RGB
    5. Resize each image from (28, 28) to (300, 300) with bicubic interpolation
    6. Yield the preprocessed image

    Args:
        path: Location of the CSV file. ``bytes`` is accepted as well, since
            ``tf.data.Dataset.from_generator`` hands string ``args`` to the
            generator as byte strings.

    Yields:
        A float32 array of shape (300, 300, 3) per CSV row.
    """
    if isinstance(path, bytes):
        path = path.decode("utf-8")

    raw_data = pd.read_csv(path)
    features = raw_data.loc[:, "0":].values
    # Explicit row count: reshape(-1, ...) cannot infer the size for zero rows.
    features = features.reshape(len(features), 28, 28, 1)
    # Suppress low-intensity noise before scaling.
    features = np.where((features <= 20) & (features != 0), 0., features)
    features = features / 255
    features = features.astype("float32")

    # Test data carries no labels, so iterate over the images alone.
    for feat in features:
        cvt_feat = cv2.cvtColor(feat, cv2.COLOR_GRAY2RGB)
        resized_feat = cv2.resize(cvt_feat, (300, 300), interpolation=cv2.INTER_CUBIC)

        yield resized_feat

In [None]:
# `args` has to be a tuple of arguments: ("x") is only the string "x", so the
# trailing comma is required. The label shape is likewise written as a 1-tuple.
train_dataset = tf.data.Dataset.from_generator(
    train_generator,
    (tf.float32, tf.int64),
    ((300, 300, 3), (10,)),
    args=("./data/train.csv",),
)
test_dataset = tf.data.Dataset.from_generator(
    test_generator,
    tf.float32,
    (300, 300, 3),
    args=("./data/test.csv",),
)