# Exploratory data analysis

In [1]:
import os
from config import get_cfg_defaults
import pandas as pd

from utils import setup_determinism

cfg = get_cfg_defaults()
cfg.freeze()

setup_determinism(cfg.SYSTEM.SEED)

In [2]:
cfg.CONST.LABELS

['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

## Generate CSV

In [3]:
def gen_csv(mode='train'):
    """
        Generate kaggle csv format dataframe (id, label) 
        label: index in LABELS config
    """
    data = os.listdir(os.path.join(cfg.DIRS.DATA, mode))
    labels = list(map(lambda x: cfg.CONST.LABELS.index(x.split('.')[0].split('_')[1]), data))
    df = pd.DataFrame(zip(data, labels), columns=['id', 'label'])
    df.to_csv(os.path.join(cfg.DIRS.DATA, f'{mode}.csv'), index=False)

gen_csv('train')
gen_csv('test')

## Analysis

In [4]:
df = pd.read_csv(os.path.join(cfg.DIRS.DATA, 'train.csv'))

In [5]:
df.head()

Unnamed: 0,id,label
0,20932_cat.png,3
1,11680_cat.png,3
2,18583_automobile.png,1
3,30075_horse.png,7
4,29692_cat.png,3


In [6]:
grouped_df = df.groupby('label')
grouped_df.count()

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,5000
1,5000
2,5000
3,5000
4,5000
5,5000
6,5000
7,5000
8,5000
9,5000


In [7]:
grouped_df.get_group(0).reset_index(drop=True)

Unnamed: 0,id,label
0,16191_airplane.png,0
1,32025_airplane.png,0
2,29480_airplane.png,0
3,20790_airplane.png,0
4,3721_airplane.png,0
...,...,...
4995,13312_airplane.png,0
4996,46632_airplane.png,0
4997,41235_airplane.png,0
4998,7196_airplane.png,0


## K-fold split

In [8]:
from sklearn.model_selection import StratifiedKFold

In [9]:
skf = StratifiedKFold(n_splits=5, random_state=cfg.SYSTEM.SEED, shuffle=False)

In [10]:
X, y = df['id'], df['label']

In [11]:
folds_path = os.path.join(cfg.DIRS.DATA, 'folds')
if not os.path.isdir(folds_path):
    os.mkdir(folds_path)

In [12]:
for fold_idx, (train_index, val_index) in enumerate(skf.split(X, y)):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    df_train = pd.DataFrame(zip(X_train, y_train), columns=('id', 'label'))
    df_val = pd.DataFrame(zip(X_val, y_val), columns=('id', 'label'))
    df_train.to_csv(os.path.join(folds_path, f'train_fold{fold_idx}.csv'), index=False)
    df_val.to_csv(os.path.join(folds_path, f'valid_fold{fold_idx}.csv'), index=False)