In [7]:
# general packages
import os
import cv2
import gc
import math
import random
import warnings
import numpy as np
import pandas as pd
from PIL import Image
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

#sklearns
from sklearn.metrics import cohen_kappa_score, accuracy_score
from sklearn.model_selection import train_test_split

# keras modules
import tensorflow as tf
import keras
from keras.applications.densenet import DenseNet121, DenseNet169, DenseNet201
from keras.optimizers import Adam, Nadam, SGD
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Model, load_model, Sequential
from keras.layers import Dense, GlobalAveragePooling2D, Dropout, Conv2D, GlobalMaxPooling2D, concatenate
from keras.layers import (MaxPooling2D, Input, Average, Activation, MaxPool2D,
                          Flatten, LeakyReLU, BatchNormalization)
from keras import models
from keras import layers
# from keras.preprocessing.image import load_img
# from keras.preprocessing.image import img_to_array

from keras.utils import Sequence
from keras import utils as np_utils
from keras.callbacks import (Callback, ModelCheckpoint,
                             LearningRateScheduler, EarlyStopping,
                             ReduceLROnPlateau, CSVLogger)

from dask import dataframe as dd
from dask.distributed import Client

warnings.simplefilter('ignore')
sns.set_style('whitegrid')

In [8]:
def readHugeCsvFileAsDataFrame(file_path):
    dask_df = dd.read_csv(file_path, low_memory=False, blocksize=50000)  # 50MB chunk-size
    return dask_df.compute()

In [9]:
path_to_file = 'D:\\ML_Sessional\\train.csv'
scheduler = Client()
df = scheduler.submit(readHugeCsvFileAsDataFrame, path_to_file)
workingDataFrame = df.result()

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme
0,Train_0,15,9,5,ক্ট্রো
1,Train_1,159,0,0,হ
2,Train_2,22,3,5,খ্রী
3,Train_3,53,2,2,র্টি
4,Train_4,71,9,5,থ্রো
...,...,...,...,...,...
990,Train_200835,22,7,2,র্খে
991,Train_200836,65,9,0,ত্তো
992,Train_200837,2,1,4,অ্যা
993,Train_200838,152,9,0,স্নো


In [10]:
from sklearn.utils import shuffle
workingDataFrame = shuffle(workingDataFrame)
workingDataFrame.reset_index(inplace=True, drop=True)

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme
0,Train_113802,5,0,0,ঈ
1,Train_104015,85,1,0,ন্ডা
2,Train_26874,134,1,0,শ্চা
3,Train_87607,151,7,0,স্থে
4,Train_167117,107,2,0,বি
...,...,...,...,...,...
200835,Train_45425,70,0,4,ত্ম্য
200836,Train_19447,136,0,2,র্শ্ব
200837,Train_67556,92,1,0,ন্না
200838,Train_189207,85,2,0,ন্ডি


adding a new column `kfold` and giving all value as `-1`

In [12]:
workingDataFrame.loc[:,'kfold'] = -1
workingDataFrame

Unnamed: 0,image_id,grapheme_root,vowel_diacritic,consonant_diacritic,grapheme,kfold
0,Train_113802,5,0,0,ঈ,-1
1,Train_104015,85,1,0,ন্ডা,-1
2,Train_26874,134,1,0,শ্চা,-1
3,Train_87607,151,7,0,স্থে,-1
4,Train_167117,107,2,0,বি,-1
...,...,...,...,...,...,...
200835,Train_45425,70,0,4,ত্ম্য,-1
200836,Train_19447,136,0,2,র্শ্ব,-1
200837,Train_67556,92,1,0,ন্না,-1
200838,Train_189207,85,2,0,ন্ডি,-1


In [18]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

x = workingDataFrame.image_id.values
y = workingDataFrame[['grapheme_root','vowel_diacritic','consonant_diacritic']].values

mskf = MultilabelStratifiedKFold(n_splits=5)

for fold, (train,validation) in enumerate(mskf.split(x,y)):
    # print("train : ",train," Validation : ",validation)
    workingDataFrame.loc[validation,"kfold"] = fold

# print(workingDataFrame.kfold.value_counts())
workingDataFrame.to_csv('../input/train_folds.csv',index=False)

train :  [     0      5      6 ... 200836 200838 200839]  Validation :  [     1      2      3 ... 200826 200827 200837]
train :  [     0      1      2 ... 200837 200838 200839]  Validation :  [     6     19     24 ... 200828 200832 200836]
train :  [     0      1      2 ... 200836 200837 200838]  Validation :  [     5     20     23 ... 200831 200834 200839]
train :  [     1      2      3 ... 200836 200837 200839]  Validation :  [     0      7     22 ... 200830 200835 200838]
train :  [     0      1      2 ... 200837 200838 200839]  Validation :  [    16     18     21 ... 200824 200829 200833]
3    40168
0    40168
2    40168
1    40168
4    40168
Name: kfold, dtype: int64
