In [2]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import json
import os
import datetime as dt
from tqdm import tqdm
import pandas as pd
import numpy as np

In [3]:
# Output directory for the processed, shuffled dataset.
OUT_DIR_PATH = 'dataset/train_simplified_93nature_shuffle'

# Input directory holding the original per-category CSV files (one file per
# label name), for example dataset/train_simplified.
CSV_DIR = "dataset/train_simplified_93nature/"

# Number of parts the shuffled data is split into; normally there is no need
# to change this.
NCSVS = 100

In [4]:
def f2cat(filename: str) -> str:
    """Return the category name for a file name by stripping its final extension.

    Only the last ``.``-separated suffix is removed, so a name that itself
    contains dots (``'a.b.csv'`` -> ``'a.b'``) still maps back to the same
    file when ``'.csv'`` is appended again. A name without any dot is
    returned unchanged.
    """
    return filename.rsplit('.', 1)[0]

class Simplified():
    """Reader for a directory of per-category CSV files.

    A category named ``<category>`` is read from ``<input_path>/<category>.csv``.
    """

    def __init__(self, input_path='dataset/train_simplified/'):
        """Remember the directory that holds the per-category CSV files."""
        self.input_path = input_path

    def list_all_categories(self):
        """Return the category names found in ``input_path``.

        Only entries ending in ``.csv`` are considered, because
        ``read_training_csv`` can only open ``<category>.csv``; other entries
        (checkpoint folders, hidden files, notes) would otherwise be reported
        as categories and fail when read. Names are sorted case-insensitively.
        """
        files = [f for f in os.listdir(self.input_path) if f.endswith('.csv')]
        return sorted([f2cat(f) for f in files], key=str.lower)

    def read_training_csv(self, category, nrows=None, usecols=None, drawing_transform=False):
        """Load ``<input_path>/<category>.csv`` into a DataFrame.

        Parameters
        ----------
        category : str
            Category name; ``'.csv'`` is appended to build the file name.
        nrows : int or None
            Forwarded to ``pandas.read_csv``; ``None`` reads the whole file.
        usecols : list-like, callable or None
            Forwarded to ``pandas.read_csv`` to restrict the columns read.
        drawing_transform : bool
            When true, every value of the ``drawing`` column is decoded with
            ``json.loads``.

        Returns
        -------
        pandas.DataFrame
            The loaded rows. The ``timestamp`` column is parsed as dates
            whenever it is part of the selection.
        """
        read_kwargs = {'nrows': nrows, 'usecols': usecols}

        # pandas raises if parse_dates names a column excluded by usecols, so
        # only request date parsing when 'timestamp' is actually selected.
        if usecols is None:
            parse_timestamp = True
        elif callable(usecols):
            parse_timestamp = bool(usecols('timestamp'))
        else:
            usecols = list(usecols)
            read_kwargs['usecols'] = usecols
            # Positional (non-string) selections cannot be resolved without
            # reading the header, so keep requesting date parsing for them.
            parse_timestamp = ('timestamp' in usecols
                               or not all(isinstance(c, str) for c in usecols))
        if parse_timestamp:
            read_kwargs['parse_dates'] = ['timestamp']

        df = pd.read_csv(os.path.join(self.input_path, category + '.csv'), **read_kwargs)
        if drawing_transform:
            df['drawing'] = df['drawing'].apply(json.loads)
        return df

In [5]:
# Timestamp taken before any processing starts.
start = dt.datetime.now()

# Discover the category names available under CSV_DIR.
s = Simplified(CSV_DIR)
categories = s.list_all_categories()
print("Total categories: {}".format(len(categories)))

# Persist the category names, one per line, to categories.txt.
category_listing = '\n'.join(categories)
with open('categories.txt', 'w') as categories_file:
    categories_file.write(category_listing)

Total categories: 93


628

In [6]:

# Make sure the shard directory exists: DataFrame.to_csv does not create
# missing parent directories.
os.makedirs(OUT_DIR_PATH, exist_ok=True)

# Wrap the list itself (not the enumerate object) so tqdm knows the total and
# can show a real progress bar instead of a bare iteration counter.
for y, cat in enumerate(tqdm(categories)):
    df = s.read_training_csv(cat, nrows=None)
    df['y'] = y                                   # numeric label = position in `categories`
    df['cv'] = (df.key_id // 10 ** 7) % NCSVS     # shard index derived from key_id
    # key_id is only needed to compute the shard index; drop it once per
    # category rather than once per shard.
    rows = df.drop(['key_id'], axis=1)
    # The first category (re)creates every shard file with a header row;
    # all later categories append their rows without a header.
    first_category = (y == 0)
    for k in range(NCSVS):
        filename = 'train_k{}.csv'.format(k)
        chunk = rows[rows.cv == k]
        chunk.to_csv(os.path.join(OUT_DIR_PATH, filename),
                     mode='w' if first_category else 'a',
                     header=first_category, index=False)

0it [00:00, ?it/s]

93it [03:00,  1.94s/it]


In [7]:
# Seeded generator shared by all shards so the shuffle is reproducible on a
# re-run while each shard still gets its own permutation.
SHUFFLE_SEED = 42
rng = np.random.RandomState(SHUFFLE_SEED)

last_shape = None   # shape of the last shard actually processed, if any
for k in tqdm(range(NCSVS)):
    filename = 'train_k{}.csv'.format(k)
    filename = os.path.join(OUT_DIR_PATH, filename)
    # Shards that were already compressed (or never written) are skipped.
    if not os.path.exists(filename):
        continue
    df = pd.read_csv(filename)
    # Shuffle all rows of the shard.
    df = df.sample(frac=1, random_state=rng)
    df.to_csv(filename + '.gz', compression='gzip', index=False)
    # Remove the uncompressed shard only after the gzip copy has been written.
    os.remove(filename)
    last_shape = df.shape

# Report the shape of the last processed shard; print nothing when no shard
# file was found, instead of reading a stale or undefined `df`.
if last_shape is not None:
    print(last_shape)

100%|██████████| 100/100 [55:06<00:00, 33.07s/it]

(134865, 7)



