In [7]:
import yaml

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
%cd /content/drive/MyDrive/Colab Notebooks/first_take/src

/content/drive/MyDrive/Colab Notebooks/first_take/src


In [10]:
with open('./config.yaml') as file:
    config = yaml.safe_load(file.read())

In [11]:
config

{'cat_negative_value': True,
 'cm_class': ['0', '1'],
 'custom_eval': None,
 'debug': False,
 'do_log_scale': True,
 'do_weight': False,
 'feval': None,
 'fname_cm': 'cm.png',
 'fname_importance': 'feature_importance.png',
 'fname_log_cv': 'log_cv',
 'fname_log_fold': 'log_fold',
 'fname_log_pp': 'log_pp',
 'fname_submission': 'submission.csv',
 'id': 'object_id',
 'input_dir': './src/input/',
 'input_dir_jn': '../src/input/',
 'input_dir_root': './input/',
 'input_dir_root_jn': '../input/',
 'model': 'lgbm',
 'num_class': 1,
 'num_folds': 5,
 'num_seed': 1,
 'optim_th': False,
 'output_dir': './src/output/',
 'output_dir_cv': './src/output/',
 'output_dir_fold': '../src/input/',
 'output_dir_jn': '../src/output/',
 'output_dir_pp': '../src/input/',
 'seed': 1996,
 'shuffle': False,
 'split_group': 'art_series_id',
 'split_target': 'likes',
 'split_type': 'group',
 'target': 'likes',
 'task': 'regression'}

In [12]:
#===========================================================
# Config
#===========================================================
import os

df_path_dict = {
    'train': os.path.join(config['input_dir_root_jn'], 'train.csv'),
    'test': os.path.join(config['input_dir_root_jn'], 'test.csv'),
    'sample_submission': os.path.join(config['input_dir_root_jn'], 'sample_submission.csv')
}

In [13]:
#!pip install torch

In [14]:
#===========================================================
# Library
#===========================================================
import gc
import json
import os
import random
import sys
import time
import warnings
from collections import Counter, defaultdict
from contextlib import contextmanager
from functools import partial
from logging import INFO, FileHandler, Formatter, StreamHandler, getLogger

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import torch
from sklearn.model_selection import (GroupKFold, GroupShuffleSplit, KFold,
                                     StratifiedKFold)
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [15]:
config['output_dir_jn']

'../src/output/'

In [16]:
#===========================================================
# Utils
#===========================================================

def seed_everything(seed=1996):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


@contextmanager
def timer(name):
    t0 = time.time()
    logger.info(f'[{name}] start')
    yield
    logger.info(f'[{name}] done in {time.time() - t0:.0f} s')
    logger.info('')


def get_logger(filename='log'):
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log", mode='w')
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

logger = get_logger(os.path.join(config['output_dir_jn'], config['fname_log_fold']))

def load_df(path, df_name, config):
    if path.split('.')[-1]=='csv':
        if config['debug']:
            df = pd.read_csv(path, nrows=1000)
        else:
            df = pd.read_csv(path)
    elif path.split('.')[-1]=='pkl':
        df = pd.read_pickle(path)
    logger.info(f"{df_name} shape / {df.shape} ")
    return df

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logger.info('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [17]:
#===========================================================
# Validation
#===========================================================
def stratified_group_k_fold(X, y, groups, k, seed=None):
    labels_num = np.max(y) + 1
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1

    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)
    
    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)
    for i in range(k):
        train_groups = all_groups - groups_per_fold[i]
        test_groups = groups_per_fold[i]

        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        test_indices = [i for i, g in enumerate(groups) if g in test_groups]

        yield train_indices, test_indices

def get_valid_splits(train_df, config):
    # Cross validation model
    if config['split_type'] == "group":
        if config['shuffle']:
            folds = GroupShuffleSplit(n_splits=config['num_folds'], random_state=config['seed'])
            group = LabelEncoder().fit_transform(train_df[config['split_group']].values)
            splits = folds.split(train_df, train_df[config['split_target']], group)
        else:
            folds = GroupKFold(n_splits=config['num_folds'])
            group = LabelEncoder().fit_transform(train_df[config['split_group']].values)
            splits = folds.split(train_df, train_df[config['split_target']], group)
    elif config['split_type'] == "stratified":
        folds = StratifiedKFold(n_splits=config['num_folds'], shuffle=config['shuffle'], random_state=config['seed'])
        splits = folds.split(train_df, train_df[config['split_target']])
    elif config['split_type'] == "kfold":
        folds = KFold(n_splits=config['num_folds'], shuffle=config['shuffle'], random_state=config['seed'])
        splits = folds.split(train_df, train_df[config['split_target']])
    elif config['split_type'] == "stratified_group":
        group = LabelEncoder().fit_transform(train_df[config['split_group']].values)
        splits = stratified_group_k_fold(train_df, train_df[config['split_target']], group, k=config['num_folds'], seed=config['seed'])
    else:
        raise(ValueError("such validation is not defined"))
    
    return splits

In [18]:
#===========================================================
# Make Folds
#===========================================================

def make_folds(train_df, config):
    # Create arrays and dataframes to store results
    seed_everything(config['seed'])

    splits = get_valid_splits(train_df, config)
    
    train_df['folds'] = np.nan

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(splits):
        train_df['folds'].iloc[valid_idx] = n_fold
    
    assert len(train_df) == sum(train_df['folds'].value_counts())
    train_df['folds'] = train_df['folds'].astype(int)

    return train_df

# Main

In [19]:
logger.info('Basic Config')
for k, v in config.items():
    logger.info('   {}: {}'.format(k, v))
logger.info('')

Basic Config
   debug: False
   task: regression
   input_dir_root: ./input/
   input_dir_root_jn: ../input/
   input_dir: ./src/input/
   input_dir_jn: ../src/input/
   output_dir: ./src/output/
   output_dir_jn: ../src/output/
   output_dir_fold: ../src/input/
   output_dir_pp: ../src/input/
   output_dir_cv: ./src/output/
   fname_submission: submission.csv
   fname_log_fold: log_fold
   fname_log_pp: log_pp
   fname_log_cv: log_cv
   fname_importance: feature_importance.png
   fname_cm: cm.png
   cm_class: ['0', '1']
   id: object_id
   target: likes
   seed: 1996
   num_folds: 5
   split_type: group
   split_group: art_series_id
   split_target: likes
   shuffle: False
   model: lgbm
   num_class: 1
   num_seed: 1
   do_log_scale: True
   cat_negative_value: True
   do_weight: False
   custom_eval: None
   feval: None
   optim_th: False



In [20]:
with timer('Data Loading'):
    train = load_df(path=df_path_dict['train'], df_name='train', config=config)
    test = load_df(path=df_path_dict['train'], df_name='train', config=config)
    gc.collect()

[Data Loading] start
train shape / (12026, 19) 
train shape / (12026, 19) 
[Data Loading] done in 1 s



In [21]:
train

Unnamed: 0,object_id,art_series_id,title,description,long_title,principal_maker,principal_or_first_maker,sub_title,copyright_holder,more_title,acquisition_method,acquisition_date,acquisition_credit_line,dating_presenting_date,dating_sorting_date,dating_period,dating_year_early,dating_year_late,likes
0,0011d6be41612ec9eae3,93c092ba70beab248f31,The Avenue of Birches,,"The Avenue of Birches, Jan Hackaert, 1660 - 1685",Jan Hackaert,Jan Hackaert,h 66.5cm × w 53.7cm × t 2.5cm × d 4.7cm,,The Avenue of Birches,purchase,1808-01-01T00:00:00,,1660 - 1685,1660.0,17,1660.0,1685.0,48
1,0012765f7a97ccc3e9e9,95c14fb11c54281ad7e0,Struik in bloei,,"Struik in bloei, Richard Tepe (attributed to),...",Richard Tepe,Richard Tepe,h 165mm × w 223mm,erven Richard Tepe,Struik in bloei,purchase,2000-01-01T00:00:00,,c. 1900 - c. 1930,1900.0,19,1900.0,1930.0,2
2,0017be8caa87206532cb,4c406890d208fe01f8fb,Portret van een vrouw,"Portret van eenvrouw, zittend naast een tafel.","Portret van een vrouw, Tresling & Comp., 1860 ...",Tresling & Comp.,Tresling & Comp.,h 87mm × w 56mm,,Portret van een vrouw,gift,2007-01-01T00:00:00,"Gift of M.M. Boom, Leiden",1860 - 1880,1860.0,19,1860.0,1880.0,5
3,00181d86ff1a7b95864e,fa0891535500a4973db2,A St Bernard Dog,"Een sint-bernardshond, staand in een landschap...","A St Bernard Dog, Bernard te Gempt, c. 1850 - ...",Bernard te Gempt,Bernard te Gempt,h 179cm × w 248cm × t 4cm,,A St Bernard Dog,bequest,1881-01-01T00:00:00,"J. Hilman Bequest, Amsterdam",c. 1850 - c. 1879,1850.0,19,1850.0,1879.0,100
4,001c52ae28ec106d9cd5,8065ed5063c5887e677d,Woelige zee met zeilschepen,Woelige zee met zeilschepen.,"Woelige zee met zeilschepen, anonymous, 1825 -...",anonymous,anonymous,h 13cm × w 17.5cm × d 0.7cm,,Woelige zee met zeilschepen,unknown,1971-01-01T00:00:00,,1825 - 1874,1825.0,19,1825.0,1874.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12021,ffedf8af4fd5b3873164,2e4695e7f3260d52b3e6,De schilder H.W. Mesdag voor een doek,,"De schilder H.W. Mesdag voor een doek, Delboy ...",Delboy & Baer,Delboy & Baer,h 90mm × w 140mm,,De schilder H.W. Mesdag voor een doek H W Mesdag,transfer,1994-01-01T00:00:00,,1900 - 1920,1900.0,19,1900.0,1920.0,4
12022,ffee34705ea44e1a0f79,7aa656a9ef243d93d009,"Kaart van de streek tussen Mannheim en Landau,...","Kaart van de streek tussen Mannheim, Speyer en...","Kaart van de streek tussen Mannheim en Landau,...",Abraham Allard,Abraham Allard,h 245mm × w 278mm,,"Kaart van de streek tussen Mannheim en Landau,...",purchase,1881-01-01T00:00:00,,1701 - 1714,1701.0,18,1701.0,1714.0,0
12023,ffefbe1faf771aa4f790,e79c2e74ed17533a7e56,Storm op het IJ aan het Blauwhoofd van Amsterd...,Zeilschepen in de problemen op het IJ ter hoog...,Storm op het IJ aan het Blauwhoofd van Amsterd...,Noach van der Meer (II),Noach van der Meer (II),h 217mm × w 306mm,,Storm op het IJ aan het Blauwhoofd van Amsterd...,transfer,1887-01-01T00:00:00,,1778,1778.0,18,1778.0,1778.0,0
12024,fff08e76cbb969eaddc7,510358b74c1104edbbbd,"Het rad van fortuin, ca. 1689","Het rad van fortuin, ca. 1689. Spotprent op de...","Het rad van fortuin, ca. 1689, Romeyn de Hoogh...",Romeyn de Hooghe,Romeyn de Hooghe,h 560mm × w 405mm,,"Het rad van fortuin, ca. 1689 'T Hedendaags Ra...",purchase,1881-01-01T00:00:00,,1689 - 1690,1689.0,17,1689.0,1690.0,14


In [22]:
with timer('Split Folds'):
    train_folds = make_folds(train, config)

[Split Folds] start
[Split Folds] done in 0 s



In [23]:
logger.info(train_folds[[config['id'], config['target'], 'folds']])
logger.info(train_folds['folds'].value_counts())

                  object_id  likes  folds
0      0011d6be41612ec9eae3     48      0
1      0012765f7a97ccc3e9e9      2      0
2      0017be8caa87206532cb      5      2
3      00181d86ff1a7b95864e    100      2
4      001c52ae28ec106d9cd5      7      1
...                     ...    ...    ...
12021  ffedf8af4fd5b3873164      4      2
12022  ffee34705ea44e1a0f79      0      1
12023  ffefbe1faf771aa4f790      0      2
12024  fff08e76cbb969eaddc7     14      1
12025  fff1d87d79953ddab2c6    171      0

[12026 rows x 3 columns]
0    2406
4    2405
3    2405
2    2405
1    2405
Name: folds, dtype: int64


In [24]:
with timer('Save folds.csv'):
    train_folds[[config['id'], config['target'], 'folds']].to_csv(config['input_dir_jn']+'folds.csv', index=False)

[Save folds.csv] start
[Save folds.csv] done in 0 s



In [25]:
config['input_dir_jn']

'../src/input/'