In [1]:
import cudf
# Select the "managed" allocator immediately after importing cudf, before any
# other GPU library is imported or any frame is created.
# NOTE(review): presumably this switches cudf to CUDA managed (unified) memory
# so the large parquet files can exceed device memory — confirm against the
# cudf/RMM docs for the installed version.
cudf.set_allocator("managed")
# NOTE(review): pandas, xgboost, numpy and tqdm are not referenced in the
# visible cells; keep or prune depending on the rest of the notebook.
import pandas as pd
import cupy
import xgboost as xgb
import numpy as np
from tqdm import tqdm
# Last expression of the cell: displays the installed cudf version string.
cudf.__version__

'22.06.00a+319.g97422602b8'

In [2]:
def get_not_used():
    """Return the names of the columns that are never treated as features.

    A new list is built on every call, so callers may concatenate or mutate
    the result freely.
    """
    excluded = (
        'row_id',       # order of the rows in the input frame
        'customer_ID',  # raw customer identifier
        'target',
        'cid',          # label encoding of customer_ID
        'S_2',
    )
    return list(excluded)
    
def preprocess(df):
    """Engineer row-level features and per-customer statistics.

    Steps, in order:
      1. tag every row with its position (``row_id``);
      2. round every column that is neither categorical nor in
         ``get_not_used()`` to 2 decimals;
      3. add "after pay" difference columns ``<base>-P_2`` / ``<base>-P_3``;
      4. parse ``S_2`` as datetime and label-encode ``customer_ID`` into ``cid``;
      5. compute mean/std per customer for every numeric column via
         ``add_stats_step``.

    Note: steps 1-4 assign columns on ``df`` itself, so the caller's frame is
    modified in place.

    Parameters
    ----------
    df : DataFrame
        Row-level frame containing at least ``customer_ID``, ``S_2``, ``P_2``
        and ``P_3``.

    Returns
    -------
    tuple
        ``(frame, stat_frames)`` where ``frame`` is the row-level frame sorted
        by original row position with ``row_id`` removed, and ``stat_frames``
        is the list returned by ``add_stats_step``.
    """
    df['row_id'] = cupy.arange(df.shape[0])

    categorical = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
                   'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
    # Everything listed here is skipped by both the rounding pass and the
    # numeric-column selection further down.
    excluded = get_not_used() + categorical

    roundable = [name for name in df.columns if name not in excluded]
    for name in roundable:
        df[name] = df[name].round(2)

    # "After pay" features: selected columns minus each of P_2 / P_3.
    after_pay_bases = ['B_11', 'B_14', 'B_17', 'D_39', 'D_131', 'S_16', 'S_23']
    for base in after_pay_bases:
        if base not in df.columns:
            continue
        for pay in ('P_2', 'P_3'):
            df[f'{base}-{pay}'] = df[base] - df[pay]

    df['S_2'] = cudf.to_datetime(df['S_2'])
    codes, _ = df.customer_ID.factorize()
    df['cid'] = codes

    # Re-scan the columns so the freshly added difference features are
    # included in the statistics.
    numeric = [name for name in df.columns if name not in excluded]
    stat_frames = add_stats_step(df, numeric)

    # Guarantee the original row order on the way out, then discard the
    # helper column that encoded it.
    ordered = df.sort_values('row_id').drop(['row_id'], axis=1)
    return ordered, stat_frames

def add_stats_step(df, cols, chunk_size=50):
    """Aggregate per-customer statistics for ``cols`` in fixed-size chunks.

    The column list is cut into consecutive slices of at most ``chunk_size``
    names and each slice is handed to ``add_stats_one_shot`` separately, so a
    single groupby never aggregates more than ``chunk_size`` columns.

    Parameters
    ----------
    df : DataFrame
        Row-level frame passed through unchanged to ``add_stats_one_shot``.
    cols : list of str
        Columns to aggregate.
    chunk_size : int, default 50
        Maximum number of columns aggregated per groupby call. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    list
        One result of ``add_stats_one_shot`` per chunk, in column order.
        Empty when ``cols`` is empty.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be >= 1, got {chunk_size}')
    # Slicing past the end of the list is safe, so the final (shorter) chunk
    # needs no special-casing.
    return [
        add_stats_one_shot(df, cols[start:start + chunk_size])
        for start in range(0, len(cols), chunk_size)
    ]

def add_stats_one_shot(df, cols):
    """Compute mean and std of ``cols`` per ``customer_ID`` in one groupby.

    Parameters
    ----------
    df : DataFrame
        Frame with a ``customer_ID`` column and every column in ``cols``.
    cols : list of str
        Columns to aggregate.

    Returns
    -------
    DataFrame
        One row per customer with ``customer_ID`` as a regular column followed
        by ``<col>_mean`` and ``<col>_std`` for each entry of ``cols``.
    """
    stats = ['mean', 'std']
    grouped = df.groupby('customer_ID').agg(dict.fromkeys(cols, stats))
    # Replace the two-level (column, stat) header with flat names; this relies
    # on the aggregation emitting columns in ``cols`` x ``stats`` order.
    grouped.columns = [f'{name}_{stat}' for name in cols for stat in stats]
    return grouped.reset_index()

def load_data(df):
    """Turn a raw row-level frame into the per-customer feature table.

    Thin entry point that forwards to ``process_data`` and returns its result.
    """
    return process_data(df)

def process_data(df):
    """Collapse the row-level frame to one row per customer plus statistics.

    Runs ``preprocess``, keeps only the last row of every ``customer_ID``,
    left-joins each per-customer statistics frame onto it, and finally removes
    any column whose name ends in ``_diff``.

    Parameters
    ----------
    df : DataFrame
        Raw row-level frame; ``preprocess`` modifies it in place.

    Returns
    -------
    DataFrame
        One row per customer with the last-row features and the aggregated
        statistics columns.
    """
    frame, stat_frames = preprocess(df)
    # Rebind the same name so the full row-level frame can be released as soon
    # as the de-duplicated one exists.
    frame = frame.drop_duplicates('customer_ID', keep='last')
    for stats in stat_frames:
        frame = frame.merge(stats, on='customer_ID', how='left')
    leftover_diffs = [name for name in frame.columns if name.endswith('_diff')]
    return frame.drop(leftover_diffs, axis=1)

In [None]:
# Read the raw training rows from parquet into a cudf frame.
train_data = cudf.read_parquet('Data/train.parquet')
# Last expression of the cell: displays (rows, columns).
train_data.shape

In [None]:
# Build the one-row-per-customer training feature table.
# Note: preprocess() assigns columns on train_data itself, so train_data is
# modified by this call.
train = load_data(train_data)
# Last expression of the cell: displays (rows, columns).
train.shape

In [None]:
# Strip the helper columns and index by customer before exporting.
# Both operations are in place, so this cell can only be run once per
# freshly built `train`.
train.drop(columns=['cid', 'S_2'], inplace=True)
train.set_index('customer_ID', inplace=True)
# Move the table to host memory and persist it as a pandas pickle.
train_data_pandas = train.to_pandas()
train_data_pandas.to_pickle('Data/train_data_aggV3.pkl')

In [3]:
# Read the raw test rows from parquet into a cudf frame.
test_data = cudf.read_parquet('Data/test.parquet')
# Last expression of the cell: displays (rows, columns).
test_data.shape

(11363762, 190)

In [4]:
# Build the one-row-per-customer test feature table.
# Note: preprocess() assigns columns on test_data itself, so test_data is
# modified by this call.
test = load_data(test_data)
# Last expression of the cell: displays (rows, columns).
test.shape

(924621, 587)

In [5]:
# Strip the helper columns and index by customer before exporting.
# Both operations are in place, so this cell can only be run once per
# freshly built `test`.
test.drop(columns=['cid', 'S_2'], inplace=True)
test.set_index('customer_ID', inplace=True)
# Persist the GPU frame directly as parquet.
test.to_parquet('Data/test_data_aggV3.parquet')

In [13]:
# Copy the test feature table to host memory and persist it as a pandas
# pickle. Uses `test` as left by the earlier drop/set_index cell.
test_data_pandas = test.to_pandas()
test_data_pandas.to_pickle('Data/test_data_aggV3.pkl')