In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import sys
import gc

In [2]:
def find_tresh_val(feature, tresholds):
    tresh_val = []
    for i in range(len(tresholds)-1):
        a = tresholds[i]
        b = tresholds[i+1]
        mean_val = df[(df[feature] > a) & (df[feature] < b)][feature].mean()
        tresh_val.append(mean_val)
    return tresh_val

def convert(x, feature, tresholds: list, tresh_val: list):
    for i in range(len(tresholds)-1):
        a = tresholds[i]
        b = tresholds[i+1]
        if (x > a and x < b):
            return round(tresh_val[i], 1)
        
def last_day(date):
    time_interval = date.split('-')
    year, month, day = int(time_interval[0]), int(time_interval[1]), int(time_interval[2])
    if (day == calendar.monthrange(year, month)[1]):
        return 1
    return 0

def next_month(date):
    time_interval = date.split('-')
    year, month, day = int(time_interval[0]), int(time_interval[1]), time_interval[2]
    month = month+1
    if month == 13:
        year, month = year+1, 1
    if ((month == 2) and (int(day)>28)):
        day = 28
    if((month == 4 or month == 6 or month == 9 or month == 11) and (int(day)==31)):
        day = 30
    if int(month/10) == 0:
        month = '0'+str(month)
    new_date = [str(year), str(month), str(day)]
    return '-'.join(new_date)

def transform_date(customer):
    payment_history_val = customer.S_2.values
    payment_history = pd.Series(payment_history_val)
    res = [payment_history.values[0]]
    for i in range(payment_history.shape[0]-1):
        res.append(next_month(res[i]))
    ser = pd.Series(res, copy=False)
    actual_date = pd.to_datetime(payment_history)
    desirable_date = pd.to_datetime(ser)
    mean_diff = 0
    for diff in (actual_date - desirable_date).iloc:
        mean_diff += diff.days
    mean_diff = mean_diff/payment_history.shape[0]
    def sigmoid(x):
        return 1 / (1 + np.exp(-0.5*x))
    return sigmoid(mean_diff)

In [3]:
cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']

columns_to_del = ['D_42', 'D_43', 'D_46', 'D_49', 'D_50', 'D_53', 'D_56', 'S_9', 'B_17',
                  'D_66', 'D_73', 'D_76', 'D_77', 'R_9', 'D_82', 'B_29', 'D_87', 'D_88',
                  'D_105', 'D_106', 'R_26', 'D_108', 'D_110', 'D_111', 'B_39', 'S_27', 
                  'B_42', 'D_132', 'D_134', 'D_135', 'D_136', 'D_137', 'D_138', 'D_142']

In [4]:
tresholds_for_features = dict({
                               'R_2'  : [0, 0.3, 1.3],
                               'B_8'  : [-0.1, 0.3, 1.3],
                               'S_6'  : [0, 0.3, 1.3],
                               'D_54' : [-0.1, 0.3, 1.3],
                               'R_4'  : [0, 0.3, 1.3], 
                               'P_4'  : [0, 0.3, 1.3], 
                               'B_33' : [0, 0.3, 1.3], 
                               'D_103' : [0, 0.3, 1.3], 
                               'D_104' : [0, 0.3, 1.3], 
                               'R_27'  : [0, 0.3, 1.3],  
                               'D_112' : [0, 0.3, 1.3], 
                               'D_123' : [0, 0.3, 1.3], 
                               'D_127' : [0, 0.3, 1.3], 
                               'D_128' : [0, 0.3, 1.3], 
                               'D_129' : [0, 0.3, 1.3], 
                               'D_130' : [0, 0.3, 1.3], 
                               'D_131' : [0, 0.3, 1.3], 
                               'D_139' : [0, 0.3, 1.3], 
                               'D_141' : [0, 0.3, 1.3], 
                               'D_143' : [0, 0.3, 1.3],
                               'D_51' : [0, 0.2, 0.5,0.8, 1.1], 
                               'B_22' : [0, 0.2, 0.7],
                               'D_70' : [0, 0.2, 0.4, 0.6, 0.8],
                               'D_79' : [0, 0.2, 0.7],
                               'D_80' : [0, 0.2, 0.4, 0.6, 0.8],
                               'R_12' : [-1, 0.4, 2],
                               'D_91' : [0, 0.2, 0.7],
                               'D_92' : [0, 0.5, 1.5],
                               'D_107': [0, 0.2, 0.5, 0.8, 1.2],
                               'D_113': [0, 0.1, 0.3, 0.5, 0,7, 0.9]
                               
})

In [5]:
def quantile(x, q):
    return np.quantile(x, q)

quantile_025 = lambda x: quantile(x,0.25)
quantile_025.__name__ = '025'

quantile_075 = lambda x: quantile(x,0.75)
quantile_075.__name__ = '075'

In [6]:
chunks = pd.read_csv('train_data.csv', chunksize=32768, encoding = 'utf-8', sep = ',')

In [7]:
%%time
for i in tqdm(range(169)):
    cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']
    next_chunk = chunks.__next__()
    df = next_chunk.drop(columns_to_del, axis = 1)
    del(next_chunk)
    _ = gc.collect()
    for feature in tresholds_for_features:
        tresh_val = find_tresh_val(feature, tresholds_for_features.get(feature))
        df[feature] = df[feature].apply(lambda x: convert(x, feature, tresholds_for_features.get(feature), tresh_val))
    cat_features += list(tresholds_for_features.keys())
    res = []
    for feature in cat_features:
        res.append(df[feature].mode().values[0])
    global_modes_val = dict(zip(cat_features, res))
    customers = df.customer_ID.unique()
    clear_cat = pd.DataFrame()
    for customer in customers:
        customer_data_cat = df[df['customer_ID'] == customer][cat_features]
        if (1 in customer_data_cat.isna().mean().values):
            clear_cat = pd.concat([clear_cat, customer_data_cat.fillna(value=global_modes_val)])
        elif customer_data_cat.shape[0] >= 1:
            values = dict(zip(cat_features, customer_data_cat.mode().values[0]))
            clear_cat = pd.concat([clear_cat, customer_data_cat.fillna(value=values)])
    df[cat_features] = clear_cat
    continuous_features = df.drop(cat_features+['customer_ID', 'S_2'], axis=1).columns.to_list()
    res = []
    for feature in continuous_features:
        res.append(df[feature].median())
    continuous_val = dict(zip(continuous_features, res))
    df = df.fillna(value=continuous_val)
    new_S_2 = []
    for customer in customers:
        customer_data = df[df['customer_ID'] == customer]
        new_S_2.append(transform_date(customer_data))
    categ_df = pd.get_dummies(df[cat_features], columns=cat_features)
    categ_df['customer_ID'] = df['customer_ID']
    categ_df = categ_df.groupby('customer_ID').agg('sum').reset_index()
    functions = [['mean', 'std', 'max', 'min', 'first', 'last', quantile_025, quantile_075]]*len(continuous_features)
    dict_agg = dict(zip(continuous_features, functions))
    df_con = df.groupby('customer_ID').agg(dict_agg).reset_index()
    df_con.columns = ['customer_ID']+[c[0]+'_'+c[1] for c in df_con.columns[1:]]
    df_con = df_con.fillna(0)
    df_con['S_2'] = new_S_2
    del df
    _ = gc.collect()
    clear_df = df_con.merge(categ_df, on='customer_ID')
    del df_con, categ_df, clear_cat
    _ = gc.collect()
    clear_df.to_csv('train/'+str(i)+'th_chunk.csv', sep=';', index=False)

100%|█████████████████████████████████████████████████████████████████████████████| 169/169 [7:53:44<00:00, 168.19s/it]

Wall time: 7h 53min 44s



