In [216]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from tqdm import tqdm
from  datetime import datetime, timedelta
from sklearn.metrics import accuracy_score
import pickle

# Notebook switches: which data-loading cell runs, and whether the per-age
# feature tables are rebuilt ('ReGen') or read back from the pickles in ./save.
config = dict(
    READ_DATASETS_FULL=False,
    LOAD_DATASETS_FULL=True,
    ReGen=False,
)
# Shop tags kept for modelling; stored as strings.
legal_tag = [str(tag) for tag in (2, 6, 10, 12, 13, 15, 18, 19, 21, 22, 25, 26, 36, 37, 39, 48)]

In [217]:
# Optional path: parse the raw training CSV when READ_DATASETS_FULL is enabled.
if config['READ_DATASETS_FULL']:
    df = pd.read_csv('tbrain_cc_training_48tags_hash_final.csv')
    # Disabled: upgrade the `tables` package (presumably needed by the to_hdf call below).
    #!pip install --upgrade tables
    print(df.shape)
    # Disabled: cache the parsed frame to HDF5 under key 's'.
    #df.to_hdf('df_all.hdf',key = 's',mode='w')


In [218]:
# Shell escape: print the value stored in /proc/sys/vm/overcommit_memory on this machine.
!cat /proc/sys/vm/overcommit_memory

1


In [219]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype whose range fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the resulting memory usage and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Range limits are inclusive, so a column whose min/max sits exactly on a
    dtype limit (e.g. max == 127) is narrowed to that dtype.
    For float columns only the value range is checked, so narrowing to
    float16/float32 can round the stored values.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            # Smallest signed integer type that holds [c_min, c_max]; if none
            # qualifies (e.g. an empty column whose min/max are NaN) keep the dtype.
            for target in int_targets:
                limits = np.iinfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: fall back to float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero-byte starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [220]:
# Legal_tag: tags that output required
# Load the cached HDF5 copy of the transactions (key 's') and downcast its dtypes.
if config['LOAD_DATASETS_FULL']:
    df = pd.read_hdf('reduced_all.hdf', key='s')
    print('origin data numbers: ', df.shape[0])
    df = reduce_mem_usage(df, verbose=True)
    # To refresh the cache afterwards: df.to_hdf('reduced_all.hdf', key='s', mode='w')


origin data numbers:  32975653
Mem. usage decreased to 4088.24 Mb (0.0% reduction)


In [221]:
# Print the dtype of every column, one per line, in column order.
for column_dtype in df.dtypes:
    print(column_dtype)

int8
int32
object
int16
float64
int16
int16
int8
int16
float16
float16
float16
float16
int16
int16
int16
int16
int16
int16
int16
int16
int16
int16
int16
int16
int16
int8
int16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float64
float16
float16
int8


In [222]:
# Keep only the transactions whose shop_tag is one of the legal tags.
df2 = df.loc[df['shop_tag'].isin(legal_tag)]
print('data after reduction: ', df2.shape[0])

data after reduction:  22130579


In [223]:
def gen_data(df, ID, tags=None):
    """Build one feature row per month (``dt``) for a single customer.

    For every month the customer has transactions in, the row holds:
      * the mean of every numeric column over that month's transactions,
      * ``txn_amt`` / ``txn_cnt`` replaced by the month's totals,
      * ``tag_<t>_cnt`` / ``tag_<t>_amt`` for every tag ``t`` in ``tags``
        (0 when the customer has no transaction with that tag in the month),
      * ``label``: the ``shop_tag`` of the largest-``txn_amt`` transaction of
        the customer's *next* month (absent on the last month).

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions; must contain ``chid``, ``dt``, ``shop_tag``, ``txn_amt``
        and ``txn_cnt``.
    ID : scalar
        The ``chid`` value to extract.
    tags : sequence, optional
        Tags to create per-tag features for. Defaults to the notebook-level
        ``legal_tag`` list.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_last)``: all labelled months (empty frame when the
        customer has fewer than two months) and the single unlabelled row of
        the most recent month. Both are empty frames when ``ID`` has no rows.
    """
    if tags is None:
        tags = legal_tag

    # dt ascending, and within a month the largest txn_amt first, so the first
    # row of each month carries that month's top shop_tag.
    df_chid = df[df['chid'] == ID].sort_values(['dt', 'txn_amt'], ascending=[True, False])
    # groupby keeps the row order inside each group and yields months in ascending dt.
    months = [month for _, month in df_chid.groupby('dt', sort=True)]
    if not months:
        return pd.DataFrame(), pd.DataFrame()

    rows = []
    for idx, df_dt in enumerate(months):
        tag_amt = dict(zip(df_dt['shop_tag'].values, df_dt['txn_amt'].values))
        tag_cnt = dict(zip(df_dt['shop_tag'].values, df_dt['txn_cnt'].values))
        features = {}
        for tag in tags:
            features['tag_' + str(tag) + '_cnt'] = tag_cnt.get(tag, 0)
            features['tag_' + str(tag) + '_amt'] = tag_amt.get(tag, 0)

        # Collapse the month to one row. numeric_only=True drops non-numeric
        # columns such as shop_tag explicitly; recent pandas versions raise
        # instead of dropping them silently.
        row = df_dt.groupby('dt', as_index=False).mean(numeric_only=True)
        row = row.assign(**features)
        # txn_amt / txn_cnt carry the monthly totals rather than the means.
        row['txn_amt'] = df_dt['txn_amt'].sum()
        row['txn_cnt'] = df_dt['txn_cnt'].sum()
        if idx + 1 < len(months):
            # label: next month's largest shop tag
            row['label'] = months[idx + 1]['shop_tag'].iloc[0]
        rows.append(row)

    df_train = pd.DataFrame()
    if len(rows) > 1:
        df_train = pd.concat(rows[:-1]).reset_index(drop=True)

    return df_train, rows[-1]

In [224]:
def train_age_model(df_train, age, random_state=None):
    """Train an XGBoost classifier for one age group and pickle it.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Feature rows with a ``label`` column. The frame is not modified.
    age : int
        Age-group id; only used in log lines and in the output file name
        ``./model/xgb_age<age>.pkl``.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        split random on every call.

    Prints the label classes, the model classes, the top-1 accuracy and the
    top-3 hit rate on a 10% hold-out split. Returns ``None``.
    """
    print('--- START TRAINING AGE' + str(age) + ' ---')
    labelencoder = LabelEncoder()
    # Encode into a separate array so the caller's frame keeps its raw labels.
    y = labelencoder.fit_transform(df_train['label'])
    print(labelencoder.classes_)
    X = df_train.drop('label', axis='columns').fillna(-1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=random_state
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    # Reuse the statistics learned on the training split; re-fitting on the
    # hold-out set would scale it differently from what the model was trained on.
    X_test = scaler.transform(X_test)
    # score
    model = xgb.XGBClassifier()
    model.fit(X_train, y_train)

    # NOTE(review): only the model is pickled. The fitted scaler and label
    # encoder are discarded, so inference code has to recreate them.
    file_name = "./model/xgb_age" + str(age) + ".pkl"
    # save
    with open(file_name, "wb") as model_file:
        pickle.dump(model, model_file)

    proba = np.asarray(model.predict_proba(X_test))
    pred = model.predict(X_test)
    print(model.classes_)
    # Classes of the three highest-probability columns for every hold-out row.
    top3 = model.classes_[np.argsort(proba, axis=1)[:, -3:]]

    accuracy = accuracy_score(y_test, pred)
    print('Top1 accuracy: ',accuracy)
    top3_hits = (top3 == np.asarray(y_test)[:, None]).any(axis=1)
    print(int(top3_hits.sum())/len(y_test))

In [None]:
# Build (or reload) the per-age feature tables, then train one model per age group.
for age in range(1, 9):
    print('--- Age ' + str(age) + ' ---')
    if config['ReGen']:
        df_age = df2[df2['age'] == age]
        trains = []
        tests = []
        # groupby hands over each customer's rows directly instead of
        # re-scanning all of df_age once per customer.
        for ID, df_user in tqdm(df_age.groupby('chid')):
            try:
                train, test = gen_data(df_user, ID)
            except Exception:
                # Report the offending customer, then stop: carrying on would
                # save truncated train/test tables to ./save.
                print(ID)
                raise
            trains.append(train)
            tests.append(test)
        if not tests:
            # No customer in this age group: nothing to save or train on.
            print('--- NO DATA FOR AGE ' + str(age) + ' ---')
            continue
        df_train = pd.concat(trains).reset_index(drop=True)
        df_test = pd.concat(tests).reset_index(drop=True)
        df_train.to_pickle("./save/df_train" + str(age)+ ".pkl")
        df_test.to_pickle("./save/df_test" + str(age) + ".pkl")
    else:
        df_train = pd.read_pickle("./save/df_train" + str(age)+ ".pkl")
        df_test = pd.read_pickle("./save/df_test" + str(age)+ ".pkl")
        print('--- FINISHED LOADING DATA ---')
    df_train = reduce_mem_usage(df_train)
    print('TRAIN DATA SHAPE: ', df_train.shape)
    train_age_model(df_train, age)
    

In [214]:
def predict_answer(CSV=True):
    """Predict the top-3 shop tags for every customer in ``submit.csv``.

    Loads the eight pickled per-age models from ``model/`` and the per-age test
    tables from ``./save/``, scores every age group 1..9 (age 9 reuses the
    age-8 model) and builds a frame with columns ``chid, top1, top2, top3``.
    Customers without a prediction get the fixed fallback tags 15 / 48 / 37.

    Parameters
    ----------
    CSV : bool, default True
        When true, write the result to ``ans2.csv``.

    Returns
    -------
    pandas.DataFrame
        The submission frame (one row per ``chid`` in ``submit.csv``).
    """
    #load model
    print('START LOADING MODEL ...')
    models = {}
    for model_age in range(1, 9):
        # NOTE: unpickling can execute arbitrary code; only load model files
        # that were produced locally by this notebook.
        with open("model/xgb_age" + str(model_age) + ".pkl", "rb") as model_file:
            models[model_age] = pickle.load(model_file)
    print("FINISHED LOADING MODEL")

    #reverse features
    # Class index -> shop tag. Presumably mirrors the LabelEncoder class order
    # used at training time (the 16 string tags in sorted order).
    # NOTE(review): this is only correct if every per-age model was trained
    # with all 16 tags present; verify against the classes printed at training.
    en = ['10' ,'12', '13', '15', '18' ,'19' ,'2', '21', '22', '25' ,'26', '36', '37', '39', '48','6']
    ans = dict(zip(np.arange(16), en))
    sub = pd.read_csv('submit.csv')
    sub_id = list(sub.chid.values)
    ttl_test = pd.concat(
        [pd.read_pickle("./save/df_test" + str(i)+ ".pkl") for i in range(1, 9)]
    ).reset_index(drop = True)

    # predict answers
    CHID = []
    TOP = []

    for age in range(1, 10):
        test = ttl_test[ttl_test['age'] == age]
        if test.empty:
            # Nothing to score for this age; the scaler cannot be fit on zero rows.
            continue
        ID = list(test.chid.values)
        # There is no dedicated age-9 model; age 9 is scored with the age-8 one.
        model = models[min(age, 8)]
        # NOTE(review): the scaler is fit on the test rows themselves, not on
        # the training data, so features are scaled differently from training.
        scaler = StandardScaler()
        test = scaler.fit_transform(test)
        proba = np.asarray(model.predict_proba(test))
        # Decode with the classes of the model that produced the scores;
        # columns are ordered least to most likely of the top three.
        predict = model.classes_[np.argsort(proba, axis=1)[:, -3:]]
        print('Age' + str(age)+ '_test_shape: ',  predict.shape)
        CHID.extend(ID)
        TOP.extend(predict)

    # chid -> predicted tag, by rank (last of the three is the most likely)
    ID_t1 = dict(zip(CHID, [ans[e[2]] for e in TOP]))
    ID_t2 = dict(zip(CHID, [ans[e[1]] for e in TOP]))
    ID_t3 = dict(zip(CHID, [ans[e[0]] for e in TOP]))

    submit_final = pd.DataFrame({'chid':sub_id})

    #improve!!
    # Fallback tags for customers without a prediction, kept as strings so
    # each column holds a single type (the CSV text is the same either way).
    submit_final['top1'] = submit_final['chid'].apply(lambda t: ID_t1.get(t, '15'))
    submit_final['top2'] = submit_final['chid'].apply(lambda t: ID_t2.get(t, '48'))
    submit_final['top3'] = submit_final['chid'].apply(lambda t: ID_t3.get(t, '37'))
    #
    print('top1 data: ', len(submit_final['top1'].values))
    print('top2 data: ', len(submit_final['top2'].values))
    print('top3 data: ', len(submit_final['top3'].values))
    if CSV:
        submit_final.to_csv('ans2.csv', index = False)
    print("----FINISHED GOOD LUCK----")
    return submit_final

In [215]:
# Score every age group and, because CSV=True, write the submission to ans2.csv.
predict_answer(CSV=True)

START LOADING MODEL ...
FINISHED LOADING MODEL
Age1_test_shape:  (434, 3)
Age2_test_shape:  (74260, 3)
Age3_test_shape:  (138779, 3)
Age4_test_shape:  (130307, 3)
Age5_test_shape:  (92205, 3)
Age6_test_shape:  (45171, 3)
Age7_test_shape:  (9051, 3)
Age8_test_shape:  (889, 3)
Age9_test_shape:  (29, 3)
top1 data:  500000
top2 data:  500000
top3 data:  500000
----FINISHED GOOD LUCK----


In [208]:
# Spot-check the predicted tags for a single customer id.
# NOTE(review): in the cells shown, `submit_final` is only created inside
# predict_answer(), so this lookup relies on a notebook-level variable left over
# from earlier kernel state; confirm it still works after Restart & Run All.
submit_final[submit_final['chid'] == 10128239]

Unnamed: 0,chid,top1,top2,top3
0,10128239,10,2,25
