# 0. Preprocessing

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
import math

df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

dep_ls = df['DepartmentDescription'].unique()

# re-categorize dept
traincolumn = list(df["DepartmentDescription"].unique())
WEARls = [x for x in traincolumn if 'WEAR' in str(x) or 'SOCKS' in str(x) or 'SHOES' in str(x)
          or 'MATERNITY' in str(x)]
FOODls = [x for x in traincolumn if 'FOOD' in str(x) or 'MEAT' in str(x) or 'DAIRY' in str(x) or 'GROCERY' in str(x)
          or 'PRODUCE' in str(x) or "BREAD" in str(x) or 'BAKERY' in str(x) or 'WINE' in str(x) or "DELI" in str(x)
          or 'COOKIE' in str(x) or 'COOK' in str(x)]
ACCls = [x for x in traincolumn if 'SUNGLASS' in str(x) or 'OPTICAL' in str(x)]
ELECls = [x for x in traincolumn if 'ELECTRONICS' in str(x) or 'CAMERAS' in str(x) or 'MEDIA' in str(x)
          or 'WIRELESS' in str(x) or 'HARDWARE' in str(x)]
COSls = [x for x in traincolumn if 'PERSONAL' in str(x) or 'BEAUTY' in str(x) or 'PHARMACY' in str(x)
         or 'BATH' in str(x)]
HOUSEls = [x for x in traincolumn if 'PAPER' in str(x) or 'HOME' in str(x) or "BEDDING" in str(x)
           or 'HOUSE' in str(x) or 'CELE' in str(x) or 'OFFICE' in str(x)]
GARDENls = [x for x in traincolumn if 'GARDEN' in str(x) or 'HORTI' in str(x)]
INFANls = [x for x in traincolumn if 'INFANT' in str(x)]

traindf = df.copy()
testdf = test.copy()

traindf.loc[traindf['DepartmentDescription'].isin(INFANls), "DD_big"] = "INFAN"
traindf.loc[traindf['DepartmentDescription'].isin(GARDENls), "DD_big"] = "GARDEN"
traindf.loc[traindf['DepartmentDescription'].isin(HOUSEls), "DD_big"] = "HOUSE"
traindf.loc[traindf['DepartmentDescription'].isin(COSls), "DD_big"] = "COS"
traindf.loc[traindf['DepartmentDescription'].isin(ELECls), "DD_big"] = "ELEC"
traindf.loc[traindf['DepartmentDescription'].isin(ACCls), "DD_big"] = "ACC"
traindf.loc[traindf['DepartmentDescription'].isin(FOODls), "DD_big"] = "FOOD"
traindf.loc[traindf['DepartmentDescription'].isin(WEARls), "DD_big"] = "WEAR"

testdf.loc[testdf['DepartmentDescription'].isin(INFANls), "DD_big"] = "INFAN"
testdf.loc[testdf['DepartmentDescription'].isin(GARDENls), "DD_big"] = "GARDEN"
testdf.loc[testdf['DepartmentDescription'].isin(HOUSEls), "DD_big"] = "HOUSE"
testdf.loc[testdf['DepartmentDescription'].isin(COSls), "DD_big"] = "COS"
testdf.loc[testdf['DepartmentDescription'].isin(ELECls), "DD_big"] = "ELEC"
testdf.loc[testdf['DepartmentDescription'].isin(ACCls), "DD_big"] = "ACC"
testdf.loc[testdf['DepartmentDescription'].isin(FOODls), "DD_big"] = "FOOD"
testdf.loc[testdf['DepartmentDescription'].isin(WEARls), "DD_big"] = "WEAR"

# re-categorize FinelineNumber
traindf['dept_fine'] = traindf.apply(lambda x: str(x['DepartmentDescription']) + '_' + str(x['FinelineNumber']), axis=1)
testdf['dept_fine'] = testdf.apply(lambda x: str(x['DepartmentDescription']) + '_' + str(x['FinelineNumber']), axis=1)

df_fine = traindf.groupby(['TripType', 'dept_fine'], as_index=False).agg(sum)
df_fine = df_fine.pivot(index='dept_fine', columns='TripType', values='ScanCount')

# 1. Weight1: based on dept. and FinelineNumber

In [8]:
df_weight = pd.DataFrame(index=df_fine.index, columns=df_fine.columns)
for i in range(len(df_fine.columns)):
    df_weight.iloc[:, i] = df_fine.iloc[:, i] / np.sum(abs(df_fine.iloc[:, i]))

- X axis: TripType
- Y axis: dept and FinelineNumber

In [9]:
df_weight.fillna(0)

TripType,3,4,5,6,7,8,9,12,14,15,...,36,37,38,39,40,41,42,43,44,999
dept_fine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-HR PHOTO_110.0,0.000137,0.000000,0.000000,0.000000,0.000037,0.000000,0.001351,0.000000,0.0,0.000000,...,0.000122,0.000000,0.000029,0.000000,0.000000,0.000000,0.000092,0.000143,0.000000,0.000000
1-HR PHOTO_120.0,0.000000,0.000000,0.000064,0.000000,0.000000,0.000000,0.001351,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000138,0.000000,0.000000,0.000000
1-HR PHOTO_130.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000117,0.000000,0.0,0.000222,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000046,0.000000,0.000000,0.000000
1-HR PHOTO_141.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1-HR PHOTO_150.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000999,0.000000,0.0,0.000111,...,0.000286,0.000000,0.000115,0.000000,0.000005,0.000165,0.000046,0.000000,0.000000,-0.000249
1-HR PHOTO_160.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000117,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1-HR PHOTO_170.0,0.000137,0.000000,0.000000,0.000000,0.000000,0.000000,0.000646,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1-HR PHOTO_180.0,0.000000,0.000000,0.000000,0.001265,0.000000,0.000000,0.000176,0.000000,0.0,0.000111,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.002493
1-HR PHOTO_190.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000059,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000046,0.000000,0.000000,0.000000
1-HR PHOTO_8021.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000411,0.000000,0.0,0.000222,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000997


##  1.1. Apply weight to each VisitNumber
### step1: join

In [11]:
train_val = traindf.groupby(['VisitNumber', 'dept_fine'], as_index=False).agg(sum)
#train_val.pivot(index='VisitNumber', columns='dept_fine', values='ScanCount')
train_val = train_val[['VisitNumber', 'dept_fine', 'TripType', 'ScanCount']]
train_join = train_val.join(df_weight, on='dept_fine')
train_join.head(1)

Unnamed: 0,VisitNumber,dept_fine,TripType,ScanCount,3,4,5,6,7,8,...,36,37,38,39,40,41,42,43,44,999
0,5,FINANCIAL SERVICES_1000.0,999,-1,0.063976,,,,,,...,,,,,,,,,,0.006979


### step2: multiply `ScanCount`

In [14]:
for i in range(len(df_fine.columns)):
    train_join.iloc[:, 4+i] = train_join['ScanCount'] * train_join.iloc[:, 4+i]
train_join=train_join.fillna(0)
train_join.head(1)

Unnamed: 0,VisitNumber,dept_fine,TripType,ScanCount,3,4,5,6,7,8,...,36,37,38,39,40,41,42,43,44,999
0,5,FINANCIAL SERVICES_1000.0,999,-1,-0.063976,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.006979


### step3: join TripType(label)

In [161]:
temp = df[['VisitNumber', 'TripType']].drop_duplicates()
temp.tail()

Unnamed: 0,VisitNumber,TripType
647010,191343,25
647017,191344,22
647022,191345,39
647035,191346,39
647052,191347,8


### Train data

In [178]:
train_join2 = train_join.groupby('VisitNumber').agg(sum).drop(columns=['ScanCount', 'TripType'])
train_join2 = train_join2.reset_index()
train_join2 = train_join2.merge(temp, on='VisitNumber', how='left')
train_join2.tail()

Unnamed: 0,VisitNumber,3,4,5,6,7,8,9,12,14,...,37,38,39,40,41,42,43,44,999,TripType
95669,191343,0.00151,0.013474,0.00874,0.009611,0.007095,0.035698,0.013275,0.003603,0.012048,...,0.002258,0.004227,0.006251,0.003176,0.013225,0.008975,0.009712,0.006526,0.001828,25
95670,191344,0.000412,0.0,0.000193,0.0,0.0,0.002037,0.000352,0.0,0.0,...,0.0,8.6e-05,0.000395,0.000307,0.000165,0.00069,0.0,0.000816,-0.000914,22
95671,191345,0.000686,0.0,0.002057,0.002276,0.01249,0.00781,0.000646,0.004003,0.0,...,0.007326,0.016592,0.011261,0.013599,0.004133,0.002163,0.002714,0.004165,-0.000415,39
95672,191346,0.0,0.14052,0.007134,0.00784,0.022689,0.008404,0.001762,0.024019,0.0,...,0.0226,0.027434,0.017286,0.02075,0.004298,0.003314,0.008283,0.0076,0.000914,39
95673,191347,0.0,0.0,0.001028,0.000253,0.001589,0.003353,0.000235,0.002802,0.0,...,0.003479,0.019065,0.003215,0.005085,0.001323,0.000736,0.001571,0.001889,-0.000166,8


In [121]:
X = train_join2.drop(columns=['TripType', 'VisitNumber'])
Y = train_join2['TripType'].reset_index(drop=True)

xt1, xt2, yt1, yt2 = train_test_split(X, Y)
estimator = XGBClassifier(silent=False,max_depth=3, n_jobs=-1)
estimator.fit(xt1,yt1,eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True,)

In [131]:
from sklearn.metrics import classification_report
print(classification_report(estimator.predict(xt2), yt2))

              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00        15
           6       0.00      0.00      0.00         4
           7       0.00      0.03      0.01        29
           8       0.82      0.13      0.22      2916
           9       0.03      0.12      0.05        99
          15       0.00      0.00      0.00         0
          18       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         0
          20       0.00      0.00      0.00         0
          21       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         2
          23       0.00      0.00      0.00         0
          24       0.00      0.00      0.00         4
          25       0.01      0.09      0.01        11
          26       0.00      0.00      0.00         1
          27       0.00    

## 1.2 Add more info
### Dept info

In [221]:
df_dept = traindf.pivot_table(index='VisitNumber', columns='DepartmentDescription', values='ScanCount').fillna(0)
df_dept.head()

DepartmentDescription,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


- join with `dept_fine` weighted df

### Train data

In [224]:
train_join3 = train_join2.merge(df_dept, on='VisitNumber')
train_join3.head(3)

Unnamed: 0,VisitNumber,3,4,5,6,7,8,9,12,14,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
0,5,-0.063976,0.0,0.0,0.0,0.0,0.0,-0.003524,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,0.0,0.0,6.4e-05,0.0,3.7e-05,0.00017,5.9e-05,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,8,0.000824,0.020212,0.005848,0.009105,0.006319,0.009593,0.011689,0.006005,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [259]:
X = train_join3.drop(columns=['TripType', 'VisitNumber'])
Y = train_join3['TripType'].reset_index(drop=True)

In [262]:
xt1, xt2, yt1, yt2 = train_test_split(X, Y)
estimator = XGBClassifier(silent=False,max_depth=3, n_jobs=-1, objective='multi:softmax')
estimator.fit(xt1,yt1, eval_metric='mlogloss', eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True)

[0]	validation_0-merror:0.461884	validation_1-merror:0.470958
[1]	validation_0-merror:0.440164	validation_1-merror:0.450982
[2]	validation_0-merror:0.429261	validation_1-merror:0.441769
[3]	validation_0-merror:0.417284	validation_1-merror:0.430629
[4]	validation_0-merror:0.412831	validation_1-merror:0.426358
[5]	validation_0-merror:0.406061	validation_1-merror:0.418317
[6]	validation_0-merror:0.403046	validation_1-merror:0.414423
[7]	validation_0-merror:0.399919	validation_1-merror:0.411826
[8]	validation_0-merror:0.398244	validation_1-merror:0.410193
[9]	validation_0-merror:0.396639	validation_1-merror:0.407094
[10]	validation_0-merror:0.393819	validation_1-merror:0.406173
[11]	validation_0-merror:0.391306	validation_1-merror:0.403032
[12]	validation_0-merror:0.389184	validation_1-merror:0.401943
[13]	validation_0-merror:0.387858	validation_1-merror:0.399724
[14]	validation_0-merror:0.386448	validation_1-merror:0.397965
[15]	validation_0-merror:0.384689	validation_1-merror:0.396415
[1

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)

## 1.3 Comparison: dept + dept_fine_weight vs dept
- start point: 0.46 vs 0.60
- showing weight effective

In [270]:
df_dept = df_dept.reset_index()
df_dept = df_dept.merge(temp)

X = df_dept.drop(columns=['TripType', 'VisitNumber'])
Y = df_dept['TripType'].reset_index(drop=True)

In [275]:
xt1, xt2, yt1, yt2 = train_test_split(X, Y)
estimator = XGBClassifier(silent=False,max_depth=3,\
                          n_jobs=-1, objective='multi:softmax',\
                         n_estimators=1)
# estimator.fit(xt1,yt1, eval_metric='mlogloss', eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True)
estimator.fit(xt1,yt1,eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True)

[0]	validation_0-merror:0.602566	validation_1-merror:0.604632


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)

## 1.4 Add more info: more broad dept. category
- WEARls, ACCls, FOODls, ELECls, COSls, HOUSEls, GARDENls, INFANls

### Train data

In [286]:
traindf['dept'] = traindf.apply(lambda x: x['DD_big'] if x['DD_big'] in check else 'else', axis=1)
df_dept2 = traindf.pivot_table(index='VisitNumber', columns='dept', values='ScanCount').fillna(0)
train_join4 = train_join3.merge(df_dept2, on='VisitNumber')
train_join4.tail()

Unnamed: 0,VisitNumber,3,4,5,6,7,8,9,12,14,...,WIRELESS,ACC,COS,ELEC,FOOD,GARDEN,HOUSE,INFAN,WEAR,else
95511,191343,0.00151,0.013474,0.00874,0.009611,0.007095,0.035698,0.013275,0.003603,0.012048,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.2,1.0
95512,191344,0.000412,0.0,0.000193,0.0,0.0,0.002037,0.000352,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
95513,191345,0.000686,0.0,0.002057,0.002276,0.01249,0.00781,0.000646,0.004003,0.0,...,0.0,0.0,1.5,1.0,1.428571,0.0,1.0,0.0,0.0,0.0
95514,191346,0.0,0.14052,0.007134,0.00784,0.022689,0.008404,0.001762,0.024019,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
95515,191347,0.0,0.0,0.001028,0.000253,0.001589,0.003353,0.000235,0.002802,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [287]:
X = train_join4.drop(columns=['TripType', 'VisitNumber'])
Y = train_join4['TripType'].reset_index(drop=True)

In [288]:
xt1, xt2, yt1, yt2 = train_test_split(X, Y)
estimator = XGBClassifier(silent=False,max_depth=3,\
                          n_jobs=-1, objective='multi:softmax',\
                         n_estimators=100)
# estimator.fit(xt1,yt1, eval_metric='mlogloss', eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True)
estimator.fit(xt1,yt1,eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True)

[0]	validation_0-merror:0.462094	validation_1-merror:0.47435
[1]	validation_0-merror:0.43719	validation_1-merror:0.44893
[2]	validation_0-merror:0.422798	validation_1-merror:0.433896
[3]	validation_0-merror:0.41614	validation_1-merror:0.428787
[4]	validation_0-merror:0.41057	validation_1-merror:0.422673
[5]	validation_0-merror:0.40507	validation_1-merror:0.415512
[6]	validation_0-merror:0.401985	validation_1-merror:0.412831
[7]	validation_0-merror:0.397853	validation_1-merror:0.409732
[8]	validation_0-merror:0.395159	validation_1-merror:0.407555
[9]	validation_0-merror:0.392172	validation_1-merror:0.405084
[10]	validation_0-merror:0.389505	validation_1-merror:0.402906
[11]	validation_0-merror:0.386797	validation_1-merror:0.401273
[12]	validation_0-merror:0.384243	validation_1-merror:0.398007
[13]	validation_0-merror:0.382428	validation_1-merror:0.395494
[14]	validation_0-merror:0.379664	validation_1-merror:0.393274
[15]	validation_0-merror:0.376928	validation_1-merror:0.391725
[16]	val

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)

In [301]:
print(classification_report(estimator.predict(xt2), yt2))

              precision    recall  f1-score   support

           3       0.98      0.88      0.93       998
           4       0.81      0.63      0.70        99
           5       0.82      0.82      0.82      1173
           6       0.84      0.77      0.80       368
           7       0.74      0.71      0.72      1495
           8       0.83      0.72      0.77      3457
           9       0.68      0.73      0.70      2193
          12       0.31      0.38      0.34        53
          15       0.53      0.50      0.51       244
          18       0.49      0.55      0.52       116
          19       0.74      0.64      0.69        86
          20       0.66      0.61      0.63       181
          21       0.51      0.62      0.56       130
          22       0.61      0.59      0.60       246
          23       0.89      0.77      0.83        22
          24       0.61      0.58      0.60       680
          25       0.74      0.53      0.62      1307
          26       0.53    

---

# 2. weight2: based on upc

In [289]:
df_upc = df.pivot_table(columns='TripType', index='Upc', values='ScanCount', aggfunc=sum,)
df_upc.tail()

TripType,3,4,5,6,7,8,9,12,14,15,...,36,37,38,39,40,41,42,43,44,999
Upc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
978276400000.0,,,,,,,,,,,...,,,,,,,1.0,,,
978968200000.0,,,,,,,,,,,...,,,,,,,,,,
978970600000.0,,,,,,,1.0,,,,...,,,,,1.0,,,,,
978970600000.0,,,,,,,,,,,...,,,,,,,1.0,,,
978970700000.0,,,,,,,,,,,...,,,,,,,,,,


### step1: make upc weight for each TripType

In [290]:
df_weight2 = pd.DataFrame(index=df_upc.index, columns=df_upc.columns)
for i in range(len(df_upc.columns)):
    df_weight2.iloc[:, i] = df_upc.iloc[:, i] / np.sum(abs(df_upc.iloc[:, i]))
df_weight2 = df_weight2.fillna(0)
df_weight2 = df_weight2.reset_index()
df_weight2.tail()

TripType,Upc,3,4,5,6,7,8,9,12,14,...,36,37,38,39,40,41,42,43,44,999
97709,978276400000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.6e-05,0.0,0.0,0.0
97710,978968200000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97711,978970600000.0,0.0,0.0,0.0,0.0,0.0,0.0,5.9e-05,0.0,0.0,...,0.0,0.0,0.0,0.0,5e-06,0.0,0.0,0.0,0.0,0.0
97712,978970600000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.6e-05,0.0,0.0,0.0
97713,978970700000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### step2: apply weight to train data

In [291]:
temp2 = df[['VisitNumber', 'Upc','ScanCount']]
df_upc_weight = temp2.merge(df_weight2, on='Upc', how='left')
df_upc_weight.tail()

for i in range(len(df_weight2.columns)-1):
    df_upc_weight.iloc[:, 3+i] = df_upc_weight['ScanCount'] * df_upc_weight.iloc[:, 3+i]
    
df_upc_weight = df_upc_weight.groupby('VisitNumber').agg(sum).drop(columns=['ScanCount', 'Upc'])
df_upc_weight.tail()

Unnamed: 0_level_0,3,4,5,6,7,8,9,12,14,15,...,36,37,38,39,40,41,42,43,44,999
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
191343,0.000137,0.005803,0.004233,0.00329,0.001849,0.013119,0.003956,0.001203,0.0,0.002339,...,0.002574,0.000761,0.001237,0.002033,0.001146,0.00481,0.003138,0.004021,0.002842,0.000761
191344,0.0,0.0,0.0,0.0,0.0,0.000128,0.0,0.0,0.0,0.0,...,8.2e-05,0.0,2.9e-05,1.9e-05,1e-05,0.0,4.6e-05,0.0,0.0,0.0
191345,0.0,0.0,0.000308,0.000253,0.001775,0.001661,0.000118,0.000802,0.0,0.000557,...,0.000695,0.001592,0.002676,0.002353,0.002389,0.000498,0.000277,0.000718,0.000732,0.0
191346,0.0,0.004836,0.000539,0.001266,0.002478,0.000469,0.000177,0.003208,0.0,0.000445,...,0.000409,0.004521,0.002014,0.001383,0.002174,0.000498,0.000508,0.001005,0.000646,0.000207
191347,0.0,0.0,7.7e-05,0.0,0.000259,0.000213,0.0,0.000802,0.0,0.0,...,0.0,6.9e-05,0.000748,0.000104,0.00018,0.0,9.2e-05,0.000144,0.0,6.9e-05


### Train data

In [300]:
train_join5 = train_join4.merge(df_upc_weight.reset_index(), on='VisitNumber')
train_join5 = train_join5.merge(temp)
train_join5.tail()

Unnamed: 0,VisitNumber,3_x,4_x,5_x,6_x,7_x,8_x,9_x,12_x,14_x,...,36_y,37_y,38_y,39_y,40_y,41_y,42_y,43_y,44_y,999_y
95511,191343,0.00151,0.013474,0.00874,0.009611,0.007095,0.035698,0.013275,0.003603,0.012048,...,0.002574,0.000761,0.001237,0.002033,0.001146,0.00481,0.003138,0.004021,0.002842,0.000761
95512,191344,0.000412,0.0,0.000193,0.0,0.0,0.002037,0.000352,0.0,0.0,...,8.2e-05,0.0,2.9e-05,1.9e-05,1e-05,0.0,4.6e-05,0.0,0.0,0.0
95513,191345,0.000686,0.0,0.002057,0.002276,0.01249,0.00781,0.000646,0.004003,0.0,...,0.000695,0.001592,0.002676,0.002353,0.002389,0.000498,0.000277,0.000718,0.000732,0.0
95514,191346,0.0,0.14052,0.007134,0.00784,0.022689,0.008404,0.001762,0.024019,0.0,...,0.000409,0.004521,0.002014,0.001383,0.002174,0.000498,0.000508,0.001005,0.000646,0.000207
95515,191347,0.0,0.0,0.001028,0.000253,0.001589,0.003353,0.000235,0.002802,0.0,...,0.0,6.9e-05,0.000748,0.000104,0.00018,0.0,9.2e-05,0.000144,0.0,6.9e-05


In [302]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X = train_join5.drop(columns=['TripType', 'VisitNumber'])
Y = train_join5['TripType'].reset_index(drop=True)

In [306]:
xt1, xt2, yt1, yt2 = train_test_split(X, Y)
estimator = XGBClassifier(silent=False,max_depth=3,\
                          n_jobs=-1, objective='multi:softmax',\
                         n_estimators=100)
# estimator.fit(xt1,yt1, eval_metric='mlogloss', eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True)
estimator.fit(xt1,yt1,eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True)

[0]	validation_0-merror:0.42485	validation_1-merror:0.426819
[1]	validation_0-merror:0.384187	validation_1-merror:0.388123
[2]	validation_0-merror:0.375239	validation_1-merror:0.37958
[3]	validation_0-merror:0.365244	validation_1-merror:0.368608
[4]	validation_0-merror:0.353798	validation_1-merror:0.359186
[5]	validation_0-merror:0.346357	validation_1-merror:0.353532
[6]	validation_0-merror:0.342658	validation_1-merror:0.350727
[7]	validation_0-merror:0.338987	validation_1-merror:0.346162
[8]	validation_0-merror:0.335804	validation_1-merror:0.342812
[9]	validation_0-merror:0.33325	validation_1-merror:0.340676
[10]	validation_0-merror:0.330081	validation_1-merror:0.339043
[11]	validation_0-merror:0.328224	validation_1-merror:0.335986
[12]	validation_0-merror:0.324818	validation_1-merror:0.33452
[13]	validation_0-merror:0.322012	validation_1-merror:0.331421
[14]	validation_0-merror:0.318899	validation_1-merror:0.328615
[15]	validation_0-merror:0.316764	validation_1-merror:0.32694
[16]	va

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)

# 3. weight3: upc info after PCA

In [310]:
from sklearn.decomposition import PCA
pca = PCA()
df_low = pca.fit_transform(df_fine.T.fillna(0))
df_inverse = pca.inverse_transform(df_low)

In [314]:
cols = df_fine.T.columns
idx = df_fine.T.index
df_inverse = pd.DataFrame(df_inverse, index=idx, columns=cols)
df_inverse = df_inverse.T

### step1: de-noise upc by PCA

In [319]:
df_weight3 = pd.DataFrame(index=df_inverse.index, columns=df_inverse.columns)
for i in range(len(df_inverse.columns)):
    df_weight3.iloc[:, i] = df_inverse.iloc[:, i] / np.sum(abs(df_inverse.iloc[:, i]))
df_weight3 = df_weight3.reset_index()

TripType,3,4,5,6,7,8,9,12,14,15,...,36,37,38,39,40,41,42,43,44,999
dept_fine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
WIRELESS_965.0,0.0001372872,-1.666937e-17,-2.568639e-19,-1.123139e-19,-2.46151e-19,3.5815169999999995e-19,0.0004699248,3.644447e-18,2.006427e-16,0.0001110864,...,-1.992894e-19,4.607552e-05,1.187643e-18,9.39991e-06,4.870778e-06,1.982213e-18,0.0001841027,0.0001428163,-1.048684e-18,-8.3e-05
WIRELESS_970.0,-1.066936e-18,-1.389114e-17,-3.5675549999999996e-19,-1.29161e-18,-3.1999629999999995e-19,1.8850089999999999e-19,0.000293703,1.422223e-18,1.36437e-16,0.0001110864,...,-3.261099e-19,-3.5807869999999996e-19,8.173028999999999e-19,-4.0700389999999997e-19,-3.460896e-19,1.211353e-18,-1.062854e-18,5.073856e-19,-8.198805e-19,-8.3e-05
WIRELESS_990.0,-3.124598e-19,-1.869962e-18,7.135109e-20,-1.9654929999999997e-19,-4.512768e-20,7.540035e-20,1.304303e-19,1.1111119999999999e-19,2.808998e-17,-1.849961e-20,...,-1.2002659999999998e-19,0.0,1.612258e-19,-7.357377e-20,-4.2179669999999995e-20,3.028382e-19,-2.8615309999999997e-19,6.342319e-20,-2.3595399999999996e-19,-0.000249
WIRELESS_9998.0,-3.9019370000000004e-18,-7.180653000000001e-17,6.426735e-05,-8.985113e-19,0.0001108566,4.244662e-05,0.001879699,1.4222230000000002e-17,9.63085e-16,0.0001110864,...,4.079634e-05,2.303776e-05,2.875629e-05,3.759964e-05,2.922467e-05,9.690821e-18,0.0006903852,4.059084e-18,0.0001717402,-8.3e-05
nan_nan,0.000686436,0.0009624639,0.001028278,0.0007587253,0.0009238046,0.003225943,0.00487547,0.001601281,-2.568227e-16,0.002443901,...,0.001060705,0.001221001,0.0008339324,0.001109189,0.0009351894,0.003141015,0.002577438,0.002999143,0.002619037,-0.003406


In [341]:
df_weight3_add = traindf.merge(df_weight3, on='dept_fine').drop(columns=['Weekday', 'Upc', 'FinelineNumber', 'DD_big', 'DepartmentDescription', 'dept', 'TripType'])
df_weight3_add.tail()

Unnamed: 0,VisitNumber,ScanCount,dept_fine,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,999
647049,190366,1,INFANT APPAREL_6806.0,2.381553e-20,6.444689999999999e-19,1.114861e-21,-8.774524e-20,9.371686e-20,3.696384e-20,4.075948e-20,...,-3.2695909999999996e-20,1.230896e-20,1.1772669999999999e-20,-4.207011e-21,-5.999112e-21,1.4912479999999998e-20,7.8245e-21,0.0001428163,3.098385e-20,-2.046616e-20
647050,190540,1,COOK AND DINE_137.0,-6.715981e-20,-5.810237999999999e-19,-2.697963e-20,1.1143649999999999e-19,1.923055e-21,-9.277777e-21,-1.222784e-21,...,-1.4154079999999998e-20,2.078135e-20,1.9055769999999998e-20,9.39991e-06,2.517937e-21,-1.0897589999999999e-20,1.117786e-21,-1.981975e-21,2.085452e-21,3.5743719999999996e-20
647051,190822,1,OPTICAL - FRAMES_4005.0,-6.715981e-20,-5.810237999999999e-19,-2.697963e-20,1.1143649999999999e-19,1.923055e-21,-9.277777e-21,-1.222784e-21,...,-1.4154079999999998e-20,2.078135e-20,1.9055769999999998e-20,9.39991e-06,2.517937e-21,-1.0897589999999999e-20,1.117786e-21,-1.981975e-21,2.085452e-21,3.5743719999999996e-20
647052,190858,1,HARDWARE_1904.0,1.428932e-21,1.4024709999999999e-19,-6.689165e-21,1.8075519999999998e-19,-8.333237e-21,-1.075044e-20,-2.873543e-20,...,-3.6800599999999996e-21,-2.717562e-21,-3.691432e-21,-3.587374e-22,-4.376817e-21,3.269276e-20,4.602568e-05,-8.423393e-21,-4.036839e-20,1.398041e-19
647053,191124,1,AUTOMOTIVE_7130.0,9.526213e-21,-3.673139e-19,-2.229722e-21,8.248052999999999e-20,8.46144e-21,-1.6346559999999998e-20,-5.604428999999999e-20,...,6.836418e-20,4.4839769999999996e-20,-4.589348e-21,5.413673e-21,-1.757486e-21,-5.62086e-20,4.3433959999999997e-20,2.923413e-20,-4.60289e-20,-3.170814e-20


### Train data

In [356]:
for i in range(len(df_weight3.columns)-1):
    df_weight3_add.iloc[:, 3+i] = df_weight3_add['ScanCount'] * df_weight3_add.iloc[:, 3+i]
df_weight3_add = df_weight3_add.groupby('VisitNumber').agg(sum).drop(columns='ScanCount')
df_weight3_add = df_weight3_add.reset_index()
df_weight3_add = df_weight3_add.merge(temp)
df_weight3_add.tail()

Unnamed: 0,VisitNumber,3,4,5,6,7,8,9,12,14,...,37,38,39,40,41,42,43,44,999,TripType
95669,191343,0.001510159,0.01347449,0.00874,0.009610521,0.007094819,0.035698,0.013275,0.003602882,0.01204819,...,0.0022577,0.004227,0.006251,0.003176,0.013225,0.008975,0.009711511,0.006526,0.001828,25
95670,191344,0.0004118616,-5.150409e-17,0.000193,-1.432002e-17,5.825573e-19,0.002037,0.000352,-6.222226999999999e-19,-2.916007e-16,...,-2.248223e-18,8.6e-05,0.000395,0.000307,0.000165,0.00069,-3.519987e-18,0.000816,-0.000914,22
95671,191345,0.000686436,2.105043e-17,0.002057,0.002276176,0.01248984,0.00781,0.000646,0.004003203,-1.825849e-15,...,0.007326007,0.016592,0.011261,0.013599,0.004133,0.002163,0.00271351,0.004165,-0.000415,39
95672,191346,-5.101783e-16,0.1405197,0.007134,0.007840162,0.02268864,0.008404,0.001762,0.02401922,9.363327e-16,...,0.02260004,0.027434,0.017286,0.02075,0.004298,0.003314,0.008283348,0.0076,0.000914,39
95673,191347,-1.580894e-16,-8.548397e-18,0.001028,0.0002529084,0.001588944,0.003353,0.000235,0.002802242,-1.123599e-16,...,0.003478702,0.019065,0.003215,0.005085,0.001323,0.000736,0.00157098,0.001889,-0.000166,8


### Only use `upc_pca_weight feature`
- Models above heat **0.42485** with 115 features at the beginning
- But `upc_pca_weight feature` heat **0.487757** at the beginning

In [363]:
X = df_weight3_add.drop(columns=['TripType', 'VisitNumber'])
Y = df_weight3_add['TripType'].reset_index(drop=True)

xt1, xt2, yt1, yt2 = train_test_split(X, Y)
estimator = XGBClassifier(silent=False,max_depth=3, n_jobs=-1, n_estimators=1)
estimator.fit(xt1,yt1,eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True,)

[0]	validation_0-merror:0.487757	validation_1-merror:0.49237


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=1,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)

# 3. Get all kinds of weights together
### Metric: log_loss

In [366]:
train_join6 = train_join5.merge(df_weight3_add)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X = train_join6.drop(columns=['TripType', 'VisitNumber'])
Y = train_join6['TripType'].reset_index(drop=True)

xt1, xt2, yt1, yt2 = train_test_split(X, Y)

In [379]:
estimator = XGBClassifier(silent=False,max_depth=3,\
                          n_jobs=-1, objective='multi:softmax',\
                         n_estimators=100)
estimator.fit(xt1,yt1, eval_metric='mlogloss', eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True)
#estimator.fit(xt1,yt1,eval_set=[(xt1, yt1), (xt2, yt2)],verbose=True)

[0]	validation_0-mlogloss:2.92232	validation_1-mlogloss:2.93529
[1]	validation_0-mlogloss:2.63613	validation_1-mlogloss:2.65404
[2]	validation_0-mlogloss:2.43081	validation_1-mlogloss:2.45231
[3]	validation_0-mlogloss:2.26975	validation_1-mlogloss:2.29288
[4]	validation_0-mlogloss:2.13933	validation_1-mlogloss:2.16471
[5]	validation_0-mlogloss:2.02508	validation_1-mlogloss:2.05194
[6]	validation_0-mlogloss:1.92784	validation_1-mlogloss:1.95637
[7]	validation_0-mlogloss:1.83967	validation_1-mlogloss:1.86961
[8]	validation_0-mlogloss:1.76208	validation_1-mlogloss:1.79355
[9]	validation_0-mlogloss:1.69423	validation_1-mlogloss:1.72683
[10]	validation_0-mlogloss:1.63261	validation_1-mlogloss:1.66596
[11]	validation_0-mlogloss:1.57726	validation_1-mlogloss:1.61144
[12]	validation_0-mlogloss:1.52636	validation_1-mlogloss:1.56156
[13]	validation_0-mlogloss:1.47994	validation_1-mlogloss:1.5159
[14]	validation_0-mlogloss:1.43787	validation_1-mlogloss:1.47418
[15]	validation_0-mlogloss:1.39758	v

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=-1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1)