In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from joblib import Parallel, delayed
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import *


__AUTHOR__ = 'Kirgsn'

class Reducer:
    """
    Class that takes a dict of increasingly big numpy datatypes to transform
    the data of a pandas dataframe to in order to save memory usage.
    """
    memory_scale_factor = 1024**2  # memory in MB

    def __init__(self, conv_table=None):
        """
        :param conv_table: dict with np.dtypes-strings as keys
        """
        if conv_table is None:
            self.conversion_table = \
                {'int': [np.int8, np.int16, np.int32, np.int64],
                 'uint': [np.uint8, np.uint16, np.uint32, np.uint64],
                 'float': [np.float16, np.float32, ]}
        else:
            self.conversion_table = conv_table

    def _type_candidates(self, k):
        for c in self.conversion_table[k]:
            i = np.iinfo(c) if 'int' in k else np.finfo(c)
            yield c, i

    def reduce(self, df, verbose=False):
        """Takes a dataframe and returns it with all data transformed to the
        smallest necessary types.

        :param df: pandas dataframe
        :param verbose: If True, outputs more information
        :return: pandas dataframe with reduced data types
        """
        ret_list = Parallel(n_jobs=-1)(delayed(self._reduce)
                                                (df[c], c, verbose) for c in
                                                df.columns)

        return pd.concat(ret_list, axis=1)

    def _reduce(self, s, colname, verbose):

        # skip NaNs
        if s.isnull().any():
            if verbose:
                print(colname, 'has NaNs - Skip..')
            return s

        # detect kind of type
        coltype = s.dtype
        if np.issubdtype(coltype, np.integer):
            conv_key = 'int' if s.min() < 0 else 'uint'
        elif np.issubdtype(coltype, np.floating):
            conv_key = 'float'
        else:
            if verbose:
                print(colname, 'is', coltype, '- Skip..')
            print(colname, 'is', coltype, '- Skip..')
            return s

        # find right candidate
        for cand, cand_info in self._type_candidates(conv_key):
            if s.max() <= cand_info.max and s.min() >= cand_info.min:

                if verbose:
                    print('convert', colname, 'to', str(cand))
                return s.astype(cand)

        # reaching this code is bad. Probably there are inf, or other high numbs
        print(("WARNING: {} " 
               "doesn't fit the grid with \nmax: {} "
               "and \nmin: {}").format(colname, s.max(), s.min()))
        print('Dropping it..')
        

df = pd.read_csv("train.csv")

reducer = Reducer()
df = reducer.reduce(df)

c = df['DepartmentDescription'].isnull()
df.loc[c, 'DepartmentDescription'] = 'Na'
df['sum'] = df.assign(f=df.groupby(['VisitNumber', 'DepartmentDescription'])['ScanCount'].transform(sum))['f']
df_s = df.iloc[:, [0, 1, 5, -1]]
df_s = df_s.drop_duplicates(['VisitNumber', 'DepartmentDescription']).reset_index(drop=True)

# 가중치 테이블
data = df.iloc[:, [0, 1, 5, -1]].drop_duplicates(['VisitNumber', 'DepartmentDescription'])
sub = data.groupby(['TripType', 'DepartmentDescription'], as_index=False).agg('sum').iloc[:, [0,1,3]]
c = sub['TripType'] == sub['TripType'].unique()[0]
base = minmax_scale(sub[c]['sum'])

for i in range(1, 38):
    c = sub['TripType'] == sub['TripType'].unique()[i]
    base = np.hstack([base, minmax_scale(sub[c]['sum'])]) 
    
sub['minmax'] = pd.Series(base)

# TripType 41,42,43,44 구분
- 목표 Dept 만으로 41,42,43,44 분류해내기
- 결론: 불가 -> FN과 Upc의 영향을 많이 받는 것으로 추측

### test_x1: 41, .., 44, TheRest로 label
- 총 클래스 갯수: 5
- 각 TT별 Dept 가중치를 구함
- TT 41, ..., 44별 dept 가중치를 곱해서 합산한 ScanCount값을 컬럼으로 만듦

### test_x1.2: 41, .., 44 제외한 클래스는 배제
- 총 클래스 갯수: 4개
- feature 컬럼 상동

### test_x2: 41, .., 44, TheRest로 label
- 총 클래스 갯수: 5개
- TT 3, ..., 999별 dept 가중치를 곱해서 합산한 ScanCount값을 컬럼으로 만듦

### test_x3: 41, .., 44 제외한 클래스는 배제
- 총 클래스 갯수: 4개
- TT 41, ..., 44별 dept 가중치 곱한 ScanCount값으로 Dept Crosstab

# test_X 1

In [21]:
ls = [41, 42, 43, 44]
temp = df_s.copy()

for i in ls:
    c = sub['TripType'] == i
    temp = temp.merge(sub[c].iloc[:, [1, -1]], how='outer',on='DepartmentDescription')
    col_name = "{}_sum".format(i)
    temp.loc[:, col_name] = temp['sum'] * temp['minmax']
    temp.drop('minmax', axis=1, inplace=True)

test = temp.fillna(-1.0).groupby(['VisitNumber', 'TripType'], as_index=False).agg(sum)

x = test.merge(df_s.pivot('VisitNumber', 'DepartmentDescription', 'sum').fillna(0), on='VisitNumber').set_index('VisitNumber').drop(['TripType', 'sum', 'HEALTH AND BEAUTY AIDS'], axis=1)
y = test['TripType']

# mapping
lab4 = {41: 1, 42: 2, 43:3, 44: 4}
y_lab = y.map(lab4).fillna(5)

x.sample(5)

Unnamed: 0_level_0,41_sum,42_sum,43_sum,44_sum,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,CAMERAS AND SUPPLIES,"CANDY, TOBACCO, COOKIES",CELEBRATION,COMM BREAD,CONCEPT STORES,COOK AND DINE,DAIRY,DSD GROCERY,ELECTRONICS,FABRICS AND CRAFTS,FINANCIAL SERVICES,FROZEN FOODS,FURNITURE,"GIRLS WEAR, 4-6X AND 7-14",GROCERY DRY GOODS,HARDWARE,HOME DECOR,HOME MANAGEMENT,HORTICULTURE AND ACCESS,HOUSEHOLD CHEMICALS/SUPP,HOUSEHOLD PAPER GOODS,IMPULSE MERCHANDISE,INFANT APPAREL,INFANT CONSUMABLE HARDLINES,JEWELRY AND SUNGLASSES,LADIES SOCKS,LADIESWEAR,LARGE HOUSEHOLD GOODS,LAWN AND GARDEN,"LIQUOR,WINE,BEER",MEAT - FRESH & FROZEN,MEDIA AND GAMING,MENS WEAR,MENSWEAR,Na,OFFICE SUPPLIES,OPTICAL - FRAMES,OPTICAL - LENSES,OTHER DEPARTMENTS,PAINT AND ACCESSORIES,PERSONAL CARE,PETS AND SUPPLIES,PHARMACY OTC,PHARMACY RX,PLAYERS AND ELECTRONICS,PLUS AND MATERNITY,PRE PACKED DELI,PRODUCE,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
142760,0.291005,0.22439,0.330909,0.502392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39861,4.550265,3.25784,4.809091,3.777512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70197,4.724868,3.641115,5.647273,4.545933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61763,0.521164,1.012544,0.594545,0.420096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118328,0.026455,0.038328,0.014545,0.02201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
x.info()

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 95674 entries, 5 to 191347
Data columns (total 72 columns):
41_sum                         95674 non-null float64
42_sum                         95674 non-null float64
43_sum                         95674 non-null float64
44_sum                         95674 non-null float64
1-HR PHOTO                     95674 non-null float64
ACCESSORIES                    95674 non-null float64
AUTOMOTIVE                     95674 non-null float64
BAKERY                         95674 non-null float64
BATH AND SHOWER                95674 non-null float64
BEAUTY                         95674 non-null float64
BEDDING                        95674 non-null float64
BOOKS AND MAGAZINES            95674 non-null float64
BOYS WEAR                      95674 non-null float64
BRAS & SHAPEWEAR               95674 non-null float64
CAMERAS AND SUPPLIES           95674 non-null float64
CANDY, TOBACCO, COOKIES        95674 non-null float64
CELEBRATION              

---

### oversampling

In [7]:
X_train, X_test, y_train, y_test = train_test_split(x, y_lab, random_state=0)

# # ADASYN
X_samp, y_samp = ADASYN(random_state=0).fit_sample(X_train, y_train)

# oversampling
# X_samp, y_samp = RandomOverSampler(random_state=0).fit_sample(X_train, y_train)

### gbm

In [5]:
import lightgbm

gbm3 = lightgbm.LGBMClassifier(n_estimators=200, max_depth=2, random_state=0)
gbm3.fit(X_samp, y_samp)
print(classification_report(y_test, gbm3.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.08      0.43      0.13       144
         2.0       0.14      0.47      0.22       458
         3.0       0.04      0.68      0.08       188
         4.0       0.10      0.76      0.18       295
         5.0       1.00      0.71      0.83     22834

   micro avg       0.71      0.71      0.71     23919
   macro avg       0.27      0.61      0.29     23919
weighted avg       0.96      0.71      0.80     23919



In [6]:
gbm3.get_params

<bound method LGBMModel.get_params of LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=2,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=200, n_jobs=-1, num_leaves=31, objective=None,
        random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)>

In [13]:
from sklearn.model_selection import GridSearchCV

grid_param = {  
    'n_estimators': [100, 200],
    'max_depth': [2,3,4,6]
}

gd_sr2 = GridSearchCV(estimator=gbm3,  
                     param_grid=grid_param,
                     scoring='neg_log_loss',
                     cv=5,
                     n_jobs=-1)

gd_sr2.fit(X_samp, y_samp)
print(classification_report(y_test, gd_sr2.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.10      0.24      0.14       144
         2.0       0.19      0.57      0.28       458
         3.0       0.08      0.59      0.14       188
         4.0       0.19      0.63      0.29       295
         5.0       0.99      0.86      0.92     22834

   micro avg       0.85      0.85      0.85     23919
   macro avg       0.31      0.58      0.35     23919
weighted avg       0.96      0.85      0.89     23919



In [12]:
print(confusion_matrix(y_test, gd_sr2.best_estimator_.predict(X_test)))

[[   17    50    20    29    28]
 [   23   258    33    52    92]
 [    8    31    80    22    47]
 [    4    42    24   159    66]
 [  177   758   655   400 20844]]


### gbm2

In [46]:
gbm2 = lightgbm.LGBMClassifier(n_estimators=300, max_depth=2, random_state=0)
gbm2.fit(X_samp, y_samp)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=2,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=300, n_jobs=-1, num_leaves=31, objective=None,
        random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [47]:
print(classification_report(y_test, gbm2.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.07      0.47      0.13       144
         2.0       0.15      0.45      0.22       458
         3.0       0.04      0.64      0.08       188
         4.0       0.09      0.72      0.16       295
         5.0       1.00      0.72      0.84     22834

   micro avg       0.71      0.71      0.71     23919
   macro avg       0.27      0.60      0.29     23919
weighted avg       0.96      0.71      0.81     23919



### random forest

In [48]:
rf2 = RandomForestClassifier(n_estimators=300, n_jobs=-1, criterion='gini', max_depth=3, )
rf2.fit(X_samp, y_samp)
print(classification_report(y_test, rf2.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.06      0.30      0.09       144
         2.0       0.15      0.15      0.15       458
         3.0       0.02      0.70      0.04       188
         4.0       0.04      0.72      0.08       295
         5.0       1.00      0.47      0.64     22834

   micro avg       0.46      0.46      0.46     23919
   macro avg       0.25      0.47      0.20     23919
weighted avg       0.95      0.46      0.61     23919



### knn

In [9]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs=-2)
knn.fit(X_samp,y_samp)
print(classification_report(y_test, knn.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.05      0.14      0.07       144
         2.0       0.15      0.38      0.22       458
         3.0       0.08      0.30      0.13       188
         4.0       0.18      0.37      0.24       295
         5.0       0.99      0.91      0.94     22834

   micro avg       0.88      0.88      0.88     23919
   macro avg       0.29      0.42      0.32     23919
weighted avg       0.95      0.88      0.91     23919



In [10]:
print(confusion_matrix(y_test, knn.predict(X_test)))

[[   20    55    14    17    38]
 [   50   172    46    62   128]
 [   11    31    57    21    68]
 [   30    57    29   110    69]
 [  329   826   561   417 20701]]


---

# test_X1.2

In [22]:
idx = np.where(y_lab != 5)[0]
x2 = x.iloc[idx, :]
y2 = y_lab[idx]

X_train, X_test, y_train, y_test = train_test_split(x2, y2, random_state=0)

# # ADASYN
# X_samp, y_samp = ADASYN(random_state=0).fit_sample(X_train, y_train)
x2.tail()

Unnamed: 0_level_0,41_sum,42_sum,43_sum,44_sum,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,CAMERAS AND SUPPLIES,"CANDY, TOBACCO, COOKIES",CELEBRATION,COMM BREAD,CONCEPT STORES,COOK AND DINE,DAIRY,DSD GROCERY,ELECTRONICS,FABRICS AND CRAFTS,FINANCIAL SERVICES,FROZEN FOODS,FURNITURE,"GIRLS WEAR, 4-6X AND 7-14",GROCERY DRY GOODS,HARDWARE,HOME DECOR,HOME MANAGEMENT,HORTICULTURE AND ACCESS,HOUSEHOLD CHEMICALS/SUPP,HOUSEHOLD PAPER GOODS,IMPULSE MERCHANDISE,INFANT APPAREL,INFANT CONSUMABLE HARDLINES,JEWELRY AND SUNGLASSES,LADIES SOCKS,LADIESWEAR,LARGE HOUSEHOLD GOODS,LAWN AND GARDEN,"LIQUOR,WINE,BEER",MEAT - FRESH & FROZEN,MEDIA AND GAMING,MENS WEAR,MENSWEAR,Na,OFFICE SUPPLIES,OPTICAL - FRAMES,OPTICAL - LENSES,OTHER DEPARTMENTS,PAINT AND ACCESSORIES,PERSONAL CARE,PETS AND SUPPLIES,PHARMACY OTC,PHARMACY RX,PLAYERS AND ELECTRONICS,PLUS AND MATERNITY,PRE PACKED DELI,PRODUCE,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1
191232,4.730159,4.011847,5.276364,4.746411,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191277,13.931217,11.158188,11.583636,10.245455,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
191283,5.724868,4.439721,3.978182,3.100957,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
191301,3.439153,3.722648,2.725455,2.277512,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
191312,14.330688,11.006969,11.550909,10.405263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0


In [5]:
x2.info()

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 4500 entries, 12 to 191312
Data columns (total 72 columns):
41_sum                         4500 non-null float64
42_sum                         4500 non-null float64
43_sum                         4500 non-null float64
44_sum                         4500 non-null float64
1-HR PHOTO                     4500 non-null float64
ACCESSORIES                    4500 non-null float64
AUTOMOTIVE                     4500 non-null float64
BAKERY                         4500 non-null float64
BATH AND SHOWER                4500 non-null float64
BEAUTY                         4500 non-null float64
BEDDING                        4500 non-null float64
BOOKS AND MAGAZINES            4500 non-null float64
BOYS WEAR                      4500 non-null float64
BRAS & SHAPEWEAR               4500 non-null float64
CAMERAS AND SUPPLIES           4500 non-null float64
CANDY, TOBACCO, COOKIES        4500 non-null float64
CELEBRATION                    4500 non-n

### gbm

In [64]:
import lightgbm

gbm4 = lightgbm.LGBMClassifier(n_estimators=300, max_depth=3, random_state=0)
gbm4.fit(X_train, y_train)
print(classification_report(y_test, gbm4.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.45      0.20      0.28       142
         2.0       0.64      0.77      0.70       472
         3.0       0.60      0.54      0.57       215
         4.0       0.69      0.70      0.69       296

   micro avg       0.63      0.63      0.63      1125
   macro avg       0.59      0.55      0.56      1125
weighted avg       0.62      0.63      0.62      1125



### Random forest

In [59]:
rf2 = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='gini', max_depth=3, )
rf2.fit(X_samp, y_samp)
print(classification_report(y_test, rf2.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.28      0.37      0.32       142
         2.0       0.75      0.12      0.21       472
         3.0       0.36      0.73      0.48       215
         4.0       0.49      0.70      0.58       296

   micro avg       0.42      0.42      0.42      1125
   macro avg       0.47      0.48      0.40      1125
weighted avg       0.55      0.42      0.37      1125



### Extreme forest

In [76]:
# fitting
er = ExtraTreesClassifier(n_jobs=-1, n_estimators=300, random_state=0, max_depth=3)
er.fit(X_samp, y_samp)

print(classification_report(y_test, er.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.33      0.40      0.36       142
         2.0       0.84      0.09      0.16       472
         3.0       0.33      0.82      0.47       215
         4.0       0.55      0.69      0.61       296

   micro avg       0.42      0.42      0.42      1125
   macro avg       0.51      0.50      0.40      1125
weighted avg       0.60      0.42      0.36      1125



---

# X_test2

In [6]:
ls = df['TripType'].unique()
temp = df_s.copy()

for i in ls:
    c = sub['TripType'] == i
    temp = temp.merge(sub[c].iloc[:, [1, -1]], how='outer',on='DepartmentDescription')
    col_name = "{}_sum".format(i)
    temp.loc[:, col_name] = temp['sum'] * temp['minmax']
    temp.drop('minmax', axis=1, inplace=True)

test = temp.fillna(-0.000001).groupby(['VisitNumber', 'TripType'], as_index=False).agg(sum)

x = test.merge(df_s.pivot('VisitNumber', 'DepartmentDescription', 'sum').fillna(0), on='VisitNumber').set_index('VisitNumber').drop(['TripType', 'sum', 'HEALTH AND BEAUTY AIDS'], axis=1)
y = test['TripType']

# mapping
lab4 = {41: 1, 42: 2, 43:3, 44: 4}
y_lab = y.map(lab4).fillna(5)

In [7]:
x.tail()

Unnamed: 0_level_0,999_sum,30_sum,26_sum,8_sum,35_sum,41_sum,21_sum,6_sum,42_sum,7_sum,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
191343,3.330904,2.62033,0.590255,2.536313,2.291976,7.089947,0.434349,0.950739,6.112195,1.255233,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191344,1.845481,0.252824,0.12846,0.895251,0.018019,1.441799,0.067995,0.076355,1.30662,0.072474,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
191345,7.516764,2.331885,1.522702,6.920391,5.46536,7.880952,0.746776,2.421182,5.967944,6.002926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191346,8.31414,2.312772,1.735327,6.56245,3.926215,7.365079,0.659437,2.424466,6.101742,9.55413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191347,1.281341,0.23371,0.189369,0.705706,0.168513,0.693122,0.054513,0.230706,0.659233,1.316453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
x.info(), x.columns

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 95674 entries, 5 to 191347
Columns: 106 entries, 999_sum to WIRELESS
dtypes: float64(106)
memory usage: 78.1 MB


(None,
 Index(['999_sum', '30_sum', '26_sum', '8_sum', '35_sum', '41_sum', '21_sum',
        '6_sum', '42_sum', '7_sum',
        ...
        'SEAFOOD', 'SEASONAL', 'SERVICE DELI', 'SHEER HOSIERY', 'SHOES',
        'SLEEPWEAR/FOUNDATIONS', 'SPORTING GOODS', 'SWIMWEAR/OUTERWEAR', 'TOYS',
        'WIRELESS'],
       dtype='object', length=106))

# sampling

In [4]:
# mapping
lab4 = {41: 1, 42: 2, 43:3, 44: 4}
y_lab = y.map(lab4).fillna(5)

# split
X_train, X_test, y_train, y_test = train_test_split(x, y_lab, random_state=0)

# # ADASYN
# X_samp, y_samp = ADASYN(random_state=0).fit_sample(X_train, y_train)

# oversampling
X_samp, y_samp = RandomOverSampler(random_state=0).fit_sample(X_train, y_train)

# fitting

In [6]:
# fitting
er = ExtraTreesClassifier(n_jobs=-1, n_estimators=250, random_state=0)
er.fit(X_samp, y_samp)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

# validation

## Extra

In [107]:
# fitting
er = ExtraTreesClassifier(n_jobs=-1, n_estimators=250, random_state=0)
er.fit(X_samp, y_samp)

print(classification_report(y_test, er.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00       144
         2.0       0.10      0.01      0.02       458
         3.0       0.12      0.01      0.01       188
         4.0       0.00      0.00      0.00       295
         5.0       0.95      1.00      0.97     22834

   micro avg       0.95      0.95      0.95     23919
   macro avg       0.24      0.20      0.20     23919
weighted avg       0.91      0.95      0.93     23919



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [7]:
# ADASYNC
er = ExtraTreesClassifier(n_jobs=-1, n_estimators=250, random_state=0)
er.fit(X_samp, y_samp)

print(classification_report(y_test, er.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.11      0.03      0.05       144
         2.0       0.31      0.34      0.33       458
         3.0       0.14      0.07      0.10       188
         4.0       0.37      0.25      0.30       295
         5.0       0.97      0.98      0.98     22834

   micro avg       0.95      0.95      0.95     23919
   macro avg       0.38      0.34      0.35     23919
weighted avg       0.94      0.95      0.94     23919



## Random

In [110]:
# random over
rf = RandomForestClassifier(n_estimators=300, n_jobs=-1, criterion='gini', max_depth=3, )
rf.fit(X_samp, y_samp)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.02      0.21      0.04       144
         2.0       0.19      0.19      0.19       458
         3.0       0.02      0.76      0.05       188
         4.0       0.04      0.70      0.08       295
         5.0       1.00      0.49      0.66     22834

   micro avg       0.49      0.49      0.49     23919
   macro avg       0.25      0.47      0.20     23919
weighted avg       0.96      0.49      0.63     23919



In [6]:
# ADASYN
rf = RandomForestClassifier(n_estimators=300, n_jobs=-1, criterion='gini', max_depth=3, )
rf.fit(X_samp, y_samp)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.03      0.22      0.05       144
         2.0       0.11      0.19      0.14       458
         3.0       0.03      0.74      0.05       188
         4.0       0.04      0.74      0.08       295
         5.0       1.00      0.50      0.66     22834

   micro avg       0.49      0.49      0.49     23919
   macro avg       0.24      0.48      0.20     23919
weighted avg       0.95      0.49      0.64     23919



# linear

In [10]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=100)
lr.fit(X_samp, y_samp)
print(classification_report(y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.06      0.54      0.11       144
         2.0       0.14      0.51      0.22       458
         3.0       0.06      0.46      0.11       188
         4.0       0.14      0.65      0.23       295
         5.0       0.99      0.79      0.88     22834

   micro avg       0.78      0.78      0.78     23919
   macro avg       0.28      0.59      0.31     23919
weighted avg       0.95      0.78      0.85     23919



# lightGBM

In [12]:
import lightgbm

gbm = lightgbm.LGBMClassifier(n_estimators=500, max_depth=2, random_state=0)
gbm.fit(X_samp, y_samp)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=2,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
        random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [13]:
print(classification_report(y_test, gbm.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.09      0.42      0.14       144
         2.0       0.16      0.52      0.25       458
         3.0       0.05      0.64      0.09       188
         4.0       0.14      0.73      0.23       295
         5.0       1.00      0.78      0.87     22834

   micro avg       0.77      0.77      0.77     23919
   macro avg       0.29      0.62      0.32     23919
weighted avg       0.96      0.77      0.84     23919



# grid search

In [18]:
from sklearn.model_selection import GridSearchCV
gbm.get_params

<bound method LGBMModel.get_params of LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=2,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
        random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)>

In [19]:
grid_param = {  
    'n_estimators': [100, 200, 250, 300, 400],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

gd_sr = GridSearchCV(estimator=gbm,  
                     param_grid=grid_param,
                     scoring='accuracy',
                     cv=5,
                     n_jobs=-1)

gd_sr.fit(X_samp, y_samp)  

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=2,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=500, n_jobs=-1, num_leaves=31, objective=None,
        random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [100, 200, 250, 300, 400], 'criterion': ['gini', 'entropy'], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [None]:
gd_sr.best_estimator_

In [20]:
gd_sr.best_params_

{'bootstrap': True, 'criterion': 'gini', 'n_estimators': 400}

In [21]:
gd_sr.best_score_

0.7544629792215394

In [23]:
gd_gbm = gd_sr.best_estimator_
print(classification_report(y_test, gd_gbm.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.09      0.44      0.15       144
         2.0       0.16      0.51      0.24       458
         3.0       0.05      0.63      0.09       188
         4.0       0.13      0.73      0.21       295
         5.0       1.00      0.77      0.87     22834

   micro avg       0.76      0.76      0.76     23919
   macro avg       0.28      0.62      0.31     23919
weighted avg       0.96      0.76      0.84     23919



In [25]:
gd_gbm.feature_importances_

array([210,  94,  74, 162,  60, 110,  88, 105,  49, 104, 267, 126, 111,
        36,  83, 194,  56,  70,  34,  87, 152,  97,  75,  67, 167,  39,
        84,  94, 111,  86,  37,  70,  50,  73,  28,  69, 141,  61,  16,
        67,  40,  13,  61,  96,  41,  45,  48,  51,   5,  28,  55,  39,
         0,  80,  13,   0,  74,   5,  16,  37,  29,  63,  32,  62,  75,
        42,  37,  13,  27,  37,  91,  37,  49,  15,  77,   0,  47,  53,
        27,  69,  50,  17,  13,  49,   8,   0,   0,  43,   2,  17,  16,
        16,   3,  16,  27,   9,  19,  19,  26,  32,  75,  38,  46,  34,
        51,  26])

In [26]:
gd_gbm.learning_rate

0.1

In [30]:
gd_gbm.get_params

<bound method LGBMModel.get_params of LGBMClassifier(boosting_type='gbdt', bootstrap=True, class_weight=None,
        colsample_bytree=1.0, criterion='gini', importance_type='split',
        learning_rate=0.1, max_depth=2, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=400,
        n_jobs=-1, num_leaves=31, objective=None, random_state=0,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)>

In [32]:
X_test.columns

Index(['999_sum', '30_sum', '26_sum', '8_sum', '35_sum', '41_sum', '21_sum',
       '6_sum', '42_sum', '7_sum',
       ...
       'SEAFOOD', 'SEASONAL', 'SERVICE DELI', 'SHEER HOSIERY', 'SHOES',
       'SLEEPWEAR/FOUNDATIONS', 'SPORTING GOODS', 'SWIMWEAR/OUTERWEAR', 'TOYS',
       'WIRELESS'],
      dtype='object', length=106)

---

# X_test3
- 41,42,43,44

In [23]:
idx = np.where(y_lab != 5)[0]
x2 = x.iloc[idx, :]
y2 = y_lab[idx].reset_index(drop=True)

x3 = x2.drop(x2.columns[:4], axis=1).copy()
b = x3.stack().reset_index().rename(columns={'level_1': 'DepartmentDescription', 0:'ScanCount'}).reset_index(drop=True)
bc = b.copy()

for i in [41, 42, 43, 44]:
    criteria = sub['TripType'] == i
    a = sub[criteria].loc[:, ['DepartmentDescription', 'minmax']]
    temp = b.merge(a, how='outer', on='DepartmentDescription')
    col_name = "{}_sum".format(i)
    bc[col_name] = temp['ScanCount'] * temp['minmax']
bc = bc.fillna(-0.00001)

In [37]:
x_test3 = pd.concat([bc.pivot('VisitNumber', 'DepartmentDescription', '41_sum'),\
          bc.pivot('VisitNumber', 'DepartmentDescription', '42_sum'),\
          bc.pivot('VisitNumber', 'DepartmentDescription', '43_sum'),\
          bc.pivot('VisitNumber', 'DepartmentDescription', '44_sum'),], axis=1)

In [38]:
x_test3.tail()

DepartmentDescription,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,CAMERAS AND SUPPLIES,"CANDY, TOBACCO, COOKIES",CELEBRATION,COMM BREAD,CONCEPT STORES,COOK AND DINE,DAIRY,DSD GROCERY,ELECTRONICS,FABRICS AND CRAFTS,FINANCIAL SERVICES,FROZEN FOODS,FURNITURE,"GIRLS WEAR, 4-6X AND 7-14",GROCERY DRY GOODS,HARDWARE,HOME DECOR,HOME MANAGEMENT,HORTICULTURE AND ACCESS,HOUSEHOLD CHEMICALS/SUPP,HOUSEHOLD PAPER GOODS,IMPULSE MERCHANDISE,INFANT APPAREL,INFANT CONSUMABLE HARDLINES,JEWELRY AND SUNGLASSES,LADIES SOCKS,LADIESWEAR,LARGE HOUSEHOLD GOODS,LAWN AND GARDEN,"LIQUOR,WINE,BEER",MEAT - FRESH & FROZEN,MEDIA AND GAMING,MENS WEAR,MENSWEAR,Na,OFFICE SUPPLIES,OPTICAL - FRAMES,OPTICAL - LENSES,OTHER DEPARTMENTS,PAINT AND ACCESSORIES,PERSONAL CARE,PETS AND SUPPLIES,PHARMACY OTC,PHARMACY RX,PLAYERS AND ELECTRONICS,PLUS AND MATERNITY,PRE PACKED DELI,PRODUCE,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,CAMERAS AND SUPPLIES,"CANDY, TOBACCO, COOKIES",CELEBRATION,COMM BREAD,CONCEPT STORES,COOK AND DINE,DAIRY,DSD GROCERY,ELECTRONICS,FABRICS AND CRAFTS,FINANCIAL SERVICES,FROZEN FOODS,FURNITURE,"GIRLS WEAR, 4-6X AND 7-14",GROCERY DRY GOODS,HARDWARE,HOME DECOR,HOME MANAGEMENT,HORTICULTURE AND ACCESS,HOUSEHOLD CHEMICALS/SUPP,HOUSEHOLD PAPER GOODS,IMPULSE MERCHANDISE,INFANT APPAREL,INFANT CONSUMABLE HARDLINES,JEWELRY AND SUNGLASSES,LADIES SOCKS,LADIESWEAR,LARGE HOUSEHOLD GOODS,LAWN AND GARDEN,"LIQUOR,WINE,BEER",MEAT - FRESH & FROZEN,MEDIA AND GAMING,MENS WEAR,MENSWEAR,Na,OFFICE SUPPLIES,OPTICAL - FRAMES,OPTICAL - LENSES,OTHER DEPARTMENTS,PAINT AND ACCESSORIES,PERSONAL CARE,PETS AND SUPPLIES,PHARMACY OTC,PHARMACY RX,PLAYERS AND ELECTRONICS,PLUS AND MATERNITY,PRE PACKED DELI,PRODUCE,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,CAMERAS AND SUPPLIES,"CANDY, TOBACCO, COOKIES",CELEBRATION,COMM BREAD,CONCEPT STORES,COOK AND DINE,DAIRY,DSD GROCERY,ELECTRONICS,FABRICS AND CRAFTS,FINANCIAL SERVICES,FROZEN FOODS,FURNITURE,"GIRLS WEAR, 4-6X AND 7-14",GROCERY DRY GOODS,HARDWARE,HOME DECOR,HOME MANAGEMENT,HORTICULTURE AND ACCESS,HOUSEHOLD CHEMICALS/SUPP,HOUSEHOLD PAPER GOODS,IMPULSE MERCHANDISE,INFANT APPAREL,INFANT CONSUMABLE HARDLINES,JEWELRY AND SUNGLASSES,LADIES SOCKS,LADIESWEAR,LARGE HOUSEHOLD GOODS,LAWN AND GARDEN,"LIQUOR,WINE,BEER",MEAT - FRESH & FROZEN,MEDIA AND GAMING,MENS WEAR,MENSWEAR,Na,OFFICE SUPPLIES,OPTICAL - FRAMES,OPTICAL - LENSES,OTHER DEPARTMENTS,PAINT AND ACCESSORIES,PERSONAL CARE,PETS AND SUPPLIES,PHARMACY OTC,PHARMACY RX,PLAYERS AND ELECTRONICS,PLUS AND MATERNITY,PRE PACKED DELI,PRODUCE,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,BOOKS AND MAGAZINES,BOYS WEAR,BRAS & SHAPEWEAR,CAMERAS AND SUPPLIES,"CANDY, TOBACCO, COOKIES",CELEBRATION,COMM BREAD,CONCEPT STORES,COOK AND DINE,DAIRY,DSD GROCERY,ELECTRONICS,FABRICS AND CRAFTS,FINANCIAL SERVICES,FROZEN FOODS,FURNITURE,"GIRLS WEAR, 4-6X AND 7-14",GROCERY DRY GOODS,HARDWARE,HOME DECOR,HOME MANAGEMENT,HORTICULTURE AND ACCESS,HOUSEHOLD CHEMICALS/SUPP,HOUSEHOLD PAPER GOODS,IMPULSE MERCHANDISE,INFANT APPAREL,INFANT CONSUMABLE HARDLINES,JEWELRY AND SUNGLASSES,LADIES SOCKS,LADIESWEAR,LARGE HOUSEHOLD GOODS,LAWN AND GARDEN,"LIQUOR,WINE,BEER",MEAT - FRESH & FROZEN,MEDIA AND GAMING,MENS WEAR,MENSWEAR,Na,OFFICE SUPPLIES,OPTICAL - FRAMES,OPTICAL - LENSES,OTHER DEPARTMENTS,PAINT AND ACCESSORIES,PERSONAL CARE,PETS AND SUPPLIES,PHARMACY OTC,PHARMACY RX,PLAYERS AND ELECTRONICS,PLUS AND MATERNITY,PRE PACKED DELI,PRODUCE,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1,Unnamed: 122_level_1,Unnamed: 123_level_1,Unnamed: 124_level_1,Unnamed: 125_level_1,Unnamed: 126_level_1,Unnamed: 127_level_1,Unnamed: 128_level_1,Unnamed: 129_level_1,Unnamed: 130_level_1,Unnamed: 131_level_1,Unnamed: 132_level_1,Unnamed: 133_level_1,Unnamed: 134_level_1,Unnamed: 135_level_1,Unnamed: 136_level_1,Unnamed: 137_level_1,Unnamed: 138_level_1,Unnamed: 139_level_1,Unnamed: 140_level_1,Unnamed: 141_level_1,Unnamed: 142_level_1,Unnamed: 143_level_1,Unnamed: 144_level_1,Unnamed: 145_level_1,Unnamed: 146_level_1,Unnamed: 147_level_1,Unnamed: 148_level_1,Unnamed: 149_level_1,Unnamed: 150_level_1,Unnamed: 151_level_1,Unnamed: 152_level_1,Unnamed: 153_level_1,Unnamed: 154_level_1,Unnamed: 155_level_1,Unnamed: 156_level_1,Unnamed: 157_level_1,Unnamed: 158_level_1,Unnamed: 159_level_1,Unnamed: 160_level_1,Unnamed: 161_level_1,Unnamed: 162_level_1,Unnamed: 163_level_1,Unnamed: 164_level_1,Unnamed: 165_level_1,Unnamed: 166_level_1,Unnamed: 167_level_1,Unnamed: 168_level_1,Unnamed: 169_level_1,Unnamed: 170_level_1,Unnamed: 171_level_1,Unnamed: 172_level_1,Unnamed: 173_level_1,Unnamed: 174_level_1,Unnamed: 175_level_1,Unnamed: 176_level_1,Unnamed: 177_level_1,Unnamed: 178_level_1,Unnamed: 179_level_1,Unnamed: 180_level_1,Unnamed: 181_level_1,Unnamed: 182_level_1,Unnamed: 183_level_1,Unnamed: 184_level_1,Unnamed: 185_level_1,Unnamed: 186_level_1,Unnamed: 187_level_1,Unnamed: 188_level_1,Unnamed: 189_level_1,Unnamed: 190_level_1,Unnamed: 191_level_1,Unnamed: 192_level_1,Unnamed: 193_level_1,Unnamed: 194_level_1,Unnamed: 195_level_1,Unnamed: 196_level_1,Unnamed: 197_level_1,Unnamed: 198_level_1,Unnamed: 199_level_1,Unnamed: 200_level_1,Unnamed: 201_level_1,Unnamed: 202_level_1,Unnamed: 203_level_1,Unnamed: 204_level_1,Unnamed: 205_level_1,Unnamed: 206_level_1,Unnamed: 207_level_1,Unnamed: 208_level_1,Unnamed: 209_level_1,Unnamed: 210_level_1,Unnamed: 211_level_1,Unnamed: 212_level_1,Unnamed: 213_level_1,Unnamed: 214_level_1,Unnamed: 215_level_1,Unnamed: 216_level_1,Unnamed: 217_level_1,Unnamed: 218_level_1,Unnamed: 219_level_1,Unnamed: 220_level_1,Unnamed: 221_level_1,Unnamed: 222_level_1,Unnamed: 223_level_1,Unnamed: 224_level_1,Unnamed: 225_level_1,Unnamed: 226_level_1,Unnamed: 227_level_1,Unnamed: 228_level_1,Unnamed: 229_level_1,Unnamed: 230_level_1,Unnamed: 231_level_1,Unnamed: 232_level_1,Unnamed: 233_level_1,Unnamed: 234_level_1,Unnamed: 235_level_1,Unnamed: 236_level_1,Unnamed: 237_level_1,Unnamed: 238_level_1,Unnamed: 239_level_1,Unnamed: 240_level_1,Unnamed: 241_level_1,Unnamed: 242_level_1,Unnamed: 243_level_1,Unnamed: 244_level_1,Unnamed: 245_level_1,Unnamed: 246_level_1,Unnamed: 247_level_1,Unnamed: 248_level_1,Unnamed: 249_level_1,Unnamed: 250_level_1,Unnamed: 251_level_1,Unnamed: 252_level_1,Unnamed: 253_level_1,Unnamed: 254_level_1,Unnamed: 255_level_1,Unnamed: 256_level_1,Unnamed: 257_level_1,Unnamed: 258_level_1,Unnamed: 259_level_1,Unnamed: 260_level_1,Unnamed: 261_level_1,Unnamed: 262_level_1,Unnamed: 263_level_1,Unnamed: 264_level_1,Unnamed: 265_level_1,Unnamed: 266_level_1,Unnamed: 267_level_1,Unnamed: 268_level_1,Unnamed: 269_level_1,Unnamed: 270_level_1,Unnamed: 271_level_1,Unnamed: 272_level_1
191232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191277,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010048,0.0,0.0,0.0
191283,0.087302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087302,0.0,0.0,0.0,0.0,0.087302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087302,0.0,0.0,0.0,0.0,0.087302,0.0,0.087302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038328,0.0,0.0,0.0,0.0,0.038328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038328,0.0,0.0,0.0,0.0,0.038328,0.0,0.038328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010909,0.0,0.0,0.0,0.0,0.010909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010909,0.0,0.0,0.0,0.0,0.010909,0.0,0.010909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010048,0.0,0.0,0.0,0.0,0.010048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010048,0.0,0.0,0.0,0.0,0.010048,0.0,0.010048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.087302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076655,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
191312,0.0,0.0,0.0,0.0,0.0,0.087302,0.0,0.0,0.0,0.087302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174603,0.0,0.0,0.0,0.0,0.0,0.038328,0.0,0.0,0.0,0.038328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076655,0.0,0.0,0.0,0.0,0.0,0.010909,0.0,0.0,0.0,0.010909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021818,0.0,0.0,0.0,0.0,0.0,0.010048,0.0,0.0,0.0,0.010048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020096


In [28]:
x_test3.info(), x_test3.columns

<class 'pandas.core.frame.DataFrame'>
UInt64Index: 4500 entries, 12 to 191312
Columns: 272 entries, 1-HR PHOTO to WIRELESS
dtypes: float64(272)
memory usage: 9.4 MB


(None,
 Index(['1-HR PHOTO', 'ACCESSORIES', 'AUTOMOTIVE', 'BAKERY', 'BATH AND SHOWER',
        'BEAUTY', 'BEDDING', 'BOOKS AND MAGAZINES', 'BOYS WEAR',
        'BRAS & SHAPEWEAR',
        ...
        'SEAFOOD', 'SEASONAL', 'SERVICE DELI', 'SHEER HOSIERY', 'SHOES',
        'SLEEPWEAR/FOUNDATIONS', 'SPORTING GOODS', 'SWIMWEAR/OUTERWEAR', 'TOYS',
        'WIRELESS'],
       dtype='object', name='DepartmentDescription', length=272))

In [32]:
X_train, X_test, y_train, y_test = train_test_split(x_test3, y2, random_state=0)
X_samp, y_samp = RandomOverSampler(random_state=0).fit_sample(X_train, y_train)

In [33]:
# random over
rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, criterion='gini', max_depth=3, )
rf.fit(X_samp, y_samp)
print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.15      0.15      0.15       142
         2.0       0.31      0.03      0.06       472
         3.0       0.20      0.24      0.22       215
         4.0       0.27      0.62      0.38       296

   micro avg       0.24      0.24      0.24      1125
   macro avg       0.23      0.26      0.20      1125
weighted avg       0.26      0.24      0.19      1125



In [35]:
# extrem forest
er = ExtraTreesClassifier(n_jobs=-1, n_estimators=250, random_state=0)
er.fit(X_samp, y_samp)

print(classification_report(y_test, er.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.10      0.12      0.11       142
         2.0       0.42      0.60      0.49       472
         3.0       0.23      0.13      0.16       215
         4.0       0.25      0.15      0.19       296

   micro avg       0.33      0.33      0.33      1125
   macro avg       0.25      0.25      0.24      1125
weighted avg       0.30      0.33      0.30      1125



In [188]:
import lightgbm

gbm5 = lightgbm.LGBMClassifier(n_estimators=300, max_depth=3, random_state=0)
gbm5.fit(X_samp, y_samp)
print(classification_report(y_test, gbm5.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.14      0.37      0.20       142
         2.0       0.42      0.27      0.33       472
         3.0       0.19      0.18      0.19       215
         4.0       0.31      0.24      0.27       296

   micro avg       0.26      0.26      0.26      1125
   macro avg       0.26      0.27      0.25      1125
weighted avg       0.31      0.26      0.27      1125



In [192]:
# linear regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=0.1)
lr.fit(X_samp, y_samp)
print(classification_report(y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.12      0.47      0.19       142
         2.0       0.41      0.15      0.22       472
         3.0       0.18      0.16      0.17       215
         4.0       0.28      0.17      0.21       296

   micro avg       0.20      0.20      0.20      1125
   macro avg       0.24      0.24      0.20      1125
weighted avg       0.29      0.20      0.20      1125



In [197]:
knn = KNeighborsClassifier(n_neighbors = 5, n_jobs=-1)
knn.fit(X_samp,y_samp)
print(classification_report(y_test, knn.predict(X_test)))

              precision    recall  f1-score   support

         1.0       0.13      0.25      0.17       142
         2.0       0.43      0.38      0.40       472
         3.0       0.18      0.22      0.20       215
         4.0       0.31      0.17      0.22       296

   micro avg       0.28      0.28      0.28      1125
   macro avg       0.26      0.26      0.25      1125
weighted avg       0.31      0.28      0.29      1125



In [224]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=30000, min_samples_leaf=1)
tree.fit(X_samp,y_samp)
print(tree.score(X_samp,y_samp))
print(classification_report(y_test, tree.predict(X_test)))

0.8728354978354979
              precision    recall  f1-score   support

         1.0       0.10      0.20      0.14       142
         2.0       0.41      0.31      0.35       472
         3.0       0.21      0.23      0.22       215
         4.0       0.25      0.21      0.23       296

   micro avg       0.26      0.26      0.26      1125
   macro avg       0.24      0.24      0.23      1125
weighted avg       0.29      0.26      0.27      1125



---

# submission data

In [2]:
df_test = pd.read_csv("test.csv")

reducer = Reducer()
df_test = reducer.reduce(df_test)
df_test.drop(['Weekday', 'Upc', 'FinelineNumber'], inplace=True, axis=1)

c = df_test['DepartmentDescription'].isnull()
df_test.loc[c, 'DepartmentDescription'] = 'Na'
df_test['sum'] = df.assign(f=df.groupby(['VisitNumber', 'DepartmentDescription'])['ScanCount'].transform(sum))['f']

df_s1 = df_test.iloc[:, [0,2, -1]]
df_s1 = df_test.drop_duplicates(['VisitNumber', 'DepartmentDescription']).reset_index(drop=True)

temp1 = df_test.copy()

for i in ls:
    c = sub['TripType'] == i
    temp1 = temp1.merge(sub[c].iloc[:, [1, -1]], how='outer',on='DepartmentDescription')
    col_name = "{}_sum".format(i)
    temp1.loc[:, col_name] = temp1['sum'] * temp1['minmax']
    temp1.drop('minmax', axis=1, inplace=True)
test1 = temp1[:-1].fillna(-0.000001).groupby('VisitNumber',).agg(sum).iloc[:, 2:]

test_x = test1.merge(df_s1.pivot('VisitNumber', 'DepartmentDescription', 'sum').fillna(0), on='VisitNumber')

In [3]:
not_in_train = [i for i in test_x.columns if i not in x.columns]

not_in_test = [i for i in x.columns if i not in test_x.columns]

In [8]:
pred = er.predict(test_x)

In [20]:
pred_proba = er.predict_proba(test_x)

In [29]:
ans = pd.concat([df_test['VisitNumber'].drop_duplicates().reset_index(drop=True), pd.Series(pred)], axis=1)
# ans.to_csv("clf_ans.csv", header=True, index=False)

In [47]:
# result = lightgbm_model.predict(total_test)
samplesub = pd.read_csv('sample_submission.csv')
subform_df_columns = samplesub.columns[1:]
result_df = pd.DataFrame(pred_proba)
# result_df.columns = subform_df_columns
# subform_df = pd.concat([test_x.index.reset_index()['VisitNumber'], result_df], axis=1)
# subform_df.set_index('VisitNumber', inplace=True)
# subform_df.tail()

In [63]:
proba = pd.merge(pd.DataFrame(pred_proba, test_x.index, subform_df_columns[-5:]), ans, on='VisitNumber')

In [78]:
c = proba[0] != 5 
ans2 = proba[c].drop(0, axis=1)
ans2.head()

Unnamed: 0,VisitNumber,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
181,368.0,0.0,0.782258,0.0,0.0,0.217742
220,455.0,0.0,0.947368,0.0,0.0,0.052632
520,1051.0,0.008,0.564,0.0,0.06,0.368
603,1226.0,0.0,0.782258,0.0,0.0,0.217742
736,1492.0,0.0,0.867647,0.0,0.0,0.132353


In [79]:
ans1 = pd.read_csv("clf.csv")
ans1.head()

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_32,TripType_33,TripType_34,TripType_35,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_999
0,1,9e-06,9.708875e-07,1.9e-05,1.7e-05,0.000702,0.007658,0.001035,9.882957e-07,2.652647e-09,...,2.4e-05,1.6e-05,2.07398e-06,0.000139,3e-05,0.000109,0.147647,0.006152,2.3e-05,0.001506
1,2,0.000297,3.32914e-05,0.000732,0.001019,0.044845,0.035609,0.11024,4.403765e-05,1.114693e-07,...,0.000719,0.000584,6.89876e-05,0.389547,0.001044,0.003488,0.03741,0.24981,0.000736,0.046874
2,3,1e-06,1.268747e-07,2.5e-05,3e-06,5e-06,0.000234,0.000133,1.210448e-07,3.954742e-10,...,3e-06,4e-06,1.251117e-06,8e-06,0.000292,2e-06,3e-06,1e-05,3e-06,0.999247
3,4,3.3e-05,3.859514e-06,9.1e-05,0.000117,0.000159,0.029728,0.938973,3.437097e-06,1.339275e-08,...,4.9e-05,9.9e-05,7.60162e-06,0.000273,9.7e-05,7.3e-05,9.5e-05,0.000238,7.8e-05,0.012868
4,6,1e-06,1.155458e-07,3e-06,3e-06,5e-06,9.4e-05,0.00011,1.027682e-07,3.635016e-10,...,2e-06,3e-06,2.360019e-07,7e-06,3e-06,2e-06,2e-06,9e-06,2e-06,0.999637


In [93]:
ans_final = pd.merge(ans1, ans2, how='outer').fillna(0).sort_values('VisitNumber')
col_order = ['VisitNumber', 'TripType_3', 'TripType_4', 'TripType_5', 'TripType_6',
       'TripType_7', 'TripType_8', 'TripType_9', 'TripType_12', 'TripType_14',
       'TripType_15', 'TripType_18', 'TripType_19', 'TripType_20',
       'TripType_21', 'TripType_22', 'TripType_23', 'TripType_24',
       'TripType_25', 'TripType_26', 'TripType_27', 'TripType_28',
       'TripType_29', 'TripType_30', 'TripType_31', 'TripType_32',
       'TripType_33', 'TripType_34', 'TripType_35', 'TripType_36',
       'TripType_37', 'TripType_38', 'TripType_39', 'TripType_40',
       'TripType_41', 'TripType_42', 'TripType_43', 'TripType_44',
       'TripType_999',]

In [104]:
ans_final1 = ans_final.reindex(col_order, axis=1)
ans_final1.to_csv('submission1.csv', index=False)

In [14]:
pd.read_csv("lightgbm_submission__plz_.csv")

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,3.955246e-07,2.393574e-07,0.000016,6.740869e-06,3.915994e-04,0.017250,0.001274,3.897092e-07,1.813186e-10,...,0.000034,9.021664e-05,2.031978e-01,0.006986,2.088225e-05,1.198758e-03,0.003002,6.715756e-04,2.930259e-05,0.000377
1,2,1.961376e-05,2.466022e-05,0.000288,4.330782e-04,4.824257e-02,0.105352,0.062787,1.719776e-05,5.171409e-09,...,0.001116,2.435409e-03,3.247026e-02,0.163575,5.082265e-04,1.640355e-03,0.010981,1.294181e-02,9.645004e-04,0.065446
2,3,5.591743e-08,1.766555e-08,0.000009,6.747341e-07,2.304251e-06,0.000230,0.000015,9.037669e-09,8.603149e-12,...,0.000335,5.814609e-07,6.725829e-07,0.000005,7.247595e-07,4.485091e-07,0.000002,8.309179e-07,4.587562e-07,0.999387
3,4,2.437162e-06,1.081802e-06,0.000056,7.854690e-05,5.731377e-05,0.025484,0.952119,4.058416e-07,1.161909e-07,...,0.000104,3.004935e-05,3.952971e-05,0.000141,3.134093e-05,2.014869e-05,0.000241,1.675864e-05,1.396139e-05,0.011165
4,6,4.349115e-08,1.214039e-08,0.000001,3.453322e-07,2.100275e-06,0.000035,0.000056,8.043111e-09,7.399318e-12,...,0.000001,2.930714e-07,4.614136e-07,0.000002,5.615977e-07,7.852346e-07,0.000003,3.781401e-07,5.945104e-07,0.999872
5,13,4.941331e-06,2.426457e-06,0.000127,1.268355e-04,5.571771e-01,0.418793,0.008543,2.148106e-06,9.634582e-10,...,0.000197,6.004396e-04,9.865069e-04,0.000521,1.117880e-04,3.590949e-05,0.000224,5.862681e-05,4.126768e-05,0.003715
6,14,3.339726e-06,7.347439e-07,0.000037,3.429469e-05,1.172735e-02,0.000087,0.000189,1.151929e-06,8.906270e-10,...,0.000091,2.346614e-04,7.288160e-04,0.618560,8.467364e-03,3.787501e-03,0.016008,7.562847e-02,1.574265e-01,0.000552
7,16,1.702769e-06,7.619371e-07,0.000047,1.376721e-04,2.299567e-03,0.000100,0.000060,1.012033e-06,1.086472e-08,...,0.000088,1.780775e-02,2.080131e-03,0.076675,8.333888e-03,6.661314e-05,0.000966,2.698070e-03,3.781794e-04,0.000500
8,18,6.496934e-06,4.417823e-06,0.000197,1.758730e-04,5.235805e-02,0.001723,0.000366,4.562613e-06,2.129310e-09,...,0.000957,3.522561e-02,1.489359e-02,0.554796,3.102163e-03,4.663844e-04,0.001061,4.288360e-03,1.807526e-03,0.001670
9,21,5.073983e-06,3.828919e-04,0.030413,8.960265e-05,2.743357e-03,0.000741,0.001071,3.240752e-06,1.807996e-09,...,0.641832,6.313079e-04,1.521931e-03,0.218488,1.818301e-03,1.044216e-03,0.002477,2.239195e-02,1.106635e-02,0.006240


In [15]:
X_train

Unnamed: 0_level_0,999_sum,30_sum,26_sum,8_sum,35_sum,41_sum,21_sum,6_sum,42_sum,7_sum,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
119203,4.422012,0.077324,0.253599,0.027933,0.014854,1.785714,0.009965,0.077176,2.927526,0.058294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
163522,0.524052,0.045178,0.032115,-0.000001,0.007914,0.103175,0.015826,0.012315,0.082927,0.014405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
182376,0.467930,0.028671,0.017719,0.041301,0.018507,0.105820,0.007620,0.046798,0.073868,0.890164,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9708,5.209184,0.985229,1.188261,2.100559,1.366249,4.055556,4.165885,0.690476,5.515679,1.055818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
135974,4.298105,0.846221,0.683278,2.445531,0.616218,2.537037,0.192849,0.845649,2.391638,4.949359,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30780,1.389213,0.807993,0.532669,2.540702,2.024717,2.756614,0.225674,0.761084,1.958885,1.096556,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18667,0.529155,0.038228,0.022148,0.089385,0.027274,0.129630,0.021102,0.083744,0.098955,0.657439,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87850,1.795190,0.317984,0.348837,1.182761,0.284184,0.642857,0.079132,0.354680,0.727526,2.140446,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
158706,8.534985,1.154648,1.218162,3.390263,0.796420,5.158730,0.323564,3.176519,5.072474,6.568535,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
154074,1.322886,0.454387,0.366556,1.332402,1.112383,1.687831,0.120750,0.519704,1.446690,1.537025,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
