In [48]:
import pandas as pd
import numpy as np

In [49]:
# read raw data
df_train = pd.read_csv('../data/BADS_WS1718_known.csv', sep=',', na_values=['?', 'not_reported'])
df_test = pd.read_csv('../data/BADS_WS1718_class.csv', sep=',', na_values=['?', 'not_reported'])

In [50]:
# just to prove pythons superiority compared to R
#for col in df_train.columns:
#    print(df_train.groupby(col)['order_item_id'].nunique())

In [51]:
#for col in df_test.columns:
#    print(df_test.groupby(col)['order_item_id'].nunique())

In [52]:
# Data Preprocessing

In [6]:
# 1. Drop order_item_id -> no value
df_train = df_train.drop(['order_item_id'], axis=1)
df_test = df_test.drop(['order_item_id'], axis=1)

In [7]:
# 2. Convert date columns to datetime format:  order_date, delivery-date, user_reg_date, user_dob

In [8]:
date_columns = ['order_date', 'delivery_date', 'user_reg_date', 'user_dob']
for date in date_columns:
    df_train[date] = pd.to_datetime(df_train[date], format="%Y-%m-%d")

In [9]:
for date in date_columns:
    df_test[date] = pd.to_datetime(df_test[date], format="%Y-%m-%d")

In [10]:
# 3. Calculate additional columns
# 3.1 Length of customership  (LOC) (order_date - user_reg_date )
df_train['LOC'] = (df_train.order_date - df_train.user_reg_date) / np.timedelta64(1, 'D')
df_test['LOC'] = (df_test.order_date - df_test.user_reg_date) / np.timedelta64(1, 'D')

In [11]:
#3.2 age of the customers df['age'] at the time of order (order_date - user_dob)
df_train['age'] = np.round((df_train.order_date - df_train.user_dob) / np.timedelta64(1, 'Y'))
df_test ['age'] = np.round((df_test.order_date - df_test.user_dob) / np.timedelta64(1, 'Y'))

In [12]:
#3.3 delivery-time (DT) (order_date - delivery_date)
df_train['DT'] = (df_train.delivery_date - df_train.order_date) / np.timedelta64(1, 'D')
df_test['DT'] = (df_test.delivery_date - df_test.order_date) / np.timedelta64(1, 'D')

In [13]:
#3.4 Number of past purchases (NPP)
past_purchases = df_train.user_id.value_counts()
df_train['npp'] = 0
npp = []
for elem in zip(df_train.user_id, df_train.npp):
    npp.append(past_purchases[elem[0]])
df_train.npp = npp

past_purchases = df_test.user_id.value_counts()
df_test['npp'] = 0
npp = []
for elem in zip(df_test.user_id, df_test.npp):
    npp.append(past_purchases[elem[0]])
df_test.npp = npp

In [14]:
# 3.5 Return rate of the customer
# Update: NEVER use the target variable in feature engineering because it messes up the whole model
# df_train.groupby('user_id')['return'].sum() --> I don't include it because it would not be independent from the npp column

In [15]:
# 3.6 split order_date, delivery_date into --> *_month, *year

In [16]:
df_train['order_month'], df_train['order_year'] = df_train.order_date.dt.month, df_train.order_date.dt.year 
df_test['order_month'], df_test['order_year'] = df_test.order_date.dt.month, df_test.order_date.dt.year
df_train['delivery_month'], df_train['delivery_year'] = df_train.delivery_date.dt.month, df_train.delivery_date.dt.year
df_test['delivery_month'], df_test['delivery_year'] = df_test.delivery_date.dt.month, df_train.delivery_date.dt.year

In [17]:
df_train.columns

Index(['order_date', 'delivery_date', 'item_id', 'item_size', 'item_color',
       'brand_id', 'item_price', 'user_id', 'user_title', 'user_dob',
       'user_state', 'user_reg_date', 'return', 'LOC', 'age', 'DT', 'npp',
       'order_month', 'order_year', 'delivery_month', 'delivery_year'],
      dtype='object')

In [18]:
# Drop now obsolete columns
df_train = df_train.drop(['order_date', 'delivery_date', 'user_dob', 'user_reg_date'], axis=1)
df_test =  df_test.drop(['order_date', 'delivery_date', 'user_dob', 'user_reg_date'], axis=1)

In [19]:
# # useful helper
# with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
#     print(#whatever)

In [20]:
# 5. Strategy for columns with a lot of different class labels
# 5.1 item_id --> 3 Categories: popular item (bought >= 300 times), normal item (300 > x > 50), rare item (50 >= num_bought)
def categorize_item_id(x, item_id_counts):
    x = item_id_counts[x]
    
    if x > 300:
        x = 'popular'
    elif x > 50:
        x = 'normal'
    else:
        x = 'rare'
        
    return x

item_id_train_counts = df_train.item_id.value_counts()
item_id_test_counts = df_test.item_id.value_counts()

df_train['item_popularity'] = df_train.item_id.apply(lambda x: categorize_item_id(x, item_id_train_counts))
df_test['item_popularity'] = df_test.item_id.apply(lambda x: categorize_item_id(x, item_id_test_counts))

In [24]:
# 5.2 item_size --> join big letters and small letters into same group if the same (L + l); popular normal rare
def categorize_item_size(x, item_size_count):
        
    count = item_size_count[x]
    if count > 2000:
        x = 'popular'
    elif count > 500:
        x = 'normal'
    else:
        x = 'rare'
    return x

df_train.item_size = df_train.item_size.apply(lambda x: x.lower())
df_test.item_size = df_test.item_size.apply(lambda x: x.lower())

item_size_train_counts = df_train.item_size.value_counts()
item_size_test_counts = df_test.item_size.value_counts()

df_train.item_size = df_train.item_size.apply(lambda x: categorize_item_size(x, item_size_train_counts))
df_test.item_size = df_test.item_size.apply(lambda x: categorize_item_size(x, item_size_test_counts))

In [25]:
with pd.option_context('display.max_rows', None, 'display.max_columns', 3):
    print(df_train.item_color.value_counts())

black             17896
blue              10141
grey               8703
red                8158
brown              6849
green              6662
white              4027
purple             3723
petrol             3338
ocher              2404
olive              2335
denim              2113
berry              2030
anthracite         2023
stained            1952
mocca              1798
pink               1377
aquamarine         1207
aubergine          1052
ash                 989
orange              829
ecru                822
turquoise           755
beige               657
dark denim          652
nature              637
magenta             627
iron                459
terracotta          457
bordeaux            436
azure               415
navy                369
khaki               369
yellow              323
coral               313
basalt              310
pallid              309
blau                283
curry               222
graphite            186
hibiscus            144
cognac          

In [26]:
# 5.3 item_color --> popular normal rare
def categorize_item_color(x, item_color_count):
    
    count = item_color_count[x]
    if count > 4000:
        x = 'popular'
    elif count > 300:
        x = 'normal'
    else:
        x = 'rare'

    return x

# Convert NaNs to other
df_train.item_color = df_train.item_color.fillna('other')
df_test.item_color = df_test.item_color.fillna('other')

item_color_train_counts = df_train.item_color.value_counts()
item_color_test_counts = df_test.item_color.value_counts()

df_train.item_color = df_train.item_color.apply(lambda x: categorize_item_color(x, item_color_train_counts))
df_test.item_color = df_test.item_color.apply(lambda x: categorize_item_color(x, item_color_test_counts))

In [27]:
# 5.4 brand_id --> if count > 2000 : popular; 2000 >= count > 500 : normal; Rest --> rare
def categorize_brand_id(x, brand_id_count):
    
    count = brand_id_count[x]
    
    if count > 2000:
        x = 'popular'
    elif count > 500:
        x = 'normal'
    else:
        x = 'rare'
        
    return x

brand_id_train_counts = df_train.brand_id.value_counts()
brand_id_test_counts = df_test.brand_id.value_counts()

df_train['brand_popularity'] = df_train.brand_id.apply(lambda x: categorize_brand_id(x, brand_id_train_counts))
df_test['brand_popularity'] = df_test.brand_id.apply(lambda x: categorize_brand_id(x, brand_id_test_counts))

In [28]:
# 5.5 delivery_month --> make nas to 'other'
df_train.delivery_month = df_train.delivery_month.fillna('other')
df_test.delivery_month = df_test.delivery_month.fillna('other')

In [29]:
# 5.6 delivery_year --> make nas to other
df_train.delivery_year = df_train.delivery_year.fillna('other')
df_test.delivery_year = df_test.delivery_year.fillna('other')

In [30]:
# 5.6 user_id --> nothing really that one could do here, since we already got number of purchases

In [31]:
# Again drop now unnecessary columns
df_train = df_train.drop(['item_id', 'brand_id', 'user_id'], axis=1)
df_test =  df_test.drop(['item_id', 'brand_id', 'user_id'], axis=1)

In [32]:
### All columns are fine now in terms of data type and structure ###

### Check columns for plausability ###

# item_size                      | Train: Okay | Test: Okay
# item_color                     | Train: Okay | Test: Okay
# item_price                     --> Zero values, and two 999 values. Cant for sure be outliers --> stay; Rest okay
# user_title                     | Train: Okay | Test: Okay
# user_state                     | Train: Okay | Test: Okay
# LOC                            | Train: Okay | Test: Okay
# age                            | Train: needs work | Test: needs work --> Some are older than 100 and  smaller 15 !! --FIXED--
# DT                             | Train: work | Test: work --> bunch of negative delivery times   --FIXED--
# npp                            | Train: Okay | Test: Okay 
# order_month                    | Train: Okay | Test: Okay
# order_year                     | Train: Okay | Test: Okay
# delivery_month                 | Train: Okay | Test: Okay --> NaNs bei beiden --FIXED--
# delivery_year                  | Train: Okay | Test: Okay --> NaNs bei beiden  und 1990 (unsinnig) --FIXED--
# item_popularity                | Train: Okay | Test: Okay
# brand_popularity               | Train: Okay | Test: Okay

In [33]:
# 1. fix age column: Every age > 80 == 80; and every age < 18 == 18
def fix_age(x):
    if x > 80:
        x = 80
    elif x < 18:
        x = 18
    return x

# Fill NAs with median
median_train_age = df_train.age.median()
median_test_age = df_test.age.median()

df_train.age = df_train.age.fillna(median_train_age)
df_test.age = df_test.age.fillna(median_test_age)

df_train.age = df_train.age.apply(lambda x: fix_age(x))
df_test.age = df_test.age.apply(lambda x: fix_age(x))

In [34]:
# 2.fix DT column: negative values come from 1990 delivery time --> impute to median delivery_time (mean from positive values)
def fix_DT(x, mean_DT):
    if x < 0:
        x = mean_DT
    return x

median_train_DT = df_train.DT[df_train.DT >= 0].median()
median_test_DT = df_test.DT[df_test.DT >= 0].median()

#Fill NA DTs with median
df_train.DT = df_train.DT.fillna(median_train_DT)
df_test.DT = df_test.DT.fillna(median_test_DT)

df_train.DT = df_train.DT.apply(lambda x: fix_DT(x, median_train_DT))
df_test.DT = df_test.DT.apply(lambda x: fix_DT(x, median_test_DT))

In [35]:
# 3. fix delivery_year --> 1990 values will just be smoothed to 2012
def fix_delivery_year(x):
    if x == 'other':
        return x
    if x < 2012:
        x = 2012
    return x

df_train.delivery_year = df_train.delivery_year.apply(lambda x: fix_delivery_year(x))
df_test.delivery_year = df_test.delivery_year.apply(lambda x: fix_delivery_year(x))

In [36]:
#### Data is correct and clean ###

In [37]:
df_train.columns

Index(['item_size', 'item_color', 'item_price', 'user_title', 'user_state',
       'return', 'LOC', 'age', 'DT', 'npp', 'order_month', 'order_year',
       'delivery_month', 'delivery_year', 'item_popularity',
       'brand_popularity'],
      dtype='object')

In [38]:
# test = pd.get_dummies(ord_test)
# test.columns
ord_test = pd.DataFrame()
ord_test['order_month'] = df_test['order_month'].astype('category')

In [39]:
### Encode categorical values
# 1. get variables that need to be 1-hot-encoded: item_size, item_color, user_title, user_state, order_month, order_year
# ... delivery_month, delivery_year, item_popularity, brand_popularity
cat_train = df_train[['item_size', 'item_color', 'user_title', 'user_state',
                     'delivery_month', 'delivery_year', 'item_popularity', 'brand_popularity']]

cat_test = df_test[['item_size', 'item_color', 'user_title', 'user_state',
                     'delivery_month', 'delivery_year', 'item_popularity', 'brand_popularity']]

ord_train = pd.DataFrame()
ord_train['order_month'] = df_train['order_month'].astype('category').copy()
ord_train['order_year'] = df_train['order_year'].astype('category').copy()

ord_test = pd.DataFrame()
ord_test['order_month'] = df_test['order_month'].astype('category').copy()
ord_test['order_year'] = df_test['order_year'].astype('category').copy()

cat_train['order_month'] = ord_train.order_month
cat_train['order_year'] = ord_train.order_year

cat_test['order_month'] = ord_test.order_month
cat_test['order_year'] = ord_test.order_year

cat_train = pd.get_dummies(cat_train)
cat_test = pd.get_dummies(cat_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [40]:
df_test.shape

(50000, 15)

In [41]:
df_train.columns

Index(['item_size', 'item_color', 'item_price', 'user_title', 'user_state',
       'return', 'LOC', 'age', 'DT', 'npp', 'order_month', 'order_year',
       'delivery_month', 'delivery_year', 'item_popularity',
       'brand_popularity'],
      dtype='object')

In [42]:
# get continouus values
cont_train = df_train[['item_price', 'LOC', 'age', 'DT', 'npp']].copy()
cont_test = df_test[['item_price', 'LOC', 'age', 'DT', 'npp']].copy()

#standardize them
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(cont_train)
cont_train = pd.DataFrame(sc.transform(cont_train))
cont_test = pd.DataFrame(sc.transform(cont_test))


#join them back together
X_train = pd.concat([cat_train, cont_train], axis=1)
y_train = df_train['return']

X_test = pd.concat([cat_test, cont_test], axis=1)

In [45]:
## Save X_train, y_train and X_test to a file
X_train.to_csv('../data/X_train.csv', index=False)
y_train.to_csv('../data/y_train.csv', index=False)
X_test.to_csv('../data/X_test.csv', index=False)

In [47]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=2000.0, random_state=0)
lr.fit(X_train.values, y_train.values)
y_predict = lr.predict(X_train.values)
error = sum(abs(y_train.values - y_predict))
print('Error: {}'.format(error))

X_train.shape

Error: 37550


(100000, 68)

In [82]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_jobs=-1, random_state=0)
clf.fit(X_train.values, y_train.values)
y_predict = clf.predict(X_train.values)
error = sum(abs(y_train.values - y_predict))
print('Error: {}'.format(error))

X_train.shape

Error: 1470


(100000, 94)

In [83]:
from sklearn.cross_validation import StratifiedKFold
kfold = StratifiedKFold(y=y_train.values, n_folds = 10, random_state = 1)
X_train_val = X_train.values
X_test_val = X_test.values
y_train_val = y_train.values

scores = []
for k, (train, test) in enumerate(kfold):
    clf.fit(X_train_val[train], y_train_val[train])
    score = clf.score(X_train_val[test], y_train_val[test])
    scores.append(score)
    print('Fold {}: Class dist: {},  Acc: {}'.format(k+1, np.bincount(y_train_val[train]), score))

Fold 1: Class dist: [46643 43356],  Acc: 0.6035396460353964
Fold 2: Class dist: [46643 43356],  Acc: 0.6085391460853915
Fold 3: Class dist: [46643 43356],  Acc: 0.6121387861213878
Fold 4: Class dist: [46643 43356],  Acc: 0.5932406759324068
Fold 5: Class dist: [46643 43357],  Acc: 0.6064
Fold 6: Class dist: [46643 43357],  Acc: 0.6137
Fold 7: Class dist: [46644 43357],  Acc: 0.6092609260926093
Fold 8: Class dist: [46644 43357],  Acc: 0.605960596059606
Fold 9: Class dist: [46644 43357],  Acc: 0.606960696069607
Fold 10: Class dist: [46644 43357],  Acc: 0.6031603160316031


In [84]:
scores = []
for k, (train, test) in enumerate(kfold):
    lr.fit(X_train_val[train], y_train_val[train])
    score = lr.score(X_train_val[test], y_train_val[test])
    scores.append(score)
    print('Fold {}: Class dist: {},  Acc: {}'.format(k+1, np.bincount(y_train_val[train]), score))

Fold 1: Class dist: [46643 43356],  Acc: 0.6279372062793721
Fold 2: Class dist: [46643 43356],  Acc: 0.6308369163083691
Fold 3: Class dist: [46643 43356],  Acc: 0.6303369663033697
Fold 4: Class dist: [46643 43356],  Acc: 0.6304369563043696
Fold 5: Class dist: [46643 43357],  Acc: 0.6213
Fold 6: Class dist: [46643 43357],  Acc: 0.6245
Fold 7: Class dist: [46644 43357],  Acc: 0.6277627762776278
Fold 8: Class dist: [46644 43357],  Acc: 0.6216621662166216
Fold 9: Class dist: [46644 43357],  Acc: 0.6312631263126313
Fold 10: Class dist: [46644 43357],  Acc: 0.626962696269627


In [44]:
# simple pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
pipe_lr = Pipeline([['sc', StandardScaler()], ['lr', lr]])

scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X_train_val[train], y_train_val[train])
    score = pipe_lr.score(X_train_val[test], y_train_val[test])
    scores.append(score)
    print('Fold {}: Class dist: {},  Acc: {}'.format(k+1, np.bincount(y_train_val[train]), score))

Fold 1: Class dist: [46643 43356],  Acc: 0.6279372062793721
Fold 2: Class dist: [46643 43356],  Acc: 0.6307369263073692
Fold 3: Class dist: [46643 43356],  Acc: 0.6304369563043696
Fold 4: Class dist: [46643 43356],  Acc: 0.6304369563043696
Fold 5: Class dist: [46643 43357],  Acc: 0.6213
Fold 6: Class dist: [46643 43357],  Acc: 0.6245
Fold 7: Class dist: [46644 43357],  Acc: 0.6277627762776278
Fold 8: Class dist: [46644 43357],  Acc: 0.6216621662166216
Fold 9: Class dist: [46644 43357],  Acc: 0.6312631263126313
Fold 10: Class dist: [46644 43357],  Acc: 0.626962696269627


In [49]:
# Compare LR, against DT and Forest and  Bayes and majority_vote  
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB

lr = LogisticRegression(penalty='l2', C=0.01, random_state=0)
dt = DecisionTreeClassifier(max_depth=3, criterion='entropy', random_state=0)
# knn = KNeighborsClassifier(n_neighbors = 3, p=2, metric='minkowski') -- 
forest = RandomForestClassifier(n_jobs=-1, random_state=0)
bayes = GaussianNB()
maj_vote = VotingClassifier(estimators=[('lr', lr), ('dt', dt),('bayes', bayes), ('forest', forest)], voting='soft', n_jobs=-1)

clf_labels = ['Logistic Regression', 'Decision Tree', 'Forest', 'Bayes', 'Majority']

for clf, label in zip([lr, dt, forest, bayes , maj_vote], clf_labels):
    scores=cross_val_score(
        estimator=clf,
        X = X_train_val,
        y = y_train_val,
        cv = 5,
        scoring='roc_auc'
    )
    print("ROC AUC {} --- {}".format(scores.mean(), label))


ROC AUC 0.679033289114622 --- Logistic Regression
ROC AUC 0.660677857283646 --- Decision Tree
ROC AUC 0.6651053635731221 --- Forest
ROC AUC 0.6436364960476209 --- Bayes
ROC AUC 0.6883574577348286 --- Majority


In [60]:
# Tune LR
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegressionCV

lr = LogisticRegression(random_state=0, n_jobs=-1)


searchCV = LogisticRegressionCV(
    Cs=list(np.power(10.0, np.arange(-10, 10)))
    ,penalty='l2'
    ,scoring='roc_auc'
    ,cv=10
    ,random_state=0
    ,max_iter=10000
    ,fit_intercept=True
    ,solver='newton-cg'
    ,tol=10
)
searchCV.fit(X_train_val, y_train_val)

print ('Max auc_roc:', searchCV.scores_[1].mean(axis=0).max())

# param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# param_grid = [
#     {
#         'C' : param_range,
#         'penalty' : ['l1']
#     },
#     {
#         'C' : param_range,
#         'penalty' : ['l2']
#     }
    
# ]
# gs = GridSearchCV(
#     estimator=lr,
#     param_grid = param_grid,
#     scoring='accuracy',
#     cv='10',
#     n_jobs=-1
# )

# gs.fit(X_train_val, y_train_val)

# print(gs.best_score_)
# print(gs.best_params_)

Max auc_roc: 0.680980139611


In [61]:
searchCV.C_ #use 0.01 as optimal C 

array([ 0.01])

In [62]:
searchCV.coef_

array([[ 0.11623724, -0.05595402, -0.02210602, -0.032771  ,  0.0142638 ,
         0.10295247,  0.10271472, -0.13805486,  0.07888475,  0.2342548 ,
         0.0230545 , -0.00700787,  0.04164554,  0.06954455, -0.594429  ,
         0.07244163, -0.05907957, -0.06065697,  0.01426193, -0.05541369,
        -0.02518371,  0.01077378,  0.25811737, -0.09284657,  0.03369121,
         0.02312665, -0.02789765, -0.03798999,  0.01087656, -0.0517407 ,
        -0.05820065,  0.00567408, -0.05473658, -0.03271303, -0.07998641,
         0.17207705, -0.05804937, -0.01308426,  0.0106908 , -0.04788135,
         0.06149866, -0.18744716, -0.08397894,  0.03303288, -0.00494813,
         0.07477875, -0.05577817,  0.02717924, -0.11548324,  0.10505653,
         0.04039755,  0.00751381,  0.09504469, -0.03066742,  0.13129981,
         0.25661251,  0.34594965,  0.35765922,  0.31206555,  0.0615941 ,
         0.05278493,  0.1449028 ,  0.0988587 ,  0.07062437, -0.24195384,
        -1.61313872,  0.76887847,  0.7908519 , -1.6