Here we will train a baseline model.
We will first load the test file, this is important because when we deal with messy data we need to make sure that the columns in the train file exist at prediction time!

In [1]:
import pandas as pd
import numpy as np

In [2]:
test_df = pd.read_pickle("data/clean/test_df.pickle")

In [3]:
test_df.columns

Index(['id', 'orderportalid', 'orderdate_gmt', 'designer', 'style', 'shipper',
       'shiptypeid', 'userid', 'isvip', 'country', 'region', 'ddprate',
       'countrycode', 'hasusedwishlist', 'isreseller', 'hasitemsonbag',
       'tierafterorder', 'tierbeforeorder', 'isusingmultipledevices',
       'userfraudstatus'],
      dtype='object')

In [4]:
corr = test_df.corr()
corr[corr>0.99]

Unnamed: 0,orderportalid,designer,style,shipper,shiptypeid,userid,country,region,ddprate,countrycode,userfraudstatus
orderportalid,1.0,,,,,,,,,,
designer,,1.0,0.999711,,,,,,,,
style,,0.999711,1.0,,,,,,,,
shipper,,,,1.0,,,,,,,
shiptypeid,,,,,1.0,,,,,,
userid,,,,,,1.0,,,,,
country,,,,,,,1.0,,,0.999818,
region,,,,,,,,1.0,,,
ddprate,,,,,,,,,1.0,,
countrycode,,,,,,,0.999818,,,1.0,


*Note*, you can also use [pandas-profiling!](https://github.com/pandas-profiling/pandas-profiling)

In [5]:
#from pandas_profiling import ProfileReport
#report = ProfileReport(test_df)
#report

We will remove  the following columns because they are highly correlated with other columns:
- countrycode (0.999918 corr with country)
- style (0.99971 corr with designer)

In [6]:
test_df.head()

Unnamed: 0,id,orderportalid,orderdate_gmt,designer,style,shipper,shiptypeid,userid,isvip,country,region,ddprate,countrycode,hasusedwishlist,isreseller,hasitemsonbag,tierafterorder,tierbeforeorder,isusingmultipledevices,userfraudstatus
0,3f15fccff2058ced0fdac7cb6718a656,341282,2018-02-23 16:50:58.403000+00:00,140484,140515,1,3,29264.0,VIP,9,3,0.0,9,No,No,No,T1,T1,No,4
1,cfb2173ff64d6b7440a47e71de9b4731,341283,2018-02-23 16:51:13.510000+00:00,128189,128219,4,2,5424.0,Not VIP,48,4,0.0,48,Yes,Yes,No,VIP,VIP,Yes,6
2,3368f22dd34bf95e53395dcdfe2bcdb6,341283,2018-02-23 16:51:13.510000+00:00,128189,128219,4,2,5424.0,Not VIP,48,4,0.0,48,Yes,Yes,No,VIP,VIP,Yes,6
3,cf433b79db5396f9812586c04b2cdd45,341284,2018-02-23 16:51:45.596000+00:00,128949,128979,2,2,121201.0,VIP,17,4,24.3652,17,Yes,No,Yes,VIP,VIP,No,4
4,9976b334d2fe399c2a2b8709e42078ec,341284,2018-02-23 16:51:45.596000+00:00,137342,137373,2,2,121201.0,VIP,17,4,24.3652,17,Yes,No,Yes,VIP,VIP,No,4


`test_df.orderdate_gmt` is a datetime, we can parse it in later iterations but for now we remove it.

In [7]:
test_df = test_df.drop(columns=['countrycode', 'orderdate_gmt', 'style'])

Now we load the training data that we previously cleaned.

In [8]:
train_df = pd.read_pickle('data/clean/train_df_merged.pkl')

In [9]:
#train_report = ProfileReport(train_df)
#train_report

We evaluate cardinality of categoricals.

In [10]:
cardinality = train_df.describe(exclude=np.number).T

In [11]:
cardinality

Unnamed: 0,count,unique,top,freq
brand,543341,2616,87,19211
category_1stlevel,542687,33,Clothing,278172
country,543341,328,1,94021
countrycode,543341,327,1,94021
countryoforigin,543341,374,1,216945
ddprate,543341,1482,0.0,347593
ddpsubcategory,497997,127,"Jerseys, pullovers, cardigans, waistcoats and ...",59037
designer,543341,142692,2264,1574
freereturn,543341,4,1,471193
hasitemsonbag,542584,2,No,298293


We see the column `ddprate` is a numerical (top value is 0.0) but it's not considered a category.

In [12]:
train_df.ddprate.value_counts(True)

0.0        0.639733
5.0083     0.076127
0.0        0.064433
24.3652    0.013999
33.3833    0.012541
15.4853    0.012440
28.7029    0.009427
35.7236    0.009134
5.0083     0.009110
5.008      0.008155
5.8485     0.007428
17.0858    0.006320
42.2351    0.005917
34.1055    0.004566
7.6715     0.004261
31.0431    0.004194
12.9314    0.003755
5.1665     0.003580
14.5035    0.003550
12.6688    0.003162
16.2855    0.002717
19.1182    0.002567
37.4837    0.002363
19.4557    0.002087
26.3626    0.001986
20.0       0.001804
40.404     0.001647
43.2407    0.001542
46.557     0.001410
24.3652    0.001382
             ...   
46.5879    0.000002
37.5889    0.000002
12.7452    0.000002
30.4892    0.000002
11.4175    0.000002
28.4613    0.000002
38.0336    0.000002
18.8374    0.000002
14.8314    0.000002
17.8786    0.000002
44.7903    0.000002
11.7463    0.000002
11.9865    0.000002
10.229     0.000002
15.6533    0.000002
29.12      0.000002
35.9539    0.000002
11.84      0.000002
34.4537    0.000002


So it seems like it's a number, but somehow got defined as a categorical. Also we see there are two different kinds of 0.0?

In [13]:
train_df.ddprate.value_counts().head().index.values

array(['0.0', '5.0083', 0.0, '24.3652', '33.3833'], dtype=object)

In [14]:
train_df.ddprate = pd.to_numeric(train_df.ddprate, errors='coerce')

In [15]:
cardinality[cardinality.unique>100000]

Unnamed: 0,count,unique,top,freq
designer,543341,142692,2264,1574
orderdate_gmt,543341,347163,2018-01-10 15:28:51.970000+00:00,94
orderportalid,543341,367897,83553,86
productid,543341,166878,109557,866
style,543341,169984,2267,1477
userid,542584,262910,2247,564


We see there are a bunch of columns that are almost ids, we should use a hashingencoder to deal with them but for now we remove them.

In [16]:
train_df = train_df.drop(columns=['orderportalid', 'userid', 'style', 'productid',
                                  'orderdate_gmt', 'designer'])

In [17]:
target_name = 'returned'
y = train_df[target_name]
X = train_df.drop(columns=[col for col in train_df if col not in test_df.columns])

For the moment we will remove `orderportalid` since it has high cardinality.

In [18]:
X.describe(exclude=np.number).T

Unnamed: 0,count,unique,top,freq
country,543341,328,1,94021
hasitemsonbag,542584,2,No,298293
hasusedwishlist,542584,2,Yes,393562
isreseller,542584,2,No,507561
isusingmultipledevices,542584,2,No,336439
isvip,542584,4,Not VIP,424596
region,543341,14,3,199540
shipper,543341,6,2,359441
shiptypeid,543341,45,2,381650
tierafterorder,505189,5,T4,231316


We save it for later.

In [19]:
X.assign(returned=y).reset_index().to_csv('train.csv', index=False)

# Baseline Model

In [20]:
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [22]:
print(X_train.shape)

(489006, 13)


In [23]:
pipeline = make_pipeline(
 OneHotEncoder(verbose=1),
    SimpleImputer(),
    LogisticRegression(verbose=1)
)

For the first run let's just train on a sample, to make it faster.

In [24]:
pipeline.fit(X_train.head(100000), y_train.values[:100000])



[LibLinear]

Pipeline(memory=None,
         steps=[('onehotencoder',
                 OneHotEncoder(cols=['country', 'hasitemsonbag',
                                     'hasusedwishlist', 'isreseller',
                                     'isusingmultipledevices', 'isvip',
                                     'region', 'shipper', 'shiptypeid',
                                     'tierafterorder', 'tierbeforeorder',
                                     'userfraudstatus'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', return_df=True,
                               use_cat_names=False, verbose...
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                  

In [25]:
preds = pipeline.predict(X_test)

In [26]:
from sklearn.metrics import roc_auc_score

In [27]:
roc_auc_score(preds, y_test)

0.8339974200772695

In [28]:
X_test.shape

(54335, 13)

We load the test data.

In [29]:
test_df = pd.read_pickle('data/clean/test_df.pickle')
test_ids = test_df.id.values

In [30]:
test_df = test_df[[col for col in test_df.columns  if col in X.columns ]]

In [31]:
test_df.columns

Index(['shipper', 'shiptypeid', 'isvip', 'country', 'region', 'ddprate',
       'hasusedwishlist', 'isreseller', 'hasitemsonbag', 'tierafterorder',
       'tierbeforeorder', 'isusingmultipledevices', 'userfraudstatus'],
      dtype='object')

In [32]:
test_preds = pipeline.predict_proba(test_df)[:,1]

In [33]:
baseline = pd.DataFrame({
    "id":test_ids,
    "returned":  test_preds
    
})

In [34]:
baseline.to_csv('baseline.csv', index=False)

We got ROCAUC of 0.68 on the leaderboard with 100k rows. Let's try with the whole dataset.

In [35]:
# This takes a while to run
pipeline.fit(X_train, y_train)



[LibLinear]

Pipeline(memory=None,
         steps=[('onehotencoder',
                 OneHotEncoder(cols=['country', 'hasitemsonbag',
                                     'hasusedwishlist', 'isreseller',
                                     'isusingmultipledevices', 'isvip',
                                     'region', 'shipper', 'shiptypeid',
                                     'tierafterorder', 'tierbeforeorder',
                                     'userfraudstatus'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', return_df=True,
                               use_cat_names=False, verbose...
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='mean',
                               verbose=0)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                  

In [36]:
preds = pipeline.predict(X_test)

In [37]:
roc_auc_score(preds, y_test)

0.83405822486589

In [38]:
test_preds2 = pipeline.predict_proba(test_df)[:,1]

In [39]:
baseline2 = pd.DataFrame({
    "id":test_ids,
    "returned":  test_preds2
    
})
baseline2.to_csv('baseline.2.csv', index=False)

In [40]:
print('done')

done


This got a 0.773!

We save the test df just in case

In [41]:
test_df['id'] = test_ids
test_df.to_csv('test.csv', index=False)