## Simple Baseline script
* Uses CatBoost (Has built in embedding support for categoricals, such as the string columns)
* Not compared with OneHot encoding handling of string/categorical columns yet, or xgboost, lightgbm (the later can also handle categoricals natively). 

    *Good luck!

* Target: *is_female*

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.

In [2]:
from catboost import CatBoostClassifier
# from xgboost import XGBClassifier
import lightgbm as lgb

TARGET = "is_female"

#### Lots of columns!
* Some are strings, some are boolean or of very low cardinality ( 3-7 unique values). 
* Lot's of NaNs. 

In [3]:
df = pd.read_csv("train.csv",low_memory=False)
print(df.shape)
df.head()

(18255, 1235)


Unnamed: 0,train_id,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,...,GN1,GN1_OTHERS,GN2,GN2_OTHERS,GN3,GN3_OTHERS,GN4,GN4_OTHERS,GN5,GN5_OTHERS
0,0,3,32,3.0,,323011,3854,481,1975,1,...,99.0,,99,,99,,99,,99,
1,1,2,26,,8.0,268131,2441,344,1981,1,...,,,1,,2,,2,,2,
2,2,1,16,,7.0,167581,754,143,1995,1,...,1.0,,2,,2,,2,,2,
3,3,4,44,5.0,,445071,5705,604,1980,1,...,,,2,,2,,99,,99,
4,4,4,43,,6.0,436161,5645,592,1958,1,...,,,1,,1,,1,,1,


### Note: Test, Train have different ID columns!
* ordering reset to 0 for each. 
* Best to drop unless a useful leak is identified (But then more annoying to output test set predictions, if train has different # columns). 
    * Ignore for now

In [4]:
test = pd.read_csv("test.csv",low_memory=False)
print(test.shape)

(27285, 1234)


#### Lots of columns which look to be based on survey responses/multiple choice questions.
* in this case, Nulls may be the result of picking a question choice, vs not being answered. Requires digging into the data to understand how it should be addressed case by case. 
    * In short: missing value imputation may be damaging!  

In [5]:
df.isnull().sum()

train_id                0
AA3                     0
AA4                     0
AA5                 12602
AA6                  5653
AA7                     0
AA14                    0
AA15                    0
DG1                     0
is_female               0
DG3                     0
DG3A                    0
DG3A_OTHERS         18205
DG4                     0
DG4_OTHERS          18255
DG5_1                   0
DG5_2                   0
DG5_3                   0
DG5_4                   0
DG5_5                   0
DG5_6                   0
DG5_7                   0
DG5_8                   0
DG5_9                   0
DG5_10                  0
DG5_11                  0
DG5_96                  0
DG6                     0
DG8a                    0
DG8b                    0
                    ...  
FB28_2_OTHERS       18253
FB28_3_OTHERS       18255
FB28_4_OTHERS       18253
FB28_96_OTHERS      18254
FB29_1                  0
FB29_2                  0
FB29_3                  0
FB29_4      

## Examine Non numeric columns:
* Clear possibilities here to get less sparse features: "column is not NaN", or "sum(notNaN)(col) for col in (cols begginning with DL[0-9]"... 

In [6]:
# https://stackoverflow.com/questions/25039626/how-do-i-find-numeric-columns-in-pandas
df.select_dtypes(exclude=[np.number])

Unnamed: 0,DG3A_OTHERS,DG13_OTHERS,DG14_OTHERS,DL1_OTHERS,DL2_23_OTHERS,DL2_96_OTHERS,DL4_OTHERS,DL12_OTHERS,DL28_OTHERS,G2P1_OTHERS,...,FB28_4_OTHERS,FB28_96_OTHERS,FB29_OTHERS,LN2_RIndLngBEOth,LN2_WIndLngBEOth,GN1_OTHERS,GN2_OTHERS,GN3_OTHERS,GN4_OTHERS,GN5_OTHERS
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,Bengali,Bengali,,,,,
2,,,,,,,,,,Samajvadi pension,...,,,,Hindi,Hindi,,,,,
3,,,,,,,,,,,...,,,,Tamil,Tamil,,,,,
4,,,,,,,,,,,...,,,,Malayalam,Malayalam,,,,,
5,,,,,,,,,,,...,,,,Chattisgari,Chattisgari,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,Telugu,Telugu,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,Marathi,Marathi,,,,,


In [7]:
category_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
print(len(category_cols))

96


### convert categorical columns to integers
* Our test set has categoricals not seen in train - this must be handled. For now, we'll get the categoricals from train and test together, ensuring we "see"/encode" all the categoricals

In [8]:
# Save # rows in train/test. (We could do this directly, but this is easier to debug if needed)
TR_ROWS = df.shape[0]
print(df.shape)
print(test.shape)

(18255, 1235)
(27285, 1234)


In [9]:
df_all = pd.concat([df,test])
df_all.shape

(45540, 1236)

## I get errors with LGB and other when using the Categorical datatype: we'll turn it back into Integers to avoid this

In [10]:
for header in category_cols:
#     df[header] = df[header].astype('category').cat.codes
#     test[header] = test[header].astype('category').cat.codes
    df_all[header] = df_all[header].astype('category').cat.codes.astype('int')
    df_all[header] = pd.to_numeric(df_all[header])
    
    df[header] = df[header].astype('category').cat.codes
    df[header] = pd.to_numeric(df[header])
    test[header] = test[header].astype('category').cat.codes
    test[header] = pd.to_numeric(test[header])

In [11]:
df.shape

(18255, 1235)

#### Split back into train and test

In [12]:
df =df_all.iloc[0:TR_ROWS]
test =df_all.iloc[TR_ROWS:]

df.drop(['test_id'],axis=1,inplace=True)
test.drop(['train_id',"is_female"],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


### Naive initial feature engineering:
* NaNS per row
(Could also be done for groups of columns, or to sum 0/1s.. (

In [13]:
df["row_nulls"] = df.isnull().sum(axis=1)
test["row_nulls"] = test.isnull().sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Build a model
* Note: CatBoost in particular has a LOT of hyperparams (it's even worse than LightGBM in this regard). IT's essential to experiment with them if you want to get decent results. 
* This is my first time using it, so assume my hyperparameters are terrible. 
* Tuning should use a seperate train/validation set split first to select hyperparams. 

* For low dimensional categoricals (e.g. <20 unique vars) - there's no benefit in embedding (Catboost/lightGBM) vs simply leaving it as a number or one hot encoding. 

In [14]:
X = df.drop([TARGET],axis=1)# .select_dtypes(include=[np.number]) #.values
Y = df[TARGET]

In [17]:
### Optional Train/Validation split for test hyperparams. 
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [15]:
from preprocess2 import *
X,test = preprocess(X, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test[header] = test[header].astype('category').cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test[header] = pd.to_numeric(test[header])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [84]:
from sklearn.metrics import confusion_matrix
def classify(clf,x_n_train, y_n_train, x_n_test,y_n_test,test, useall = False):
    if useall:
        clf.fit(x_n_train, y_n_train)
#         test_n = test.reindex(columns = x_n_train.columns)
        test_n = test
    else:
        clf.fit(x_n_train, y_n_train)

        y_pred = clf.predict(x_n_test)

        cm = confusion_matrix(y_n_test, y_pred)
        print (cm)

        y_prob = clf.predict_proba(x_n_test)
        print(metrics.roc_auc_score(y_n_test, y_prob[:,1]))

        test_n = test.reindex(columns = x_n_test.columns)

    y_pred_f1 = clf.predict(test_n)
    y_prob = clf.predict_proba(test_n)
    return y_prob

In [18]:
cat_dims = [X.columns.get_loc(i) for i in category_cols[:-1]]  # categorical columns indexes

# train default catBoost classifier. Default loss metric is LogLoss (lower is better)
clf = CatBoostClassifier(eval_metric="AUC", one_hot_max_size=3, iterations=2)
from sklearn import metrics
# clf.fit(X,Y , cat_features=cat_dims)
classify(clf,X_train, y_train, X_test,y_test,test, useall = False)

0:	learn: 0.9219430	total: 141ms	remaining: 141ms
1:	learn: 0.9272174	total: 235ms	remaining: 0us
[[2132  416]
 [ 350 2579]]
0.928431406179


array([[ 0.45940806,  0.54059194],
       [ 0.51654069,  0.48345931],
       [ 0.45414245,  0.54585755],
       ..., 
       [ 0.53469114,  0.46530886],
       [ 0.48194774,  0.51805226],
       [ 0.49402819,  0.50597181]])

## Currently there's an error where when creating predictions - need to debug

In [24]:
res = clf.predict_proba(test)

In [25]:
res

array([[ 0.52319765,  0.47680235],
       [ 0.55015584,  0.44984416],
       [ 0.54251844,  0.45748156],
       ..., 
       [ 0.54251844,  0.45748156],
       [ 0.54251844,  0.45748156],
       [ 0.55015584,  0.44984416]])

## Additional models
* Train performance is dangeorusly misleading without an external validation set. 
* This is just a starter for models
* Should also check for overfitting (requires validation set split)

In [23]:
from selfea import *

In [65]:
# DataInit = InitData()

train = DataInit.get_train(False)
test = DataInit.get_test(False)

X_new = DataInit.get_train(True)
Y_new = DataInit.get_labels()
T_new = DataInit.get_test(True)
# T_new = T_new.reindex(columns = X_new)

In [48]:
len(test[])

27285

In [66]:
train2, test2 = preprocess(train, test)

In [67]:
fea = DataInit.get_features()

In [68]:
train2[fea] = X_new

In [70]:
process_data(test)

In [71]:
test2[fea] = test[fea]

In [81]:
train2.columns

Index(['train_id', 'AA3', 'AA4', 'AA5', 'AA6', 'AA7', 'AA14', 'AA15', 'DG1',
       'is_female',
       ...
       'MT1A_f', 'DL2_new', 'G2P1_11_new', 'GN2_new', 'FF14_6_new',
       'FF14_5_new', 'FF14_4_new', 'FF14_3_new', 'MT18_5_new', 'MT18_4_new'],
      dtype='object', length=1246)

In [73]:
len(test)

27285

In [60]:
test2 = test[fea]

In [52]:
T_new

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

TypeError: unhashable type: 'numpy.ndarray'

In [5]:
# 
ytrain = train2.is_female
xtrain = train2.drop(["is_female"],axis = 1)
# train.is_female

In [85]:
# increase # iterations when model debugged
clf2 = CatBoostClassifier(eval_metric="AUC", one_hot_max_size=6,
                          iterations=50,depth=8,learning_rate=0.04, rsm=0.8)

# clf2.fit(xtrain,ytrain)
# res2 = clf2.predict_proba(test2)
# classify(clf2,X_train, y_train, X_test,y_test,test, useall = False)
res5 = classify(clf2,train2.drop([TARGET],axis=1).values,train2.is_female, None,None,test2,True)

0:	learn: 0.9172374	total: 164ms	remaining: 8.04s
1:	learn: 0.9414040	total: 326ms	remaining: 7.83s
2:	learn: 0.9441676	total: 497ms	remaining: 7.79s
3:	learn: 0.9458347	total: 660ms	remaining: 7.59s
4:	learn: 0.9454476	total: 828ms	remaining: 7.45s
5:	learn: 0.9498443	total: 994ms	remaining: 7.29s
6:	learn: 0.9499815	total: 1.17s	remaining: 7.17s
7:	learn: 0.9507401	total: 1.34s	remaining: 7.03s
8:	learn: 0.9515597	total: 1.52s	remaining: 6.92s
9:	learn: 0.9512022	total: 1.68s	remaining: 6.72s
10:	learn: 0.9528523	total: 1.85s	remaining: 6.58s
11:	learn: 0.9549246	total: 2.02s	remaining: 6.38s
12:	learn: 0.9558251	total: 2.18s	remaining: 6.2s
13:	learn: 0.9561767	total: 2.33s	remaining: 6s
14:	learn: 0.9561315	total: 2.48s	remaining: 5.78s
15:	learn: 0.9563126	total: 2.64s	remaining: 5.61s
16:	learn: 0.9569005	total: 2.81s	remaining: 5.45s
17:	learn: 0.9571982	total: 2.97s	remaining: 5.28s
18:	learn: 0.9577399	total: 3.13s	remaining: 5.11s
19:	learn: 0.9586100	total: 3.31s	remaining: 

In [86]:
res5

array([[ 0.62021403,  0.37978597],
       [ 0.57584973,  0.42415027],
       [ 0.67222974,  0.32777026],
       ..., 
       [ 0.52816241,  0.47183759],
       [ 0.60896794,  0.39103206],
       [ 0.67664623,  0.32335377]])

In [24]:
write_file("sub2.csv",res5)

## Simple LightGBM model:
* https://github.com/Microsoft/LightGBM/issues/1096

In [28]:
# we still ahve object datatypes columns..
df.dtypes.value_counts()

float64    910
int64      240
object      96
dtype: int64

In [20]:
# categorical/objects cols:
print(df.select_dtypes(exclude=[np.number]).columns.tolist())
# df.select_dtypes(exclude=[np.number]).value_counts()

['DG4_OTHERS', 'FB28_3_OTHERS', 'G2P2_10_OTHERS', 'G2P2_12_OTHERS', 'G2P2_15_OTHERS', 'G2P2_2_OTHERS', 'MM11_11_OTHERS', 'MM11_5_OTHERS', 'MM15_OTHERS', 'MM38_OTHERS', 'MT13_4_OTHERS', 'MT13_96_OTHERS', 'MT14_3_OTHERS', 'MT14_5_OTHERS', 'MT14_7_OTHERS']


In [74]:
# lgb_train = lgb.Dataset(
# #             data=LabelEncoder().fit_transform(train_df.brand_name).reshape(-1, 1),
#     data=df.drop([TARGET],axis=1).select_dtypes(include=[np.number]).values,
#     label = df[TARGET],
# #     categorical_feature=cat_dims   
#         )

lgb_train = lgb.Dataset(
#             data=LabelEncoder().fit_transform(train_df.brand_name).reshape(-1, 1),
    data=train2.drop([TARGET],axis=1).values,
    label = train2[TARGET],
#     categorical_feature=cat_dims   
        )

In [75]:
# https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
t4_params = {
    'boosting_type': 'gbdt', 'objective': 'binary', 'nthread': -1, 'silent': False,
    'num_leaves': 2**4, 'learning_rate': 0.05, 'max_depth': 11,
    'max_bin': 255, 
    'subsample': 0.8, 'subsample_freq': 1, 'colsample_bytree': 0.75, 
#     'early_stopping_round' : 10,
    'min_split_gain': 0.5, 'min_child_samples': 4}

clf_lgb = lgb.train(t4_params,lgb_train)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


In [76]:
res_lgbm = clf_lgb.predict(test2)

In [77]:
len(res_lgbm)

27285

In [78]:
res_lgbm

array([ 0.40402583,  0.67728182,  0.45158917, ...,  0.7333295 ,
        0.19231664,  0.46090632])

In [32]:
write_file("sub3.csv",res_lgbm)

IndexError: invalid index to scalar variable.

#### Once more predictions work, can join and get mean of predictions = simple blending ensemble

In [33]:
# test["lgbm_preds"]=res_lgbm
test["is_female"]=res_lgbm

In [16]:
print(res2[:,1])

[ 0.62025603  0.61955426  0.53572687 ...,  0.52719749  0.51435788
  0.52155349]


In [15]:
print(res_lgbm)

NameError: name 'res_lgbm' is not defined

In [42]:
test["is_female"] = res5

ValueError: Wrong number of items passed 2, placement implies 1

In [34]:
test["test_id"] = test["test_id"].astype(int)

In [35]:
test[["test_id","is_female"]].to_csv("submission6.csv",index=False)

In [28]:
# preds = pd.DataFrame(columns=[test["test_id"].copy(),res_lgbm]
# preds = test["test_id"].copy()
# preds["is_female"] = res_lgbm# ensemble/mean of others at this points

In [29]:
# preds.head()

In [30]:
# preds.to_csv("submission.csv.gz",index=False,compression="gzip")