In [None]:
##### This Python 3 environment comes with many helpful analytics libraries installed
##### It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
##### For example, here's several helpful packages to load in 

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import category_encoders as ce
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn import linear_model

##### Input data files are available in the "../input/" directory.
##### For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input mount and echo the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

##### Any results you write to the current directory are saved as output.

In [None]:
# Load the competition CSVs: training set, test set and submission template.
train = pd.read_csv("../input/cat-in-the-dat/train.csv")
test = pd.read_csv("../input/cat-in-the-dat/test.csv")
sample_submission = pd.read_csv("../input/cat-in-the-dat/sample_submission.csv")

## Exploratory Data Analysis

In [None]:
# Report the (rows, columns) shape, then preview the first rows.
shape_msg = "Shape of training data: {}".format(train.shape)
print(shape_msg)
train.head()

In [None]:
# Show the second row's values for the five high-cardinality nominal columns.
high_card_cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
train[high_card_cols].iloc[1]

In [None]:
# Try reading one hex value as EBCDIC (cp037) text; undecodable bytes are skipped.
raw_bytes = bytes.fromhex('b6dd5612')
raw_bytes.decode('cp037', 'ignore')


In [None]:
# List every column name in the training frame.
train.columns

We can see that the features of the dataset consist mainly of **binary**, **nominal**, **ordinal** and **cyclical** (day, month) features <br>
<br>
**Binary**: Consists of only two unique values. e.g 1/0, True/False, Yes/No <br>
**Nominal**: Values that do not have any ordering <br>
**Ordinal**: Values with specific ordering <br>
**Cyclical**: Values that are recurrent in nature

#### Distribution of Target


In [None]:
# Bar height is the share of all rows (in %) that fall in each target class.
total_rows = len(train)
ax = sns.barplot(
    data=train,
    x='target',
    y='target',
    estimator=lambda grp: len(grp) / total_rows * 100,
)
ax.set(ylabel="Frequency in %", title="distribution of target")


### Binary Variables <br>
We will first investigate the distribution of 0s and 1s in the Binary variables

In [None]:
bin_col = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']

# One count plot per binary feature on a 3x2 grid; every bar is annotated with
# its share of all training rows.
n_train_rows = len(train)
grid = gridspec.GridSpec(3, 2)
plt.figure(figsize=(15, 20))
for idx, col in enumerate(bin_col):
    ax = plt.subplot(grid[idx])
    sns.countplot(x=col, data=train, ax=ax)
    ax.set_ylabel('Count')
    ax.set_xlabel('Values')
    ax.set_title('{} Distribution'.format(col))
    bar_heights = [bar.get_height() for bar in ax.patches]
    for bar, bar_height in zip(ax.patches, bar_heights):
        ax.text(bar.get_x() + bar.get_width() / 2.,
                bar_height + 3,
                '{:1.2f}%'.format(bar_height / n_train_rows * 100),
                ha="center", fontsize=14)
    # Leave headroom above the tallest bar for its label.
    ax.set_ylim(0, max(bar_heights) * 1.15)

plt.show()

### Preprocessing Binary Variables <br>
Machine learning algorithms work on numerical data; hence, we have to map the character flags in *bin_3* (T/F) and *bin_4* (Y/N) to 1s and 0s

In [None]:
# Translate the character flags to integers: 'T'/'Y' -> 1, 'F'/'N' -> 0.
mapper = {'T': 1, 'F': 0, 'Y': 1, 'N': 0}
for flag_col in ('bin_3', 'bin_4'):
    for frame in (train, test):
        frame[flag_col] = frame[flag_col].map(mapper)

In [None]:
# Check the mapped flag columns now hold only 0/1 counts.
for flag_col in ('bin_3', 'bin_4'):
    print(train[flag_col].value_counts())

### Nominal Variables <br>
We will try to visualize the values and its distribution for the nominal variables

In [None]:
nom_col = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

# Count distinct values per nominal column once, then report them one by one.
unique_counts = train[nom_col].nunique()
for col in nom_col:
    print("Number of unique values for {}: {}".format(col, unique_counts[col]))

We see that *nom_5* to *nom_9* have high cardinality (a lot of unique values). For visualisation purposes, we will just look at the nominal features with fewer than 10 unique values

In [None]:
# One count plot per low-cardinality nominal feature (nom_0..nom_4) on a 3x2
# grid; every bar is annotated with its share of all training rows.
n_train_rows = len(train)
grid = gridspec.GridSpec(3, 2)
plt.figure(figsize=(15, 20))

for idx, col in enumerate(nom_col[:5]):
    ax = plt.subplot(grid[idx])
    sns.countplot(x=col, data=train, ax=ax)
    ax.set_ylabel('Count')
    ax.set_xlabel('Values')
    ax.set_title('{} Distribution'.format(col))
    bar_heights = [bar.get_height() for bar in ax.patches]
    for bar, bar_height in zip(ax.patches, bar_heights):
        ax.text(bar.get_x() + bar.get_width() / 2.,
                bar_height + 3,
                '{:1.2f}%'.format(bar_height / n_train_rows * 100),
                ha="center", fontsize=14)
    # Leave headroom above the tallest bar for its label.
    ax.set_ylim(0, max(bar_heights) * 1.15)

plt.show()

#### One-Hot encoding <br>
We will use one-hot encoding in this case for *nom_0* to *nom_4*. One-hot encoding creates K columns, one per unique value, and assigns each a binary value indicating whether that value is present

In [None]:
# One-hot encode the five low-cardinality nominal columns in both frames.
low_card_nom = nom_col[0:5]
train = pd.get_dummies(train, columns=low_card_nom, prefix=low_card_nom)
test = pd.get_dummies(test, columns=low_card_nom, prefix=low_card_nom)
train.head()

In [None]:
# Inspect the column list after one-hot encoding nom_0..nom_4.
train.columns

#### Nominal Features with high cardinality <br>
Let's look at the *nom_5* to *nom_9* values to find out more about the high-cardinality features. They are mainly made up of alphanumeric strings that don't have any meaning


In [None]:
# # investigate if nom_5 to nom_9 has similar values

# n5 = train[['nom_5']]; n6 = train[['nom_6']]; n7 = train[['nom_7']]; n8 = train[['nom_8']]; n9 = train[['nom_9']]
# def coll(df):
#     df.columns = ['nom']
#     return df
# long = [n5,n6,n7,n8,n9]
# nSum = 0
# for i in long:
#     i = coll(i)
#     nSum += i.nunique()
# print(nSum)

# nom = pd.concat(long,ignore_index=True)
# print(nom.shape)
# print(nom.nunique())    #all are distinct with one another

In this case, i will try to use:
1. **Frequency Encoding**, which uses the occurrence frequency of each string as the feature representation for the nominal features with high cardinality
2. **Decoding hexadecimal**, which decodes the hex values into 1s and 0s, then splits them into separate columns, similar to one-hot encoding

#### Frequency Encoding

In [None]:
# for col in nom_col[5:]:
#     col_name = col+'_freq_encode'
#     fe_train = train.groupby(col).size()/len(train)
#     train[col_name] = train[col].map(fe_train)
#     fe_test = test.groupby(col).size()/len(test)
#     test[col_name] = test[col].map(fe_test)

#### Hashing

In [None]:
import zlib


def _stable_bucket(value, n_buckets=6011):
    """Map ``value`` to a bucket in ``[0, n_buckets)`` using CRC32.

    Python's built-in ``hash()`` on strings is salted per interpreter process
    (hash randomisation), so the encoded features would differ on every kernel
    restart. CRC32 yields the same bucket on every run.
    """
    return zlib.crc32(str(value).encode('utf-8')) % n_buckets


# Add a '<col>_hash_encode' feature for each high-cardinality nominal column.
for col in nom_col[5:]:
    col_name = col+'_hash_encode'
    train[col_name] = train[col].apply(_stable_bucket)
    test[col_name] = test[col].apply(_stable_bucket)

#### Target Encoding and Decoding Hexadecimal values

In [None]:
from category_encoders import TargetEncoder

# Target-encode each high-cardinality nominal column.
# The encoder is fitted on the training column only and then applied to the
# matching test column. The original loop always encoded 'nom_5', which made
# every '<col>_target' feature an identical copy of the nom_5 encoding.
encoder = TargetEncoder()
for col in nom_col[5:]:
    colname = col+'_target'
    train[colname] = encoder.fit_transform(train[col], train['target'])
    test[colname] = encoder.transform(test[col])

In [None]:
#function to decode hexa to binary
def hex_to_bin(val, num_bits=36):
    """Convert a hexadecimal string to a zero-padded binary string.

    Parameters
    ----------
    val : str
        Hexadecimal digits, e.g. ``'b6dd5612'``.
    num_bits : int, default 36
        Minimum width of the result; shorter values are left-padded with '0'.

    Returns
    -------
    str
        The value written in base 2, at least ``num_bits`` characters long.
    """
    return bin(int(val, 16))[2:].zfill(num_bits)

#split bin values into individual columns
def decode_(df, cols=None):
    """Return ``df`` with one integer 0/1 column per bit of each hex column.

    For every column in ``cols`` the hex string is expanded with
    ``hex_to_bin`` and each bit is stored in a new column named
    ``'<col>_decode_<position>'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the hex-string columns. It is not modified.
    cols : list of str, optional
        Columns to decode. Defaults to ``nom_col[5:]``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the bit columns.
    """
    if cols is None:
        cols = nom_col[5:]

    pieces = [df]
    for col in cols:
        bit_rows = [[int(bit) for bit in hex_to_bin(val)] for val in df[col]]
        # Reuse df's index so the bit columns line up row by row even when the
        # frame does not carry a default 0..n-1 index.
        bit_frame = pd.DataFrame(bit_rows, index=df.index)
        bit_frame.columns = ['{}_decode_{}'.format(col, pos) for pos in bit_frame.columns]
        pieces.append(bit_frame)

    return pd.concat(pieces, axis=1)

# Expand the hex columns into bit columns for both frames (train first).
train, test = decode_(train), decode_(test)

In [None]:
# The raw nom_5..nom_9 columns are no longer needed once encoded.
hex_nominals = nom_col[5:]
train = train.drop(columns=hex_nominals)
test = test.drop(columns=hex_nominals)

### Ordinal Variable


In [None]:
ord_col = ['ord_0','ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
# Number of distinct categories in each ordinal feature.
train.loc[:, ord_col].nunique()

#### Ordinal_0

In [None]:
# Frequency of each distinct ord_0 value.
train['ord_0'].value_counts()

*ord_0* features are already labeled in an ordinal manner, so we can skip the encoding

 #### Ordinal_1 to Ordinal_4

In [None]:
# Frequency of each level in the two named ordinal features.
for level_col in ('ord_1', 'ord_2'):
    print(train[level_col].value_counts())

According to the data, *ord_1* and *ord_2* are named levels, so we map them by hand to their rank. *ord_3* and *ord_4* are made up of ordered ASCII characters, so we encode them as their respective ASCII codes

In [None]:
# Hand-written rank for the named levels of ord_1 and ord_2.
map1 = {'Novice':1, 'Contributor':2, 'Expert':3, 'Master':4, 'Grandmaster':5}
map2 = {'Freezing':1, 'Cold':2, 'Warm':3, 'Hot':4, 'Boiling Hot':5, 'Lava Hot':6}
# map3 = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 
#                 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15}
# map4 = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8, 
#                 'I': 9, 'J': 10, 'K': 11, 'L': 12, 'M': 13, 'N': 14, 'O': 15,
#                 'P': 16, 'Q': 17, 'R': 18, 'S': 19, 'T': 20, 'U': 21, 'V': 22, 
#                 'W': 23, 'X': 24, 'Y': 25, 'Z': 26}

# ord_1/ord_2 use the rank maps; ord_3/ord_4 use each character's code point.
for frame in (train, test):
    frame['ord_1_encode'] = frame['ord_1'].map(map1)
    frame['ord_2_encode'] = frame['ord_2'].map(map2)
    frame['ord_3_encode'] = frame['ord_3'].apply(ord)
    frame['ord_4_encode'] = frame['ord_4'].apply(ord)

#### Ordinal_5 <br>
The intuition here is that the ordering of the values follows their ASCII code numbers.<br>
In this case, we split the 2-char string into individual characters and encode each character with its ASCII code (a combined, weighted encoding of the two codes is left commented out below)

In [None]:
def split(word):
    """Return the characters of ``word`` joined by commas, e.g. 'ab' -> 'a,b'."""
    return ','.join(word)

# Rewrite each ord_5 value as its comma-separated characters.
for frame in (train, test):
    frame['ord_5'] = frame['ord_5'].apply(split)

In [None]:
# Spread the comma-separated characters over two new columns.
ord5_parts = ['ord_5_1', 'ord_5_2']
train[ord5_parts] = train['ord_5'].str.split(',', expand=True)
test[ord5_parts] = test['ord_5'].str.split(',', expand=True)

In [None]:
# Encode each ord_5 character as its code point, first character then second.
for frame in (train, test):
    for part in ('ord_5_1', 'ord_5_2'):
        frame[part + '_encode'] = frame[part].apply(ord)


#combine the two codes into one number (1st char weighted x100, i.e. higher weightage)
# train['ord_5_encode'] = (train['ord_5_1_encode'] * 100) + train['ord_5_2_encode']
# test['ord_5_encode'] = (test['ord_5_1_encode'] * 100) + test['ord_5_2_encode']

### Cyclical Variables <br>
https://www.kaggle.com/avanwyk/encoding-cyclical-features-for-deep-learning

In [None]:
# train['day_encode_sin'] = np.sin(2 * np.pi * train['day']/7.0)
# train['day_encode_cos'] = np.cos(2 * np.pi * train['day']/7.0)
# train['month_encode_sin'] = np.sin(2 * np.pi * train['month']/12.0)
# train['month_encode_cos'] = np.cos(2 * np.pi * train['month']/12.0)

# test['day_encode_sin'] = np.sin(2 * np.pi * test['day']/7.0)
# test['day_encode_cos'] = np.cos(2 * np.pi * test['day']/7.0)
# test['month_encode_sin'] = np.sin(2 * np.pi * test['month']/12.0)
# test['month_encode_cos'] = np.cos(2 * np.pi * test['month']/12.0)

cyclical_cols = ['day', 'month']
train = pd.get_dummies(train, columns=cyclical_cols, prefix=cyclical_cols, sparse=True)   #one-hot apparently works better than using cyclical encoding
test = pd.get_dummies(test, columns=cyclical_cols, prefix=cyclical_cols, sparse=True)

In [None]:
# Inspect the column list after one-hot encoding day and month.
train.columns

## Preprocessing

In [None]:
# Keep the test ids for the submission file before dropping the column.
testid = test['id']
# Raw columns removed from both frames.
# NOTE(review): 'bin_0' is dropped here as well -- confirm that is intended.
unencoded = ['id','bin_0','ord_1','ord_2','ord_3','ord_4','ord_5','ord_5_1','ord_5_2']
train = train.drop(columns=unencoded)
test = test.drop(columns=unencoded)

In [None]:
# Show which columns remain after the drop, then preview the first rows.
print(train.columns)
train.head()

### Feature Scaling (Min-Max)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the code-point features; the scaler is fitted on the training frame
# only and then applied to both frames.
scale_col = ['ord_3_encode','ord_4_encode','ord_5_1_encode','ord_5_2_encode']
scaler = MinMaxScaler().fit(train[scale_col])

for frame in (train, test):
    frame[scale_col] = scaler.transform(frame[scale_col])

In [None]:
# Final (rows, columns) size of the training frame after feature engineering.
train.shape

## Model Training

In [None]:
# Separate the label column from the feature matrix.
y = train['target']
X = train.drop(columns=['target'])

In [None]:
# Split train into a training and a validation set (70/30).
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts. stratify=y keeps the target class ratio
# the same in both parts.
trainX, testX, trainy, testy = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training part only; the validation part stays untouched.
# Seeding SMOTE makes the synthetic samples reproducible across runs.
oversample = SMOTE(random_state=42)
trainX, trainy = oversample.fit_resample(trainX, trainy)

In [None]:
# reduce density of matrix by introducing sparsity
# X = X.to_sparse()
# X = X.sparse.to_coo()
# X = X.astype(int)
# X = X.tocsr()

In [None]:
# Shapes of the training and validation features/labels, in that order.
for part in (trainX, trainy, testX, testy):
    print(part.shape)

In [None]:
# kf=StratifiedKFold(n_splits=10)

# def objective(trial):
#     C=trial.suggest_loguniform('C', 10e-10, 1)
#     model=linear_model.LogisticRegression(C=C, class_weight='balanced',max_iter=2000, solver='lbfgs', n_jobs=-1)
#     score=-cross_val_score(model, trainX, trainy, cv=kf, scoring='roc_auc').mean()
#     return score
# study=optuna.create_study()

In [None]:
#study.optimize(objective, n_trials=10)

In [None]:
# print(study.best_params)
# print(-study.best_value)
# params=study.best_params

### Logistic Regression <br>
Apparently performs on-par or sometimes even better than other complex models


In [None]:
# scores = []
# glm = linear_model.LogisticRegression(C=0.1,solver="liblinear",tol=0.00001, max_iter=10000)
# cv = KFold(n_splits=10, random_state=42)
# for tr_idx, val_idx in cv.split(trainX):
#     train_x = trainX.iloc[tr_idx]
#     train_y = trainy.iloc[tr_idx]
#     val_x = trainX.iloc[val_idx]
#     val_y = trainy.iloc[val_idx]

#     glm.fit(train_x,train_y)
#     y_pred_lr=glm.predict_proba(train_x)
#     scores.append(roc_auc_score(train_y, y_pred_lr[:,1]))

In [None]:

# Logistic regression with the liblinear solver.
glm = linear_model.LogisticRegression(
    C=0.095,
    solver="liblinear",
    tol=0.00001,
    max_iter=10000,
)
glm.fit(X=trainX, y=trainy)

In [None]:
# ROC AUC on the training part, scored on the positive-class probability.
y_pred_lr = glm.predict_proba(trainX)
train_pos_proba = y_pred_lr[:, 1]
roc_auc_score(trainy, train_pos_proba)

In [None]:
# ROC AUC on the held-out validation part.
y_pred_lr = glm.predict_proba(testX)
valid_pos_proba = y_pred_lr[:, 1]
roc_auc_score(testy, valid_pos_proba)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# predict_proba returns one column per class. Only the positive-class column
# is thresholded, so y_pred is a 1-D vector of 0/1 labels. Thresholding the
# whole (n, 2) array would give a 2-D indicator matrix, which confusion_matrix
# and classification_report reject when compared with the binary testy.
y_preds = glm.predict_proba(testX)
y_pred = np.where(y_preds[:, 1] > 0.5, 1, 0)
conf = confusion_matrix(testy, y_pred)
plt.imshow(conf, cmap='binary', interpolation='None')
plt.show()

print(classification_report(testy, y_pred))


In [None]:
# plot_confusion_matrix was removed from newer scikit-learn releases. Fall back
# to ConfusionMatrixDisplay.from_estimator under the same name so this cell and
# the later cells that call plot_confusion_matrix keep working on either version.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Draw the estimator's confusion matrix on (X, y_true)."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Row-normalised confusion matrix of the logistic regression on validation data.
disp = plot_confusion_matrix(glm, testX, testy,
                            cmap=plt.cm.Blues,
                            normalize='true')



In [None]:
# Score the competition test set and write id + positive-class probability.
sub = glm.predict_proba(test)
positive_proba = sub[:, 1]
submission = pd.DataFrame({'id': testid, 'target': positive_proba})
submission.to_csv('submission.csv', index=False)

### Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Linear discriminant analysis with default settings.
lda = LDA()
lda.fit(X=trainX, y=trainy)

In [None]:
# ROC AUC of LDA on the training part.
y_pred_lda = lda.predict_proba(trainX)
lda_train_pos = y_pred_lda[:, 1]
roc_auc_score(trainy, lda_train_pos)

In [None]:
# ROC AUC of LDA on the held-out validation part.
y_pred_lda = lda.predict_proba(testX)
lda_valid_pos = y_pred_lda[:, 1]
roc_auc_score(testy, lda_valid_pos)

In [None]:
# Row-normalised confusion matrix of LDA on the validation part.
disp = plot_confusion_matrix(
    lda, testX, testy,
    normalize='true',
    cmap=plt.cm.Blues,
)

In [None]:
from sklearn.ensemble import AdaBoostClassifier as ada

# AdaBoost with default settings.
boost = ada()
boost.fit(X=trainX, y=trainy)

In [None]:
# ROC AUC of AdaBoost on the held-out validation part.
y_pred_ada = boost.predict_proba(testX)
ada_valid_pos = y_pred_ada[:, 1]
roc_auc_score(testy, ada_valid_pos)

In [None]:
# Row-normalised confusion matrix of AdaBoost on the validation part.
disp = plot_confusion_matrix(
    boost, testX, testy,
    normalize='true',
    cmap=plt.cm.Blues,
)

In [None]:
from sklearn.naive_bayes import ComplementNB

# Complement naive Bayes with default settings, scored by validation ROC AUC.
nb = ComplementNB()
nb.fit(X=trainX, y=trainy)

y_pred_nb = nb.predict_proba(testX)
nb_valid_pos = y_pred_nb[:, 1]
roc_auc_score(testy, nb_valid_pos)

In [None]:
# Row-normalised confusion matrix of naive Bayes on the validation part.
disp = plot_confusion_matrix(
    nb, testX, testy,
    normalize='true',
    cmap=plt.cm.Blues,
)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# Base learners: fresh copies of the three model types fitted above.
stacked_glm = LogisticRegression(C=0.095, solver="liblinear", tol=0.00001, max_iter=10000)
estimator = [
    ('lda', LDA()),
    ('boost', ada()),
    ('glm', stacked_glm),
]

# A small MLP combines the base learners' predicted probabilities.
meta_learner = MLPClassifier(hidden_layer_sizes=(50,))
clf = StackingClassifier(
    estimators=estimator,
    final_estimator=meta_learner,
    stack_method='predict_proba',
)
clf.fit(trainX, trainy)

In [None]:
# ROC AUC of the stacked model on the held-out validation part.
y_pred_clf = clf.predict_proba(testX)
clf_valid_pos = y_pred_clf[:, 1]
roc_auc_score(testy, clf_valid_pos)