In [96]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from tensorflow.keras.models import load_model
from sklearn.model_selection import GroupShuffleSplit

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [97]:
# Load the pre-computed feature tables for the training and test orders.
# Missing values in both tables are replaced by the sentinel -1.
data_test1 = pd.read_csv('my_final/test_data.csv').fillna(-1)

data_train1 = pd.read_csv('my_final/data.csv')
# The target column is taken before the NaN fill, exactly as read from disk.
labels_train = data_train1['reordered']
data_train1 = data_train1.fillna(-1)

In [98]:
# Columns removed before modelling: the CSV index artefact, the order id,
# the eval_set markers, the product name and the target itself.
non_feature_cols = ['Unnamed: 0', 'order_id', 'eval_set_x', 'eval_set_y', 'reordered', 'product_name']

data_train = data_train1.drop(columns=non_feature_cols)
data_test = data_test1.drop(columns=non_feature_cols)

# Independent copies used as model inputs / target by the cells below.
X_train, y_train = data_train.copy(), labels_train.copy()
X_cv = data_test.copy()

## Logistic Regression 

In [74]:
# Fit an unweighted logistic regression and label every test row.
LR_Model = LogisticRegression(n_jobs=-1)
LR_Model.fit(X_train, y_train)
pred_Xcv = LR_Model.predict(X_cv)
data_test1['pred'] = pred_Xcv

# Keep the rows predicted as reordered and collect their product ids per order.
test = data_test1.copy()
test = test.loc[test['pred'] == True, ['order_id', 'product_id', 'pred']]
test = test.groupby('order_id').agg({'product_id': list, 'pred': list})
test['products'] = test['product_id'].apply(lambda ids: ' '.join(map(str, ids)))

# Outer-merge against every test order_id so orders without any predicted
# product still get a row; their missing 'products' value becomes 'None'.
all_orders = data_test1[['order_id']].drop_duplicates(subset='order_id', keep="last")
here = test[['products']].merge(all_orders, on='order_id', how='outer')
here = here.fillna('None')
here.to_csv('my_final/LRSub.csv', index=False)

## Weighted Logistic Regression

In [75]:
# Same pipeline as the plain logistic regression, but with class_weight="balanced".
LR_Model = LogisticRegression(class_weight="balanced", n_jobs=-1)
LR_Model.fit(X_train, y_train)
pred_Xcv = LR_Model.predict(X_cv)
data_test1['pred'] = pred_Xcv

# Keep the rows predicted as reordered and collect their product ids per order.
test = data_test1.copy()
test = test.loc[test['pred'] == True, ['order_id', 'product_id', 'pred']]
test = test.groupby('order_id').agg({'product_id': list, 'pred': list})
test['products'] = test['product_id'].apply(lambda ids: ' '.join(map(str, ids)))

# Outer-merge against every test order_id so orders without any predicted
# product still get a row; their missing 'products' value becomes 'None'.
all_orders = data_test1[['order_id']].drop_duplicates(subset='order_id', keep="last")
here = test[['products']].merge(all_orders, on='order_id', how='outer')
here = here.fillna('None')
here.to_csv('my_final/LRWeighSub.csv', index=False)

## Naive Bayes

In [76]:
# Gaussian naive Bayes baseline on the same feature matrix.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred_Xcv = gnb.predict(X_cv)
data_test1['pred'] = pred_Xcv

# Keep the rows predicted as reordered and collect their product ids per order.
test = data_test1.copy()
test = test.loc[test['pred'] == True, ['order_id', 'product_id', 'pred']]
test = test.groupby('order_id').agg({'product_id': list, 'pred': list})
test['products'] = test['product_id'].apply(lambda ids: ' '.join(map(str, ids)))

# Outer-merge against every test order_id so orders without any predicted
# product still get a row; their missing 'products' value becomes 'None'.
all_orders = data_test1[['order_id']].drop_duplicates(subset='order_id', keep="last")
here = test[['products']].merge(all_orders, on='order_id', how='outer')
here = here.fillna('None')
here.to_csv('my_final/NBSub.csv', index=False)

## Decision Trees

In [77]:
# Single decision tree with balanced class weights.
clf = DecisionTreeClassifier(class_weight='balanced')
clf.fit(X_train, y_train)
pred_Xcv = clf.predict(X_cv)
data_test1['pred'] = pred_Xcv

# Keep the rows predicted as reordered and collect their product ids per order.
test = data_test1.copy()
test = test.loc[test['pred'] == True, ['order_id', 'product_id', 'pred']]
test = test.groupby('order_id').agg({'product_id': list, 'pred': list})
test['products'] = test['product_id'].apply(lambda ids: ' '.join(map(str, ids)))

# Outer-merge against every test order_id so orders without any predicted
# product still get a row; their missing 'products' value becomes 'None'.
all_orders = data_test1[['order_id']].drop_duplicates(subset='order_id', keep="last")
here = test[['products']].merge(all_orders, on='order_id', how='outer')
here = here.fillna('None')
here.to_csv('my_final/DTSub.csv', index=False)

## Random Forest

In [78]:
# Random forest with balanced class weights, trained on all available cores.
rand_clf = RandomForestClassifier(class_weight='balanced', n_jobs=-1)
rand_clf.fit(X_train, y_train)
pred_Xcv = rand_clf.predict(X_cv)
data_test1['pred'] = pred_Xcv

# Keep the rows predicted as reordered and collect their product ids per order.
test = data_test1.copy()
test = test.loc[test['pred'] == True, ['order_id', 'product_id', 'pred']]
test = test.groupby('order_id').agg({'product_id': list, 'pred': list})
test['products'] = test['product_id'].apply(lambda ids: ' '.join(map(str, ids)))

# Outer-merge against every test order_id so orders without any predicted
# product still get a row; their missing 'products' value becomes 'None'.
all_orders = data_test1[['order_id']].drop_duplicates(subset='order_id', keep="last")
here = test[['products']].merge(all_orders, on='order_id', how='outer')
here = here.fillna('None')
here.to_csv('my_final/RFSub.csv', index=False)

## Gradient Boosting with XGBoost

In [79]:
# Baseline XGBoost run: logistic objective, log-loss reported on the training set.
xgb_params = {"objective": "reg:logistic", "eval_metric": "logloss"}
num_round = 20

d_train = xgb.DMatrix(X_train, y_train)
watchlist = [(d_train, "train")]
bst = xgb.train(
    params=xgb_params,
    dtrain=d_train,
    num_boost_round=num_round,
    evals=watchlist,
    verbose_eval=10,
)

# Scores at or above 0.16 are treated as a predicted reorder.
pred_Xcv = bst.predict(xgb.DMatrix(X_cv))
pred_Xcv = [bool(score >= 0.16) for score in pred_Xcv]
data_test1['pred'] = pred_Xcv

# Keep the rows predicted as reordered and collect their product ids per order.
test = data_test1.copy()
test = test.loc[test['pred'] == True, ['order_id', 'product_id', 'pred']]
test = test.groupby('order_id').agg({'product_id': list, 'pred': list})
test['products'] = test['product_id'].apply(lambda ids: ' '.join(map(str, ids)))

# Outer-merge against every test order_id so orders without any predicted
# product still get a row; their missing 'products' value becomes 'None'.
all_orders = data_test1[['order_id']].drop_duplicates(subset='order_id', keep="last")
here = test[['products']].merge(all_orders, on='order_id', how='outer')
here = here.fillna('None')
here.to_csv('my_final/GBSub.csv', index=False)

<p>Judging by the F1 scores, the best model is Gradient Boosting with XGBoost. So first I will apply Gradient Boosting to the autoencoder features to check whether an autoencoder is worth using at all, and then I will do the hyperparameter tuning on the model.</p>

## Trying autoencoder features with Gradient Boosting

#### Autoencoder with Normalization 

In [80]:
# XGBoost on autoencoder features (encoder saved as 'autoencoderwithNorm.h5').
# The feature table is split in two: the columns in `passthrough_cols` are kept
# as-is, every other feature column is run through the saved encoder, and the
# two parts are merged back together row by row before boosting.

data_train1 = pd.read_csv('my_final/data.csv')
labels_train1 = data_train1['reordered']
# filling NaN values as -1
data_train1 = data_train1.fillna(-1)

data_test1 = pd.read_csv('my_final/test_data.csv')
# filling NaN values as -1
data_test1 = data_test1.fillna(-1)

# Columns that are never model inputs, and columns that bypass the encoder.
non_feature_cols = ['Unnamed: 0', 'order_id', 'eval_set_x', 'eval_set_y', 'reordered', 'product_name']
passthrough_cols = ['user_id', 'product_id', 'order_number', 'order_dow', 'order_hour_of_day',
                    'days_since_prior_order', 'aisle_id', 'department_id']

data_train = data_train1.drop(non_feature_cols + passthrough_cols, axis=1)
data_test = data_test1.drop(non_feature_cols + passthrough_cols, axis=1)
X_trainEncod = data_train.copy()
# Use the labels read in THIS cell (labels_train1) instead of `labels_train`,
# which only exists if an earlier cell happened to be run in the same kernel.
y_trainEncod = labels_train1.copy()
X_cvEncod = data_test.copy()

# the encoder we trained
# NOTE(review): the file name suggests the encoder was trained on normalized
# inputs, but no normalization is applied here before predict() -- confirm the
# normalization is part of the saved model.
encoder = load_model('autoencoderwithNorm.h5')

# transforming our data
X_trainEncod = encoder.predict(X_trainEncod)
X_cvEncod = encoder.predict(X_cvEncod)

X_trainMerge = data_train1[passthrough_cols]
X_CVMerge = data_test1[passthrough_cols]

# Encoded columns are named 0..k-1; k is read from the encoder output instead
# of being hard-coded, so a differently sized encoder still works.
X_trainEncod = pd.DataFrame(data=X_trainEncod, columns=list(range(X_trainEncod.shape[1])))
X_trainMerge = X_trainMerge.reset_index(drop=True)
# Stored under its own name: overwriting the global X_train here would leave
# later cells training on encoded features while predicting on the raw X_cv.
X_trainAE = X_trainMerge.merge(X_trainEncod, left_index=True, right_index=True)

X_cvEncod = pd.DataFrame(data=X_cvEncod, columns=list(range(X_cvEncod.shape[1])))
X_CVMerge = X_CVMerge.reset_index(drop=True)
X_CV = X_CVMerge.merge(X_cvEncod, left_index=True, right_index=True)

# Gradient Boosting on the passthrough + encoded features
xgb_params = {
    "objective"         : "reg:logistic"
    ,"eval_metric"      : "logloss"

}
num_round = 15
d_train = xgb.DMatrix(X_trainAE, y_trainEncod)
watchlist = [(d_train, "train")]
bst = xgb.train(params=xgb_params, dtrain=d_train, num_boost_round=num_round, evals=watchlist, verbose_eval=10)
pred_Xcv = bst.predict(xgb.DMatrix(X_CV))
# Scores at or above 0.16 are treated as a predicted reorder.
arr = [True if i >= 0.16 else False for i in pred_Xcv]

data_test1['pred'] = arr
test = data_test1.copy()
test = test[test['pred'] == True]
test = test[['order_id', 'product_id', 'pred']].groupby('order_id').agg({'product_id': list, 'pred': list})
test['products'] = test['product_id'].apply(lambda x: ' '.join(str(a) for a in x))
# Outer-merge against every test order_id so orders without any predicted
# product still get a row; their missing 'products' value becomes 'None'.
here = test[['products']].merge(data_test1[['order_id']].drop_duplicates(subset='order_id', keep="last"), on='order_id', how='outer')
here = here.fillna('None')
here.to_csv('my_final/GBAutoEnodWithNormSub.csv', index=False)

[0]	train-logloss:0.52407
[10]	train-logloss:0.29213
[14]	train-logloss:0.28840


### Autoencoder without Normalization 

In [84]:
# Autoencoder without Normalization

# XGBoost on autoencoder features (encoder saved as 'autoencoder_withoutNorm.h5').
# The feature table is split in two: the columns in `passthrough_cols` are kept
# as-is, every other feature column is run through the saved encoder, and the
# two parts are merged back together row by row before boosting.

data_train1 = pd.read_csv('my_final/data.csv')
labels_train1 = data_train1['reordered']
# filling NaN values as -1
data_train1 = data_train1.fillna(-1)

data_test1 = pd.read_csv('my_final/test_data.csv')
# filling NaN values as -1
data_test1 = data_test1.fillna(-1)

# Columns that are never model inputs, and columns that bypass the encoder.
non_feature_cols = ['Unnamed: 0', 'order_id', 'eval_set_x', 'eval_set_y', 'reordered', 'product_name']
passthrough_cols = ['user_id', 'product_id', 'order_number', 'order_dow', 'order_hour_of_day',
                    'days_since_prior_order', 'aisle_id', 'department_id']

data_train = data_train1.drop(non_feature_cols + passthrough_cols, axis=1)
data_test = data_test1.drop(non_feature_cols + passthrough_cols, axis=1)
X_trainEncod = data_train.copy()
# Use the labels read in THIS cell (labels_train1) instead of `labels_train`,
# which only exists if an earlier cell happened to be run in the same kernel.
y_trainEncod = labels_train1.copy()
X_cvEncod = data_test.copy()

# the encoder we trained
encoder = load_model('autoencoder_withoutNorm.h5')

# transforming our data
X_trainEncod = encoder.predict(X_trainEncod)
X_cvEncod = encoder.predict(X_cvEncod)

X_trainMerge = data_train1[passthrough_cols]
X_CVMerge = data_test1[passthrough_cols]

# Encoded columns are named 0..k-1; k is read from the encoder output instead
# of being hard-coded, so a differently sized encoder still works.
X_trainEncod = pd.DataFrame(data=X_trainEncod, columns=list(range(X_trainEncod.shape[1])))
X_trainMerge = X_trainMerge.reset_index(drop=True)
# Stored under its own name: overwriting the global X_train here would leave
# later cells training on encoded features while predicting on the raw X_cv.
X_trainAE = X_trainMerge.merge(X_trainEncod, left_index=True, right_index=True)

X_cvEncod = pd.DataFrame(data=X_cvEncod, columns=list(range(X_cvEncod.shape[1])))
X_CVMerge = X_CVMerge.reset_index(drop=True)
X_CV = X_CVMerge.merge(X_cvEncod, left_index=True, right_index=True)

# Gradient Boosting on the passthrough + encoded features
xgb_params = {
    "objective"         : "reg:logistic"
    ,"eval_metric"      : "logloss"

}
num_round = 15
d_train = xgb.DMatrix(X_trainAE, y_trainEncod)
watchlist = [(d_train, "train")]
bst = xgb.train(params=xgb_params, dtrain=d_train, num_boost_round=num_round, evals=watchlist, verbose_eval=10)
pred_Xcv = bst.predict(xgb.DMatrix(X_CV))
# Scores at or above 0.16 are treated as a predicted reorder.
arr = [True if i >= 0.16 else False for i in pred_Xcv]

data_test1['pred'] = arr
test = data_test1.copy()
test = test[test['pred'] == True]
test = test[['order_id', 'product_id', 'pred']].groupby('order_id').agg({'product_id': list, 'pred': list})
test['products'] = test['product_id'].apply(lambda x: ' '.join(str(a) for a in x))
# Outer-merge against every test order_id so orders without any predicted
# product still get a row; their missing 'products' value becomes 'None'.
here = test[['products']].merge(data_test1[['order_id']].drop_duplicates(subset='order_id', keep="last"), on='order_id', how='outer')
here = here.fillna('None')
here.to_csv('my_final/GBAutoEnodWithoutNormSub.csv', index=False)

[0]	train-logloss:0.52333
[10]	train-logloss:0.29256
[14]	train-logloss:0.28910


<p>The encoded features don't perform as well as the actual features, so it is better to use the actual features instead of the encoded ones.</p>

## Gradient Boosting with Best Parameters

In [89]:
# XGBoost with the tuned hyper-parameters, trained on the ACTUAL (non-encoded)
# features.
#
# The autoencoder cells above reassign the global X_train to the encoded
# feature matrix while X_cv keeps the raw test features, so relying on the
# ambient X_train would train and predict on different feature sets when the
# notebook is run top to bottom. The raw training matrix is therefore rebuilt
# here from data_train1 (the NaN-filled training table).
X_train = data_train1.drop(['Unnamed: 0', 'order_id', 'eval_set_x', 'eval_set_y', 'reordered', 'product_name'], axis=1)
assert list(X_train.columns) == list(X_cv.columns), \
    "train and test feature columns differ; re-run the feature preparation cell"

xgb_params = {
    "objective"         : "reg:logistic"
    ,"eval_metric"      : "logloss"
    ,"max_depth"        : 6
    ,"subsample"        :0.8
    ,"colsample_bytree" :0.8
    ,"alpha"            :5
    ,"lambda"           :1
}
num_round = 120

d_train = xgb.DMatrix(X_train, y_train)
watchlist = [(d_train, "train")]
bst = xgb.train(params=xgb_params, dtrain=d_train, num_boost_round=num_round, evals=watchlist, verbose_eval=10)
pred_Xcv = bst.predict(xgb.DMatrix(X_cv))
# Scores at or above 0.16 are treated as a predicted reorder.
pred_Xcv = [True if i >= 0.16 else False for i in pred_Xcv]
data_test1['pred'] = pred_Xcv
test = data_test1.copy()
test = test[test['pred'] == True]
test = test[['order_id', 'product_id', 'pred']].groupby('order_id').agg({'product_id': list, 'pred': list})
test['products'] = test['product_id'].apply(lambda x: ' '.join(str(a) for a in x))
# Outer-merge against every test order_id so orders without any predicted
# product still get a row; their missing 'products' value becomes 'None'.
here = test[['products']].merge(data_test1[['order_id']].drop_duplicates(subset='order_id', keep="last"), on='order_id', how='outer')
here = here.fillna('None')
here.to_csv('my_final/GBBestSub.csv', index=False)

[0]	train-logloss:0.51082
[10]	train-logloss:0.25091
[20]	train-logloss:0.24593
[30]	train-logloss:0.24484
[40]	train-logloss:0.24409
[50]	train-logloss:0.24349
[60]	train-logloss:0.24297
[70]	train-logloss:0.24260
[80]	train-logloss:0.24224
[90]	train-logloss:0.24194
[100]	train-logloss:0.24166
[110]	train-logloss:0.24141
[119]	train-logloss:0.24119


#### Kaggle Private and Public Scores for Test data with Different Models 
<img src='my_final/Capture.PNG'>

### Simple Neural Network

In [116]:
##imports
from tensorflow.keras.layers import Input, Dense, Activation, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import TensorBoard
from tensorflow_addons.metrics import F1Score
import datetime

# Train on the ACTUAL (non-encoded) features. The autoencoder cells reassign
# the global X_train to the encoded matrix while X_cv keeps the raw test
# features, so the raw training matrix is rebuilt here from data_train1 to make
# the cell independent of which cell last touched X_train.
X_train = data_train1.drop(['Unnamed: 0', 'order_id', 'eval_set_x', 'eval_set_y', 'reordered', 'product_name'], axis=1)
assert X_train.shape[1] == X_cv.shape[1], \
    "train and test feature counts differ; re-run the feature preparation cell"

# Probability cut-off used for the submission, and also for the F1 metric
# logged during training.
pred_threshold = 0.16

##create the NN: input -> Dense(50, relu) -> Dropout(0.1) -> Dense(25, relu) -> sigmoid
# Input width follows the feature matrix instead of a hard-coded 76.
x_in = Input(shape=(X_train.shape[1],))
x_dense1 = Dense(50, activation="relu")(x_in)
x_dropout = Dropout(0.1)(x_dense1)
x_dense2 = Dense(25, activation="relu")(x_dropout)
x_out = Dense(1, activation='sigmoid')(x_dense2)

model = Model(inputs=x_in, outputs=x_out)
model.summary()

# Without an explicit threshold F1Score falls back to an arg-max rule, which on
# a single sigmoid output marks every sample positive and makes the logged F1
# meaningless; pass the decision threshold explicitly.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[F1Score(num_classes=1, threshold=pred_threshold), 'accuracy'])


log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Labels are converted to 0/1 integers for the binary cross-entropy loss.
model.fit(X_train, np.where(y_train.values == True, 1, 0), epochs=10, batch_size=3000, verbose=1, callbacks=[tensorboard_callback])

pred = model.predict(X_cv, verbose=1, batch_size=1000)
# model.predict returns one single-element row per sample; each row is compared
# against the cut-off and reduced to a plain bool.
pred_Xcv = [True if i >= pred_threshold else False for i in pred]
data_test1['pred'] = pred_Xcv
test = data_test1.copy()
test = test[test['pred'] == True]
test = test[['order_id', 'product_id', 'pred']].groupby('order_id').agg({'product_id': list, 'pred': list})
test['products'] = test['product_id'].apply(lambda x: ' '.join(str(a) for a in x))
# Outer-merge against every test order_id so orders without any predicted
# product still get a row; their missing 'products' value becomes 'None'.
here = test[['products']].merge(data_test1[['order_id']].drop_duplicates(subset='order_id', keep="last"), on='order_id', how='outer')
here = here.fillna('None')
here.to_csv('temp_fold/NNSub1.csv', index=False)

Model: "functional_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 76)]              0         
_________________________________________________________________
dense_21 (Dense)             (None, 50)                3850      
_________________________________________________________________
dropout_7 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_22 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_23 (Dense)             (None, 1)                 26        
Total params: 5,151
Trainable params: 5,151
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x241fbec6c18>

#### Kaggle Private and Public Scores for NN
<img src='my_final/NN_kaggle.PNG'>