In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler


%matplotlib inline

In [2]:
ss = pd.read_csv('SampleSubmission_XkIpo3X.csv')

train = pd.read_csv('Train_eP48B9k.csv')
test = pd.read_csv('Test_jPKyvmK.csv')

train.head()

Unnamed: 0,id,customer_age,job_type,marital,education,default,balance,housing_loan,personal_loan,communication_type,day_of_month,month,last_contact_duration,num_contacts_in_campaign,days_since_prev_campaign_contact,num_contacts_prev_campaign,prev_campaign_outcome,term_deposit_subscribed
0,id_43823,28.0,management,single,tertiary,no,285.0,yes,no,unknown,26,jun,303.0,4.0,,0,unknown,0
1,id_32289,34.0,blue-collar,married,secondary,no,934.0,no,yes,cellular,18,nov,143.0,2.0,132.0,1,other,0
2,id_10523,46.0,technician,married,secondary,no,656.0,no,no,cellular,5,feb,101.0,4.0,,0,unknown,0
3,id_43951,34.0,services,single,secondary,no,2.0,yes,no,unknown,20,may,127.0,3.0,,0,unknown,0
4,id_40992,41.0,blue-collar,married,primary,no,1352.0,yes,no,cellular,13,may,49.0,2.0,,0,unknown,0


In [3]:
df = pd.concat([train, test], axis=0).reset_index(drop = True)
df.shape

(45211, 18)

In [4]:
ID_COL, TARGET_COL = 'id', 'term_deposit_subscribed'
features = [c for c in train.columns if c not in [ID_COL, TARGET_COL]]

cat_cols = ['job_type',
 'marital',
 'education',
 'default',
 'housing_loan',
 'personal_loan',
 'communication_type',
 'month',
 'prev_campaign_outcome']

num_cols = [c for c in features if c not in cat_cols]

def join_df(train, test):

  df = pd.concat([train, test], axis=0).reset_index(drop = True)
  features = [c for c in df.columns if c not in [ID_COL, TARGET_COL]]

  return df, features

def split_df_and_get_features(df, train_nrows):

  train, test = df[:train_nrows].reset_index(drop = True), df[train_nrows:].reset_index(drop = True)
  features = [c for c in train.columns if c not in [ID_COL, TARGET_COL]]
  
  return train, test, features

cat_cols

['job_type',
 'marital',
 'education',
 'default',
 'housing_loan',
 'personal_loan',
 'communication_type',
 'month',
 'prev_campaign_outcome']

In [5]:
df = pd.get_dummies(df, columns = cat_cols)

In [6]:
df, features = join_df(train, test)

In [7]:
df.select_dtypes('object').columns.tolist()

['id',
 'job_type',
 'marital',
 'education',
 'default',
 'housing_loan',
 'personal_loan',
 'communication_type',
 'month',
 'prev_campaign_outcome']

In [8]:
df, features = join_df(train, test)

### Label Encoding
df[cat_cols] = df[cat_cols].apply(lambda x: pd.factorize(x)[0])

#### No NaN filling required.

In [9]:
def get_frequency_features(df, cols):

  df[[c + '_freq' for c in cols]] = df[cols].apply(lambda x: x.map(x.value_counts()))
  
  return df

In [10]:
freq_cols = [c for c in df.columns if (train[c].nunique() > 10) and (train[c].nunique() < 50)]
freq_cols

['job_type',
 'day_of_month',
 'month',
 'num_contacts_in_campaign',
 'num_contacts_prev_campaign']

In [11]:
df = df.fillna(df.mean())
df.isnull().sum().sum()

0

In [12]:
def download_preds(preds_test, file_name = 'hacklive_sub.csv'):

  ## 1. Setting the target column with our obtained predictions
  ss[TARGET_COL] = preds_test

  ## 2. Saving our predictions to a csv file

  ss.to_csv(file_name, index = False)


In [13]:
df

Unnamed: 0,id,customer_age,job_type,marital,education,default,balance,housing_loan,personal_loan,communication_type,day_of_month,month,last_contact_duration,num_contacts_in_campaign,days_since_prev_campaign_contact,num_contacts_prev_campaign,prev_campaign_outcome,term_deposit_subscribed
0,id_43823,28.000000,0,0,0,0,285.0,0,0,0,26,0,303.0,4.0,224.577692,0,0,0.000000
1,id_32289,34.000000,1,1,1,0,934.0,1,1,1,18,1,143.0,2.0,132.000000,1,1,0.000000
2,id_10523,46.000000,2,1,1,0,656.0,1,0,1,5,2,101.0,4.0,224.577692,0,0,0.000000
3,id_43951,34.000000,3,0,1,0,2.0,0,0,0,20,3,127.0,3.0,224.577692,0,0,0.000000
4,id_40992,41.000000,1,1,2,0,1352.0,0,0,1,13,3,49.0,2.0,224.577692,0,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,id_42406,29.000000,0,0,0,0,717.0,0,0,1,15,0,94.0,1.0,224.577692,0,0,0.107246
45207,id_14483,40.442579,1,1,1,0,604.0,0,0,2,10,6,155.0,1.0,224.577692,0,0,0.107246
45208,id_43066,45.000000,1,1,2,0,237.0,0,0,0,16,3,231.0,4.0,224.577692,0,0,0.107246
45209,id_18375,52.000000,6,1,2,0,241.0,0,0,1,13,3,243.0,2.0,299.000000,5,2,0.107246


In [14]:
df['month_and_day_of_month'] = pd.factorize(df['month'].astype('str') + df['day_of_month'].astype('str'))[0]
df = get_frequency_features(df, ['month_and_day_of_month'])

df['mean_age_for_customer_on_month_and_day_of_month'] = df.groupby('month_and_day_of_month')['customer_age'].transform('mean')
train_proc, test_proc, features = split_df_and_get_features(df, train.shape[0])

In [34]:
def scheduler(epoch):
    if epoch < 10:
        return 0.0001
    else:
        return 0.0001 * tf.math.exp(0.1 * (10 - epoch))

trn, val = train_test_split(train_proc, test_size=0.2, random_state = 1, stratify = train_proc[TARGET_COL])

###### Input to our model will be the features
X_trn, X_val = trn[features], val[features]

###### Output of our model will be the TARGET_COL
y_trn, y_val = trn[TARGET_COL], val[TARGET_COL]

##### Features for the test data that we will be predicting
X_test = test_proc[features]


scaler = StandardScaler()
_ = scaler.fit(X_trn)

X_trn = scaler.transform(X_trn)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

X_trn[:,0]

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1,activation='sigmoid')
])


model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

lr_callback = tf.keras.callbacks.LearningRateScheduler(scheduler)
model.fit(X_trn, y_trn, epochs=50, callbacks=[lr_callback])
model.evaluate(X_val, y_val)

model.summary()

model.save('my_model.h5')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_5 (Flatten)          (None, 19)                0         
_________________________________________________________________
dense_39 (Dense)             (None, 256)               5120      
_________________________________________________________

In [35]:
X_val

array([[-0.9808284 , -0.32465708,  0.27905359, ..., -0.20902905,
        -0.48141769,  0.24761597],
       [ 1.36918048,  1.72599201,  0.27905359, ..., -0.22715311,
        -0.24531816, -0.18714019],
       [-0.60482698, -0.66643193,  0.27905359, ..., -1.15147993,
         1.97549306, -0.51202164],
       ...,
       [ 0.61717764, -0.66643193,  0.27905359, ..., -0.37214555,
         2.4329359 , -0.29694706],
       [ 0.61717764, -0.32465708,  0.27905359, ..., -0.04591256,
        -0.78023116, -0.23867248],
       [-0.60482698,  0.01711777,  0.27905359, ..., -0.55338611,
         0.61792075, -0.87179073]])

In [36]:
preds_val = model.predict(X_val)

f1_score(y_val, preds_val.round())

0.6272912423625254

In [37]:
preds_test = model.predict(X_test)

download_preds(preds_test.round(), file_name='hacklive_DNN_21.csv')