In [7]:
%tensorflow_version 2.x
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras import regularizers


## Read the data set
df = pd.read_csv("https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv", na_values=['NA', '?'])


## Generate dummies for job
df = pd.concat([df, pd.get_dummies(df['job'], prefix='job')], axis=1)
df.drop('job', axis=1, inplace=True)


## Generate dummies for area
df = pd.concat([df, pd.get_dummies(df['area'], prefix='area')], axis=1)
df.drop('area', axis=1, inplace=True)


## Missing values for income
df['income'] = df['income'].fillna(df['income'].median())


## Standardizing data
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['age'] = zscore(df['age'])
df['save_rate'] = zscore(df['save_rate'])
df['subscriptions'] = zscore(df['subscriptions'])


## Making input and target data
x_headers = df.columns.drop('product').drop('id')
x = df[x_headers].values
dummies = pd.get_dummies(df['product'])
y = dummies.values


## K_Fold Cross-Validate
kf = KFold(n_splits=5, shuffle=True, random_state=42)

oos_y = []
oos_pred = []
fold = 0

for train, test in kf.split(x):
  fold+=1
  print(f"Fold #{fold}")

  # Preparing training and test set
  x_train = x[train]
  y_train = y[train]
  x_test = x[test]
  y_test = y[test]

  # Building model
  model = Sequential()
  model.add(Dense(50, input_dim=x.shape[1], activation='relu'))
  model.add(Dropout(0.5))
  model.add(Dense(25, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(1e-4)))
  model.add(Dense(y.shape[1], activation='softmax'))

  model.compile(loss='categorical_crossentropy', optimizer='adam')
  model.fit(x_train,y_train,validation_data=(x_test,y_test),verbose=0,epochs=500)

  pred = model.predict(x_test)
  expected_y = np.argmax(pred, axis=1)
  true_y = np.argmax(y_test, axis=1)
  oos_pred.append(expected_y)
  oos_y.append(true_y)

  score = metrics.accuracy_score(true_y, expected_y)
  print(f"Fold{fold} accuracy is : {score}")

## Build the oos prediction list and calculate the accuracy
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = metrics.accuracy_score(oos_y, oos_pred)
print("---------------------------------------------------------------------------------------")
print(f"Final, out of sample score (Accuracy) is : {score}")    
print("---------------------------------------------------------------------------------------")

## Write the cross-validated prediction
oos_y = pd.DataFrame({"oos_y":oos_y})
oos_pred = pd.DataFrame({"oos_pred":oos_pred})
oosDF = pd.concat( [df, oos_y, oos_pred],axis=1 )
display(oos_y)
display(oos_pred)
display(oosDF)


Fold #1
Fold1 accuracy is : 0.685
Fold #2
Fold2 accuracy is : 0.7475
Fold #3
Fold3 accuracy is : 0.7075
Fold #4
Fold4 accuracy is : 0.6925
Fold #5
Fold5 accuracy is : 0.6675
---------------------------------------------------------------------------------------
Final, out of sample score (Accuracy) is : 0.7
---------------------------------------------------------------------------------------


Unnamed: 0,oos_y
0,1
1,1
2,2
3,1
4,1
...,...
1995,1
1996,2
1997,1
1998,1


Unnamed: 0,oos_pred
0,2
1,1
2,2
3,2
4,1
...,...
1995,2
1996,2
1997,1
1998,1


Unnamed: 0,id,income,aspect,subscriptions,dist_healthy,save_rate,dist_unhealthy,age,pop_dense,retail_dense,crime,product,job_11,job_al,job_am,job_ax,job_bf,job_by,job_cv,job_de,job_dz,job_e2,job_f8,job_gj,job_gv,job_kd,job_ke,job_kl,job_kp,job_ks,job_kw,job_mm,job_nb,job_nn,job_ob,job_pe,job_po,job_pq,job_pz,job_qp,job_qw,job_rn,job_sa,job_vv,job_zz,area_a,area_b,area_c,area_d,oos_y,oos_pred
0,1,-0.607550,-0.664918,-0.208449,9.017895,-0.215764,11.738935,0.854321,0.885827,0.492126,0.071100,b,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,2
1,2,0.338053,-0.207748,0.839031,7.766643,0.196869,6.805396,1.394432,0.874016,0.342520,0.400809,c,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1
2,3,-0.184205,1.127906,-0.208449,3.632069,-0.714362,13.671772,-0.495957,0.944882,0.724409,0.207723,b,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,2,2
3,4,-0.526467,-0.440815,-0.208449,5.372942,-0.542432,4.333286,1.124377,0.889764,0.444882,0.361216,b,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,2
4,5,-2.851675,1.638861,1.886511,3.822477,-0.473660,5.967121,-2.116291,0.744094,0.661417,0.068033,a,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1996,-0.593504,1.414758,-0.208449,5.454545,-0.232957,14.013489,-1.306124,0.881890,0.744094,0.104838,b,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,2
1996,1997,-3.028085,1.011372,0.839031,3.632069,-0.473660,8.380497,-2.116291,0.944882,0.877953,0.063851,a,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2
1997,1998,-2.826971,1.513363,1.886511,7.168218,0.884591,4.626950,-2.656402,0.759843,0.744094,0.098703,f,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
1998,1999,1.093101,-1.274478,-1.255928,8.936292,-0.370502,3.281439,0.044154,0.909449,0.598425,0.117803,c,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1
