# Regression example, K fold cross validation 

In [1]:
import pandas as pd
from scipy.stats import zscore
from sklearn.model_selection import train_test_split

# Read the data set
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])

# Generate dummies for job
df = pd.concat([df,pd.get_dummies(df['job'],prefix="job")],axis=1)
df.drop('job', axis=1, inplace=True)

# Generate dummies for area
df = pd.concat([df,pd.get_dummies(df['area'],prefix="area")],axis=1)
df.drop('area', axis=1, inplace=True)

# Generate dummies for product
df = pd.concat([df,pd.get_dummies(df['product'],prefix="product")],axis=1)
df.drop('product', axis=1, inplace=True)

# Missing values for income
med = df['income'].median()
df['income'] = df['income'].fillna(med)

# Standardize ranges
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['save_rate'] = zscore(df['save_rate'])
df['subscriptions'] = zscore(df['subscriptions'])

# Convert to numpy - Classification
x_columns = df.columns.drop('age').drop('id')
x = df[x_columns].values
y = df['age'].values

In [5]:
EPOCHS=500

import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

# Cross-Validate
kf = KFold(5, shuffle=True, random_state=42) # Use for KFold classification
oos_y = []
oos_pred = []

fold = 0
for train, test in kf.split(x):
    fold+=1
    print(f"Fold #{fold}")
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    model = Sequential()

    # Hidden 1
    model.add(Dense(20, input_dim=x.shape[1], 
            activation='relu',
            activity_regularizer=regularizers.l1(1e-4))) # L1 regularization
    model.add(Dropout(0.5))
    # Hidden 2
    model.add(Dense(10, activation='relu', activity_regularizer=regularizers.l1(1e-4)))

    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')
    

    # min_delta This value should be kept small. It simply means the minimum change in error to be 
    # registered as an improvement. Setting it even smaller will not likely have a great deal of impact.

    # patience How long should the training wait for the validation error to improve?

    # verbose How much progress information do you want?

    # mode In general, always set this to "auto". This allows you to specify if the error should be minimized or maximized.
    # Consider accuracy, where higher numbers are desired vs log-loss/RMSE where lower numbers are desired.

    # restore_best_weights This should always be set to true. This restores the weights to the values they were at when the validation set is the highest.

    monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, 
        patience=5, verbose=1, mode='auto',
        restore_best_weights=True)
    
    model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor], verbose=2,epochs=EPOCHS)

    pred = model.predict(x_test)
    
    oos_y.append(y_test)
    oos_pred.append(pred)    

    # Measure this fold's RMSE
    score = np.sqrt(metrics.mean_squared_error(pred,y_test))
    print(f"Fold score (RMSE): {score}")

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
score = np.sqrt(metrics.mean_squared_error(oos_pred,oos_y))
print(f"Final, out of sample score (RMSE): {score}")    
    
# Write the cross-validated prediction
oos_y = pd.DataFrame(oos_y)
oos_pred = pd.DataFrame(oos_pred)
oosDF = pd.concat( [df, oos_y, oos_pred],axis=1 )
#oosDF.to_csv(filename_write,index=False)

Fold #1
Epoch 1/500
50/50 - 1s - loss: 1476.9642 - val_loss: 1129.6224 - 901ms/epoch - 18ms/step
Epoch 2/500
50/50 - 0s - loss: 725.7018 - val_loss: 346.8909 - 94ms/epoch - 2ms/step
Epoch 3/500
50/50 - 0s - loss: 440.3012 - val_loss: 254.0956 - 100ms/epoch - 2ms/step
Epoch 4/500
50/50 - 0s - loss: 436.8361 - val_loss: 250.5816 - 101ms/epoch - 2ms/step
Epoch 5/500
50/50 - 0s - loss: 380.3863 - val_loss: 219.8620 - 99ms/epoch - 2ms/step
Epoch 6/500
50/50 - 0s - loss: 341.2834 - val_loss: 210.2957 - 103ms/epoch - 2ms/step
Epoch 7/500
50/50 - 0s - loss: 329.9769 - val_loss: 178.5247 - 104ms/epoch - 2ms/step
Epoch 8/500
50/50 - 0s - loss: 306.5561 - val_loss: 152.3449 - 108ms/epoch - 2ms/step
Epoch 9/500
50/50 - 0s - loss: 269.0076 - val_loss: 145.3539 - 106ms/epoch - 2ms/step
Epoch 10/500
50/50 - 0s - loss: 256.0235 - val_loss: 137.7809 - 92ms/epoch - 2ms/step
Epoch 11/500
50/50 - 0s - loss: 247.6692 - val_loss: 113.5274 - 99ms/epoch - 2ms/step
Epoch 12/500
50/50 - 0s - loss: 210.4362 - va

# Classification with Stratified K-Fold Cross-Validation

In [None]:
import pandas as pd
from scipy.stats import zscore

# Read the data set
df = pd.read_csv(
    "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv",
    na_values=['NA','?'])

# Generate dummies for job
df = pd.concat([df,pd.get_dummies(df['job'],prefix="job")],axis=1)
df.drop('job', axis=1, inplace=True)

# Generate dummies for area
df = pd.concat([df,pd.get_dummies(df['area'],prefix="area")],axis=1)
df.drop('area', axis=1, inplace=True)

# Missing values for income
med = df['income'].median()
df['income'] = df['income'].fillna(med)

# Standardize ranges
df['income'] = zscore(df['income'])
df['aspect'] = zscore(df['aspect'])
df['save_rate'] = zscore(df['save_rate'])
df['age'] = zscore(df['age'])
df['subscriptions'] = zscore(df['subscriptions'])

# Convert to numpy - Classification
x_columns = df.columns.drop('product').drop('id')
x = df[x_columns].values
dummies = pd.get_dummies(df['product']) # Classification
products = dummies.columns
y = dummies.values

In [4]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout

# np.argmax(pred,axis=1)
# Cross-validate
# Use for StratifiedKFold classification
kf = StratifiedKFold(5, shuffle=True, random_state=42) 
    
oos_y = []
oos_pred = []
fold = 0

# Must specify y StratifiedKFold for
for train, test in kf.split(x,df['product']):  
    fold+=1
    print(f"Fold #{fold}")
        
    x_train = x[train]
    y_train = y[train]
    x_test = x[test]
    y_test = y[test]
    
    model = Sequential()
    # Hidden 1
    model.add(Dense(50, input_dim=x.shape[1], activation='relu')) 
    model.add(Dropout(0.5))
    model.add(Dense(25, activation='relu')) # Hidden 2
    model.add(Dense(y.shape[1],activation='softmax')) # Output
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    model.fit(x_train,y_train,validation_data=(x_test,y_test),
              verbose=0, epochs=EPOCHS)
    
    pred = model.predict(x_test)
    
    oos_y.append(y_test)
    # raw probabilities to chosen class (highest probability)
    pred = np.argmax(pred,axis=1) 
    oos_pred.append(pred)  

    # Measure this fold's accuracy
    y_compare = np.argmax(y_test,axis=1) # For accuracy calculation
    score = metrics.accuracy_score(y_compare, pred)
    print(f"Fold score (accuracy): {score}")

# Build the oos prediction list and calculate the error.
oos_y = np.concatenate(oos_y)
oos_pred = np.concatenate(oos_pred)
oos_y_compare = np.argmax(oos_y,axis=1) # For accuracy calculation

score = metrics.accuracy_score(oos_y_compare, oos_pred)
print(f"Final score (accuracy): {score}")    
    
# Write the cross-validated prediction
oos_y = pd.DataFrame(oos_y)
oos_pred = pd.DataFrame(oos_pred)
oosDF = pd.concat( [df, oos_y, oos_pred],axis=1 )
#oosDF.to_csv(filename_write,index=False)

KeyError: ignored