# MSDS 411 Assignment 3

In [None]:
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
from tensorflow import keras
import tensorflow.keras.layers
import tensorflow.keras.models
import tensorflow.keras.optimizers
import tensorflow.keras.datasets

from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score


# Reset Keras' global state before building any models in this session.
tensorflow.keras.backend.clear_session()
# Seed NumPy's global random number generator.
np.random.seed(42)
# NOTE(review): the TensorFlow seed call below is disabled, so only NumPy is
# seeded here -- confirm whether reproducible model training is required.
# tensorflow.random.set_seed(42)


### Load Data

In [None]:
# Alternative input paths, currently disabled:
#train = pd.read_csv('input/Kannada-MNIST/train.csv')
#test = pd.read_csv('input/Kannada-MNIST/test.csv')

# Read the training features, training labels and test features from the
# working directory, in that order.
train, train_y, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_X.csv', 'train_y.csv', 'test_X.csv')
)
# Attach the labels to the training features. No `on=` is given, so pandas
# joins on every column name the two frames have in common.
full_train = pd.merge(train, train_y)

In [None]:
# Preview the first rows of the training labels as loaded from CSV.
train_y.head()


In [None]:
# Read the test labels from the working directory.
test_y = pd.read_csv('test_y.csv')

In [None]:
# Preview the first rows of the test labels as loaded from CSV.
test_y.head()

In [None]:
# Remove the 'Unnamed: 0' column from every frame in one pass.
train, train_y, test_y, test, full_train = (
    frame.drop(columns=['Unnamed: 0'])
    for frame in (train, train_y, test_y, test, full_train)
)

# Replace the label column 'x' with its dummy encoding (first level dropped)
# in each frame that carries labels.
for labelled in (full_train, train_y, test_y):
    labelled['x'] = pd.get_dummies(labelled['x'], drop_first=True)

In [None]:
# Inspect the training labels again, now that 'x' has been dummy-encoded.
train_y.head()

In [None]:
# Preview the first rows of the test features.
test.head()

# Method 1: Train all layers at once

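Let's build a stacked Autoencoder with 3 hidden layers and 1 output layer (i.e., 2 stacked Autoencoders). Note: in the code below, the 11-unit layer is defined but never connected to the output, so the model that is actually trained has a single 24-unit hidden layer followed by the 44-unit output layer.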

In [None]:
def rounded_accuracy(y_true, y_pred):
    """Binary accuracy computed after rounding targets and predictions.

    Both arguments are passed through ``tf.round`` and the rounded tensors
    are then compared with ``keras.metrics.binary_accuracy``.
    """
    targets_rounded = tf.round(y_true)
    outputs_rounded = tf.round(y_pred)
    return keras.metrics.binary_accuracy(targets_rounded, outputs_rounded)

In [None]:
#import keras
#from keras import layers

# Build a dense autoencoder over 44 input features.
# Sizes of the encoded representations
encoding_dim = 24 # 24 floats -> compression factor of about 1.8 for the 44-float input
encoding_dim1 = 11 


# Input placeholder: one vector of 44 features per sample
input_img = keras.Input(shape =  (44,))
# "encoded" is the encoded representation of the input
encoded = keras.layers.Dense(encoding_dim, activation='tanh')(input_img)

# NOTE(review): this 11-unit layer is created but nothing below consumes
# "encoded1", so it is not part of "encoder" or "autoencoder" -- confirm
# whether a second stacked layer was intended.
encoded1 = keras.layers.Dense(encoding_dim1)(encoded)


# Model mapping an input to its 24-unit encoding
encoder = keras.Model(input_img, encoded)
# "decoded" is the lossy reconstruction of the input, built directly from "encoded"
decoded = keras.layers.Dense(44, activation='softplus')(encoded)

# This model maps an input to its reconstruction
autoencoder = keras.Model(input_img, decoded)


In [None]:
# Disabled sketch of a stand-alone decoder model; the encoded input would have
# `encoding_dim` dimensions:
#encoded_input = keras.Input(shape=(encoding_dim,))
# Retrieve the last layer of the autoencoder model
#decoder_layer = autoencoder.layers[-1]
# Create the decoder model
#decoder = keras.Model(encoded_input, decoder_layer(encoded_input))

# Configure training: binary cross-entropy loss, Adadelta optimizer with its
# default settings.
autoencoder.compile(
    loss='binary_crossentropy',
    optimizer='Adadelta',
)


In [None]:
# Only NumPy is seeded here; the TensorFlow seed call stays disabled.
# tensorflow.random.set_seed(42)
np.random.seed(42)
import tensorflow as tf

# Train the autoencoder to reproduce its own input: the training features are
# both the input and the target, and the test features are used for validation.
history = autoencoder.fit(
    x=train,
    y=train,
    validation_data=(test, test),
    shuffle=False,
    batch_size=2200,
    epochs=3000,
)

# Earlier, shorter training run kept for reference:
# history = autoencoder.fit(train, train, epochs=200,validation_data=(test, test))

In [None]:
# NOTE(review): the value returned by get_weights() is neither assigned nor
# printed, and this is not the last expression of the cell, so it is
# presumably discarded -- confirm whether it was meant to be displayed.
autoencoder.get_weights()

# Persist the trained autoencoder to an HDF5 file in the working directory.
autoencoder.save('autoencoder_classification.h5')


In [None]:
# Plot every series recorded during training on one set of axes.
history_frame = pd.DataFrame(history.history)
history_axes = history_frame.plot(figsize=(8, 5))
history_axes.grid(True)
# Clamp the y-axis to the range [0, 2].
history_axes.set_ylim(0, 2)
plt.show()

### Visualizing Data Before and After Encoding/Decoding

In [None]:
np.random.seed(42)

from sklearn.manifold import TSNE

# Run the test features through the full autoencoder, then embed that output
# with t-SNE using its default settings.
X_valid_compressed = autoencoder.predict(test)
tsne = TSNE()
X_valid_2D = tsne.fit_transform(X_valid_compressed)

# Min-max rescale the whole embedding (one global minimum and maximum) to [0, 1].
embedding_min = X_valid_2D.min()
embedding_span = X_valid_2D.max() - embedding_min
X_valid_2D = (X_valid_2D - embedding_min) / embedding_span

In [None]:
# Inspect the test labels that will be used to colour the embedding plot.
test_y.head()

In [None]:

# Scatter the two embedding coordinates, coloured by the test label 'x'.
embedding_x, embedding_y = X_valid_2D[:, 0], X_valid_2D[:, 1]
plt.scatter(embedding_x, embedding_y, c=test_y['x'], s=10, cmap="tab10")
plt.axis("off")
plt.show()

Let's make this diagram a bit prettier:

# Prediction using Encoded images

In [None]:
# Pass both feature sets through the full autoencoder (input -> reconstruction),
# training set first, then test set.
ae_train, ae_test = (
    autoencoder.predict(features) for features in (train, test)
)

### Using AE Predictions for Log Regress

In [None]:
# Export the autoencoder outputs for the training and test sets to CSV.
# NOTE(review): this bare expression is not the last one in the cell, so its
# value is presumably not displayed.
ae_train.shape
ae_train_df = pd.DataFrame(ae_train)
# NOTE(review): absolute, user-specific Windows path -- this will fail on any
# machine where the directory does not exist; confirm the intended location.
ae_train_df.to_csv('C:/Users/brook/Documents/MSDS/MSDS 411/Assignment 3/ae_train.csv')

ae_test_df = pd.DataFrame(ae_test)
ae_test_df.to_csv('C:/Users/brook/Documents/MSDS/MSDS 411/Assignment 3/ae_test.csv')


# Preview the exported training frame.
ae_train_df.head()

In [None]:

from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression trained on the autoencoder output.
# `multi_class` is no longer passed: 'auto' is the library default and the
# argument is deprecated in recent scikit-learn releases.
log_model = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    C=1.5,
    fit_intercept=True,
    max_iter=1500,
    warm_start=True,
)
# scikit-learn expects a 1-D target; flatten the single-column label frame
# explicitly instead of relying on the implicit conversion (and its warning).
train_labels = np.ravel(train_y)
log_model.fit(ae_train, train_labels)
# Mean accuracy on the training data, reported as a percentage.
print("Training Set Score: {:.3f}".format(log_model.score(ae_train, train_labels) * 100))



In [None]:
from sklearn.metrics import roc_auc_score


# Flatten the single-column label frame to the 1-D shape the metrics expect.
test_labels = np.ravel(test_y)

# Hard class predictions; kept in `predictions` for the reports further down.
predictions = log_model.predict(ae_test)
# Mean accuracy on the test data, reported as a percentage.
print("Train_Test Set Score: {:.3f}".format(log_model.score(ae_test, test_labels)*100))

# ROC AUC measures how well scores rank the examples, so it is computed from
# the predicted probability of the positive class rather than from the
# thresholded 0/1 predictions. The name `acc_log` is kept, but the value is
# an AUC percentage, not an accuracy.
positive_scores = log_model.predict_proba(ae_test)[:, 1]
acc_log = round(roc_auc_score(test_labels, positive_scores)*100 , 2)
acc_log

In [None]:
# Cutoff rule noted by the author (not applied anywhere in this cell):
# optimal cutoff for predicting bad credit set as
# (cost of false negative/cost of false positive) times
# (prevalence of positive/prevalence of negative)
# (1/5)*(.3/.7) = 0.086

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
import scikitplot as skplt

# Draw the confusion matrix with raw counts.
skplt.metrics.plot_confusion_matrix(test_y, predictions, normalize=False)

# Text summary: per-class metrics first, then the raw confusion matrix.
report_text = classification_report(test_y, predictions)
matrix_counts = confusion_matrix(test_y, predictions)
print(report_text)
print(matrix_counts)


In [None]:
from sklearn.metrics import precision_recall_curve

# A precision-recall curve needs one point per decision threshold, so feed it
# the predicted probability of the positive class. Hard 0/1 predictions
# collapse the curve to just a few points.
positive_scores = log_model.predict_proba(ae_test)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(np.ravel(test_y), positive_scores)
plt.plot(recalls, precisions, label = 'prec-recall')
plt.xlabel('recall')
plt.ylabel('precision')
plt.legend(loc="upper right")
plt.title("Precision-Recall")


### Autoencoder encoding into Log Regress

In [None]:
# load the model from file
# NOTE(review): the file loaded here is the one written by autoencoder.save(),
# i.e. the full autoencoder, yet it is bound to the name "encoder" and
# replaces the encoder model built earlier -- confirm this is intended.
encoder =tf.keras.models.load_model('autoencoder_classification.h5')

In [None]:
# Show the (rows, columns) dimensions of the test features.
test.shape

In [None]:
#### tf.random.set_seed(42)
np.random.seed(42)

# Encoder half: 44 inputs -> additive Gaussian noise (stddev 0.04) -> 22 -> 13 units.
denoising_encoder = keras.models.Sequential([
    keras.layers.Flatten(input_shape=[44,]),
    keras.layers.GaussianNoise(0.04),
    keras.layers.Dense(22, activation="relu"),
    keras.layers.Dense(13, activation="relu")
])
# Decoder half: 13 -> 22 -> 44 units, mirroring the encoder.
denoising_decoder = keras.models.Sequential([
    keras.layers.Dense(22, activation="relu", input_shape=[13]),
    keras.layers.Dense(44, activation="relu")
])
# Full model: encoder followed by decoder.
denoising_ae = keras.models.Sequential([denoising_encoder, denoising_decoder])
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
# NOTE(review): binary cross-entropy is paired with a relu output layer, whose
# values are not limited to [0, 1] -- confirm this combination is intended.
denoising_ae.compile(loss="binary_crossentropy",
                     optimizer=keras.optimizers.Adadelta(learning_rate=1),
                     metrics=[rounded_accuracy])
# Train to reconstruct the training features; validate on the test features.
history = denoising_ae.fit(train, train, epochs=2000,
                            batch_size=20,
                           validation_data=(test, test))

In [None]:
# Encode with the encoder half only. `denoising_ae` would run the data through
# the decoder as well and return 44-feature reconstructions, not the 13-unit
# codes this step is meant to produce. The encoder shares its trained weights
# with `denoising_ae`.
# encode the train data
X_train_encode = denoising_encoder.predict(train)
# encode the test data
X_test_encode = denoising_encoder.predict(test)

In [None]:
# define the model (scikit-learn defaults)
model = LogisticRegression()
# fit the model on the training set; the single-column label frame is
# flattened to the 1-D target scikit-learn expects, which avoids the implicit
# conversion and its warning
model.fit(X_train_encode, np.ravel(train_y))
# make predictions on the test set
yhat = model.predict(X_test_encode)

In [None]:
from sklearn.metrics import confusion_matrix

# Draw the confusion matrix (raw counts) for the model fitted on the
# denoising-autoencoder features.
skplt.metrics.plot_confusion_matrix(test_y, yhat, normalize=False)

# Text summary: per-class metrics first, then the raw confusion matrix.
report_text = classification_report(test_y, yhat)
matrix_counts = confusion_matrix(test_y, yhat)
print(report_text)
print(matrix_counts)

In [None]:
# Run both feature sets through the full denoising autoencoder and export the
# outputs to CSV.
ae_train = denoising_ae.predict(train)
ae_test = denoising_ae.predict(test)

# NOTE(review): this bare expression is not the last one in the cell, so its
# value is presumably not displayed.
ae_train.shape
ae_train_df = pd.DataFrame(ae_train)
# NOTE(review): same absolute, user-specific paths as the earlier export of the
# first autoencoder's output, so those files are overwritten -- confirm.
ae_train_df.to_csv('C:/Users/brook/Documents/MSDS/MSDS 411/Assignment 3/ae_train.csv')

ae_test_df = pd.DataFrame(ae_test)
ae_test_df.to_csv('C:/Users/brook/Documents/MSDS/MSDS 411/Assignment 3/ae_test.csv')

### Using PCA 

In [None]:
from datetime import datetime
from sklearn.decomposition import PCA

# 17-component PCA fitted on the training features; the fit is timed with
# wall-clock timestamps taken immediately before and after.
pca = PCA(random_state=42, n_components=17)
start = datetime.now()
pca.fit(train)
end = datetime.now()
print(end - start)
# Total share of variance captured by the retained components.
sum(pca.explained_variance_ratio_)

In [None]:
# Scree plot: percentage of variance explained by each principal component.
per_var = np.round(pca.explained_variance_ratio_ * 100, decimals=1)
# 1-based component positions, shared by the tick labels and the bar positions.
component_ids = range(1, len(per_var) + 1)
labels = ['PC' + str(x) for x in component_ids]
plt.figure(figsize=(25,15))
plt.bar(x=component_ids, height=per_var, tick_label=labels)
# Axis label spelling corrected (was "percentange").
plt.ylabel('percentage of explained variance')
plt.xlabel('principal component')
plt.title('scree plot')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Display the fitted PCA estimator.
pca

In [None]:
from sklearn.linear_model import LogisticRegression

# Project the training features onto the fitted principal components.
train_features = pca.transform(train)

# L2-regularised logistic regression on the PCA features.
# `multi_class` is no longer passed: 'auto' is the library default and the
# argument is deprecated in recent scikit-learn releases.
log_model_pca = LogisticRegression(
    penalty="l2",
    solver="lbfgs",
    C=1.5,
    fit_intercept=True,
    max_iter=1500,
    warm_start=True,
)
# scikit-learn expects a 1-D target; flatten the single-column label frame
# explicitly instead of relying on the implicit conversion (and its warning).
train_labels = np.ravel(train_y)
log_model_pca.fit(train_features, train_labels)
# Mean accuracy on the training data, reported as a percentage.
print("Training Set Score: {:.3f}".format(log_model_pca.score(train_features, train_labels) * 100))

In [None]:
# Project the test features with the PCA fitted on the training data.
test_features = pca.transform(test)
# Flatten the single-column label frame to the 1-D shape the metrics expect.
test_labels = np.ravel(test_y)

# Hard class predictions; kept in `predictions_pca` for the reports further down.
predictions_pca = log_model_pca.predict(test_features)
# Mean accuracy on the test data, reported as a percentage.
print("Train_Test Set Score: {:.3f}".format(log_model_pca.score(test_features, test_labels)*100))

# ROC AUC is computed from the predicted probability of the positive class;
# thresholded 0/1 predictions discard the ranking the metric is meant to
# measure. The name `acc_log_pca` is kept, but the value is an AUC percentage.
positive_scores_pca = log_model_pca.predict_proba(test_features)[:, 1]
acc_log_pca = round(roc_auc_score(test_labels, positive_scores_pca)*100 , 2)
acc_log_pca

In [None]:
from sklearn.metrics import confusion_matrix

# Draw the confusion matrix (raw counts) for the PCA-based model.
skplt.metrics.plot_confusion_matrix(test_y, predictions_pca, normalize=False)

# Text summary: per-class metrics first, then the raw confusion matrix.
report_text = classification_report(test_y, predictions_pca)
matrix_counts = confusion_matrix(test_y, predictions_pca)
print(report_text)
print(matrix_counts)