In [None]:
from collections import defaultdict

import numpy as np
import pandas as pd
import scipy.sparse
import sklearn
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection
import sklearn.preprocessing

import matplotlib.pyplot as plt
import seaborn as sns

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Both inputs are flattened to 1-D before they are compared, so a column
    vector of predictions with shape (n, 1) is matched element-wise against
    targets of shape (n,) instead of being broadcast into an (n, n) matrix
    of pairwise errors (which silently yields a wrong score).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. Each error is divided by the matching true
        value, so zeros in ``y_true`` produce inf/nan in the result.
    y_pred : array-like
        Predicted values, holding the same number of elements as ``y_true``.

    Returns
    -------
    float
        ``mean(|(y_true - y_pred) / y_true|) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100.0


In [None]:
# Dummy data for regression.
# c1 is a string category, c2 a numeric feature and c3 the regression target.
tmp_df = pd.DataFrame(
    [['a', 1, 2], ['b', 3, 4], ['a', 5, 6], ['b', 7, 8], ['a', 9, 10], ['b', 11, 12]],
    columns=['c1', 'c2', 'c3'],
)
categorical_features = ['c1', 'c2']

# Train and test intentionally refer to the very same objects: this toy set-up
# has no hold-out split.
X_train = pd.get_dummies(tmp_df[categorical_features])
X_test = X_train
Y_train = tmp_df['c3']
Y_test = Y_train

# One LabelEncoder per column, created lazily the first time a column name is
# looked up in the defaultdict.
enc_dict = defaultdict(sklearn.preprocessing.LabelEncoder)
ohe = sklearn.preprocessing.OneHotEncoder()

# Fit every column's encoder and gather the resulting integer codes in a frame
# that keeps the original row index and column order.
tmp_le_df = pd.DataFrame(
    {col_name: enc_dict[col_name].fit_transform(tmp_df[col_name])
     for col_name in categorical_features},
    index=tmp_df.index,
)
ohe.fit(tmp_le_df)

In [None]:
# Sample specific uncertainty intervals for regression models

# clf = sklearn.linear_model.LinearRegression()
# clf.fit(X_train, Y_train)
# Y_predictions = clf.predict(X_test)
# print(clf.coef_, clf.intercept_)

#--------------------------------------------------------------------------------

# ## Train/test prediction intervals: use either the statsmodels approach or the bootstrap approach below
# #Theory: https://onlinecourses.science.psu.edu/stat414/node/298/
# #http://nbviewer.jupyter.org/gist/thatneat/10286720
# #https://www.learndatasci.com/tutorials/predicting-housing-prices-linear-regression-using-python-pandas-statsmodels/
# #https://www.statsmodels.org/dev/examples/notebooks/generated/ols.html
# #https://stats.stackexchange.com/questions/183230/bootstrapping-confidence-interval-from-a-regression-prediction

# Using statsmodels

# import statsmodels.api as sm
# from statsmodels.sandbox.regression.predstd import wls_prediction_std
# re = sm.OLS(Y_train, X_train).fit()
# print(re.summary())
# print('-'*80)
# prstd, iv_l, iv_u = wls_prediction_std(re)
# print(prstd, iv_l, iv_u)
# print('-'*80)
# prstd, iv_l, iv_u = wls_prediction_std(re, X_train.iloc[0:2]) #Test set can be invoked through this approach
# print(prstd, iv_l, iv_u)

# Using bootstrap
#
# Fit `n_estimators` linear regressions, each on a bootstrap resample of the
# training data, then look at how much their predictions disagree at a single
# point. The spread of those predictions is the sample-specific uncertainty.

from sklearn.ensemble import BaggingRegressor
n_estimators = 50
# random_state pins the bootstrap resampling so the summary below is
# reproducible across kernel restarts.
model = BaggingRegressor(
    sklearn.linear_model.LinearRegression(),
    n_estimators=n_estimators,
    bootstrap=True,
    random_state=0,
)
model.fit(X_train, Y_train)

#Inspecting uncertainty interval for a prediction at sample test point
# NOTE(review): the columns of X_train are presumably [c2, c1_a, c1_b], in which
# case 2 and 3 are not valid values for the one-hot indicator columns -- confirm
# that this is the intended sample point.
test_sample = np.array([[1,2,3]])
# One prediction per fitted bootstrap estimator.
res = [m.predict(test_sample) for m in model.estimators_]
pd.DataFrame(res).describe()

In [None]:
# Embedding based Neural Network Regressor

# Layers are imported from the public `keras.layers` namespace: the private
# `keras.layers.core` module path and the legacy `merge` name are not available
# in newer Keras releases, and `merge` was never used in this notebook.
from keras import layers
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Reshape
from keras.models import Model
from keras.callbacks import EarlyStopping

# Column roles for the embedding model. `all_cols` is the ordered list of model
# inputs; it is the same list object as `categorical_features`.
categorical_features = ['c1','c2']
numeric_features = []
predict_col_name = 'c3'
all_cols = categorical_features

## Reformatting train/test data for keras model

# The toy frame serves as both the training and the test split.
tmp_X_train = tmp_df
tmp_X_test = tmp_df

# Map every categorical column to integer codes with its already-fitted encoder.
tmp_le_train_df = tmp_X_train[categorical_features].apply(
    lambda col_values: enc_dict[col_values.name].transform(col_values)
)
tmp_le_test_df = tmp_X_test[categorical_features].apply(
    lambda col_values: enc_dict[col_values.name].transform(col_values)
)

# Keras multi-input models take a list with one array per input column.
tmp_X_train_inp = [np.array(tmp_le_train_df[name]) for name in all_cols]
tmp_X_test_inp = [np.array(tmp_le_test_df[name]) for name in all_cols]

print('Train test shape:', tmp_le_train_df.shape, tmp_le_test_df.shape)

# Number of distinct classes seen by each column's encoder; used further down
# as the `input_dim` of that column's embedding layer.
tmp_num_feat = {}
for col in categorical_features:
    num_uniq_features = len(enc_dict[col].classes_)
    print (col, num_uniq_features)
    tmp_num_feat[col] = num_uniq_features

embedding_size = 5
batch_size = 16

# # early_stopping = EarlyStopping(monitor='val_loss', patience=0)
# tmp_model_save_dir = './'
# tmp_model_save_filename = tmp_model_save_dir + 'model_tmp.h5'

## Keras model implementation
# One integer input plus one embedding per categorical column; the embeddings
# are concatenated, flattened and fed through a small dense network.

tmp_input_list = []
tmp_embedding_list = []
for e_col in all_cols:
    tmp_input = Input(shape=(1,), dtype='int32', name=e_col)
    tmp_embedding = Embedding(input_dim=tmp_num_feat[e_col], output_dim=embedding_size, input_length=1)(tmp_input)
    tmp_input_list.append(tmp_input)
    tmp_embedding_list.append(tmp_embedding)

# `concatenate` needs at least two tensors, so a single-column model uses its
# only embedding directly.
if len(tmp_embedding_list) > 1:
    x = layers.concatenate(tmp_embedding_list)
else:
    x = tmp_embedding_list[0]
x = Reshape((len(tmp_embedding_list)*embedding_size,), name="reshape_one")(x)
x = Dense(64, activation='relu', name="dense_1")(x)
x = Dropout(.1)(x)
x = Dense(32, activation='relu', name="dense_2")(x)
# A relu output restricts predictions to values >= 0.
tmp_model_output = Dense(1, activation='relu', name="dense_3")(x)

# `inputs`/`outputs` are the functional-API keyword names; the singular
# `input`/`output` spellings are rejected by newer Keras releases.
tmp_final_model = Model(inputs=tmp_input_list, outputs=tmp_model_output)
# summary() prints the table itself and returns None, so it is not wrapped in print().
tmp_final_model.summary()

# Candidate losses: mean_squared_error, mean_absolute_percentage_error, mean_squared_logarithmic_error
tmp_final_model.compile(loss='mean_absolute_percentage_error', optimizer='adadelta') #metrics=['accuracy']
tmp_final_model.fit(tmp_X_train_inp, Y_train.values, epochs=5, batch_size=batch_size, validation_split=0.1) #, callbacks=[early_stopping])

# tmp_final_model.save(tmp_model_save_filename)

Y_predictions = tmp_final_model.predict(tmp_X_test_inp, batch_size=batch_size)
# The model ends in Dense(1), so predict() returns shape (n, 1) while the targets
# have shape (n,). Flatten the predictions so the metric compares them
# element-wise instead of broadcasting to an (n, n) matrix of pairwise errors.
mean_absolute_percentage_error(Y_test.values, Y_predictions.ravel())

# from ann_visualizer.visualize import ann_viz
# ann_viz(tmp_final_model)

In [None]:
# Dummy data for classification.
# c1 is a string category, c2 a numeric feature and c3 the binary label.
tmp_df = pd.DataFrame(
    [['a', 1, 0], ['b', 3, 1], ['a', 5, 0], ['b', 7, 1], ['a', 9, 0], ['b', 11, 1]],
    columns=['c1', 'c2', 'c3'],
)
categorical_features = ['c1', 'c2']

# Train and test intentionally refer to the very same objects: this toy set-up
# has no hold-out split.
X_train = pd.get_dummies(tmp_df[categorical_features])
X_test = X_train
Y_train = tmp_df['c3']
Y_test = Y_train


In [None]:
#Model evaluation on single train/test split for inspecting confusion matrix

# C=1e4 sets a very weak regularisation strength (C is its inverse).
clf = sklearn.linear_model.LogisticRegression(C=1e4)
clf.fit(X_train, Y_train)
Y_predictions = clf.predict(X_test)

#Evaluate predictions
# Scale to percent first and round afterwards: rounding the fraction and then
# multiplying by 100 leaks floating-point noise into the printed value and the
# plot title (e.g. 56.99999999999999 instead of 57.0).
score = round(100.0 * clf.score(X_test, Y_test), 2)
print('Accuracy:',score)
print(sklearn.metrics.classification_report(Y_test, Y_predictions, target_names=['(0)','(1)']))
cm = sklearn.metrics.confusion_matrix(Y_test, Y_predictions)
print('ConfusionMatrix\n',cm)

#Color coded confusion matrix
plt.figure(figsize=(4,4))
# The confusion matrix holds integer counts, so annotate cells as integers.
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
title = 'Accuracy Score: {0}'.format(score)
plt.title(title, size = 15)
plt.show()

#scikitplot can be used for roc curves. skipping for now

In [None]:
#tensorflow lr what if tool

In [None]:
#poisson, exp regression with constraints