In [1]:
import itertools as it
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, cross_validate
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, roc_curve, auc, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, cross_val_predict, RandomizedSearchCV
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import metrics
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_regression
np.warnings.filterwarnings('ignore')

In [2]:
# supress output
%%capture
!git clone --single-branch --branch v0.2dev https://github.com/scikit-learn-contrib/py-earth.git
%cd py-earth
!python setup.py install --cythonize

In [2]:
from pyearth import Earth

In [17]:
# Read the pre-cleaned train and test sets from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('train_clean.csv', 'test_clean.csv'))

In [18]:
# Discard the 'Unnamed: 0' column from both frames
train = train.drop(columns=['Unnamed: 0'])
test = test.drop(columns=['Unnamed: 0'])

In [19]:
# Split each frame into its predictor matrix and its `target` response
X_train, y_train = train.drop(columns=['target']), train['target']
X_test, y_test = test.drop(columns=['target']), test['target']

# Keep the predictor names of each split as they are at this point
X_train_columns, X_test_columns = X_train.columns, X_test.columns

In [20]:
# Creating dummy variables.
# Train and test are encoded separately, so a category that appears in only one
# of them would give the two frames different columns. The test frame is
# therefore aligned to the training columns: levels unseen in test are filled
# with 0 and levels unseen in train are dropped.
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)

In [21]:
# Scale the data using statistics learned from the training set only.
# Column names are taken from the current X_train (i.e. after dummy encoding);
# `X_train_columns` was captured before encoding and no longer matches the
# number of columns once any dummy column has been added.
feature_names = X_train.columns

# Make sure the test frame has exactly the training columns, in the same order
X_test = X_test.reindex(columns=feature_names, fill_value=0)

scaler = StandardScaler().fit(X_train)
# Keep each frame's index so rows stay aligned with y_train / y_test
X_train = pd.DataFrame(scaler.transform(X_train), columns = feature_names, index = X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns = feature_names, index = X_test.index)

### **Feature Selection**

In [None]:
# Degree-1 MARS fit with RSS-based feature importances, used to pick the
# useful predictors
model = Earth(
    max_terms=1000,
    max_degree=1,
    feature_importance_type='rss',
)
model.fit(X_train, y_train)

In [None]:
# One row per predictor with its RSS importance, most important first
importance = pd.DataFrame({'RSS': model.feature_importances_})
predictors = pd.DataFrame({'Predictors': X_train.columns})
importance_df = (
    predictors
    .join(importance)
    .sort_values(by='RSS', ascending=False)
)

# Keep the 37 highest-ranked predictors
importance_df_small = importance_df.iloc[:37]

In [None]:
# Names of the retained predictors, in importance order
selected_features = importance_df_small['Predictors'].tolist()

In [None]:
# Training predictors restricted to the selected features
X_train_small = X_train[selected_features]

In [None]:
# Test predictors restricted to the same selected features; show the shape
X_test_small = X_test[selected_features]
X_test_small.shape

### **Separate into test and train data**

In [None]:
# Hold out 25% of the current training rows as X_test_1 / y_test_1.
# X_train / y_train are overwritten with the remaining rows, so re-running this
# cell shrinks them again.
# NOTE(review): this splits the full X_train, not X_train_small -- confirm intent.
X_train, X_test_1, y_train, y_test_1 = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1
)

In [None]:
# Hold out a further 33% of the remaining training rows as X_test_2 / y_test_2;
# X_train / y_train are overwritten with what is left.
X_train, X_test_2, y_train, y_test_2 = train_test_split(
    X_train, y_train, test_size=0.33, random_state=1
)

In [None]:
# Hold out half of the remaining training rows as X_test_3 / y_test_3;
# X_train / y_train are overwritten with the other half.
X_train, X_test_3, y_train, y_test_3 = train_test_split(
    X_train, y_train, test_size=0.50, random_state=1
)

In [None]:
# Coarse grid search over the MARS interaction degree.
# Earth is a regressor: its predictions are continuous and it has no
# predict_proba, so the 'accuracy' / 'recall' scorers cannot evaluate it
# directly. It is used here as a basis-expansion step feeding a logistic
# regression, which gives a proper classifier.
parameters = {'earth__max_degree': range(1,10,2)}

cv = KFold(n_splits = 5,shuffle=True,random_state=1)

mars_classifier = Pipeline([('earth', Earth(max_terms = 1000)),
                            ('logistic', LogisticRegression())])

# Both metrics are recorded; the final estimator is refit on the best recall
coarse_model = GridSearchCV(mars_classifier, parameters, scoring = ['accuracy', 'recall'],
                            refit = 'recall', n_jobs=-1, verbose=1, cv=cv)

coarse_model.fit(X_train, y_train)

# make the predictions
y_pred = coarse_model.predict(X_test_1)

# The pipeline ends in a classifier, so .score() is mean accuracy
print('Train accuracy : %.3f'%coarse_model.best_estimator_.score(X_train, y_train))
print('Test accuracy : %.3f'%coarse_model.best_estimator_.score(X_test_1, y_test_1))
# best_score_ belongs to the refit metric, which is recall (not accuracy)
print('Best recall Through Grid Search : %.3f'%coarse_model.best_score_)

print('Best params for recall')
print(coarse_model.best_params_)

In [None]:
# Cross-validated accuracy and recall for every candidate decision threshold.

# Reuse the degree chosen by the grid search (the original left max_degree
# blank, which is a syntax error). The suffix match works whether the grid key
# is 'max_degree' or a pipeline-prefixed '..__max_degree'.
best_degree = next(value for name, value in coarse_model.best_params_.items()
                   if name.endswith('max_degree'))

# Earth has no predict_proba, so it feeds a logistic regression
model = Pipeline([('earth', Earth(max_terms = 1000, max_degree = best_degree)),
                  ('logistic', LogisticRegression())])

cross_val_ypred = cross_val_predict(model, X_train, y_train, cv = 5, method = 'predict_proba')

# 1001 evenly spaced thresholds covering exactly [0, 1]
threshold_hyperparam = np.linspace(0, 1, 1001)

# Collect one record per threshold and build the frame once
records = []
for threshold in threshold_hyperparam:
  predicted = (cross_val_ypred[:, 1] > threshold).astype(int)
  records.append({'threshold': threshold,
                  'accuracy': accuracy_score(y_train, predicted)*100,
                  'recall': recall_score(y_train, predicted)*100})

# NOTE: this name shadows the `sklearn.metrics` module imported at the top of
# the notebook; it is kept because the next cell reads `metrics`.
metrics = pd.DataFrame(records, columns = ['threshold', 'accuracy', 'recall'])

In [None]:
# Among thresholds with accuracy >= 85 and recall > 85, show the one with the
# highest recall
meets_targets = metrics['accuracy'].ge(85) & metrics['recall'].gt(85)
metrics[meets_targets].sort_values(by='recall', ascending=False).iloc[0]

In [None]:
# Recall (from the precision-recall curve) and accuracy at every threshold.

# Reuse the degree chosen by the grid search (the original left max_degree
# blank, which is a syntax error). The suffix match works whether the grid key
# is 'max_degree' or a pipeline-prefixed '..__max_degree'.
best_degree = next(value for name, value in coarse_model.best_params_.items()
                   if name.endswith('max_degree'))

# Earth has no predict_proba, so it feeds a logistic regression
model = Pipeline([('earth', Earth(max_terms = 1000, max_degree = best_degree)),
                  ('logistic', LogisticRegression())])

cross_val_ypred = cross_val_predict(model, X_train, y_train, cv = 5, method = 'predict_proba')

# Fit on the full training set so `model` is available as a fitted estimator
model.fit(X_train, y_train)

p, r, thresholds = precision_recall_curve(y_train, cross_val_ypred[:,1])

# Accuracy uses the same cross-validated probabilities as the recall curve, so
# the two plotted lines are comparable. The probabilities do not depend on the
# threshold, so they are computed once rather than inside the loop.
cv_positive_prob = cross_val_ypred[:,1]
accuracy_list = [accuracy_score(y_train, (cv_positive_prob > t).astype(int))
                 for t in thresholds]

def plot_accuracy_recall_vs_threshold(accuracy, recalls, thresholds):
  """Plot accuracy and recall against the decision threshold.

  Parameters
  ----------
  accuracy : sequence of float
      Accuracy at each threshold; same length as `thresholds`.
  recalls : sequence of float
      Recall values, one element longer than `thresholds`; the last entry is
      dropped before plotting.
  thresholds : sequence of float
      Decision thresholds for the x-axis.
  """
  plt.figure(figsize=(8,8))
  plt.title("Accuracy and Recall Scores as a function of the decision threshold")
  # Use the `accuracy` argument; the previous version silently read the global
  # `accuracy_list` and ignored what the caller passed in.
  plt.plot(thresholds, accuracy, "b--", label = "Accuracy")
  plt.plot(thresholds, recalls[:-1], 'g-', label = "Recall")
  # Markers on top of the lines (unlabelled, so they do not appear in the legend)
  plt.plot(thresholds, accuracy, 'o', color = 'blue')
  plt.plot(thresholds, recalls[:-1], 'o', color = 'green')
  plt.ylabel('Score')
  plt.xlabel('Decision Threshold')
  plt.legend(loc='best')
# Draw the accuracy and recall curves computed above
plot_accuracy_recall_vs_threshold(accuracy=accuracy_list, recalls=r, thresholds=thresholds)

In [None]:
# Metrics on the training data at the chosen decision threshold
desired_threshold = 0.08

model.fit(X_train, y_train)

# Predicted probability of the positive class (second column of predict_proba)
y_pred_prob = model.predict_proba(X_train)[:, 1]

# An observation is labelled 1 when its predicted probability exceeds the threshold
y_pred = (y_pred_prob > desired_threshold).astype(int)

# Accuracy, in percent
print("Accuracy: ", accuracy_score(y_pred, y_train) * 100)

# Area under the ROC curve
fpr, tpr, auc_thresholds = roc_curve(y_train, y_pred_prob)
print("ROC-AUC: ", auc(fpr, tpr))

# Precision and recall
for label, scorer in (("Precision: ", precision_score), ("Recall: ", recall_score)):
  print(label, scorer(y_train, y_pred))

# Confusion matrix as a labelled heatmap
cm = pd.DataFrame(
    confusion_matrix(y_train, y_pred),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g');

In [None]:
# Metrics on X_test / y_test at the chosen decision threshold
desired_threshold = 0.08

model.fit(X_train, y_train)

# Predicted probability of the positive class (second column of predict_proba)
y_pred_prob = model.predict_proba(X_test)[:, 1]

# An observation is labelled 1 when its predicted probability exceeds the threshold
y_pred = (y_pred_prob > desired_threshold).astype(int)

# Accuracy, in percent
print("Accuracy: ", accuracy_score(y_pred, y_test) * 100)

# Area under the ROC curve
fpr, tpr, auc_thresholds = roc_curve(y_test, y_pred_prob)
print("ROC-AUC: ", auc(fpr, tpr))

# Precision and recall
for label, scorer in (("Precision: ", precision_score), ("Recall: ", recall_score)):
  print(label, scorer(y_test, y_pred))

# Confusion matrix as a labelled heatmap
cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g');