In [1]:
# Import the ensemble model VotingClassifier
from sklearn.ensemble import VotingClassifier
import casestudy_tools as tools

In [3]:
# Obtain the decision-tree model from the project helper module (imported as `tools`).
# The helper's body is not visible in this notebook; later cells call `.predict` and
# `.predict_proba` on the result, so it is presumably an already-fitted classifier — TODO confirm.
dt_model = tools.get_decision_tree()

Default Decision Tree Statistics:
Train Accuracy: 0.8180123425044998
Test Accuracy: 0.8158092095395231

Classification Report:
             precision    recall  f1-score   support

          0       0.83      0.95      0.89      5015
          1       0.72      0.42      0.53      1652

avg / total       0.80      0.82      0.80      6667

Number of nodes in the decision tree: 31


In [5]:
# Obtain the neural-network model from the project helper module (imported as `tools`).
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt, so this
# particular run may not have assigned `nn_model`; later cells rely on it, so the
# cell has to be run to completion before they are executed.
nn_model = tools.get_neural_networks_model()

Process ForkPoolWorker-26:
Process ForkPoolWorker-25:
Process ForkPoolWorker-28:


KeyboardInterrupt: 

In [2]:
# Obtain the logistic-regression model from the project helper module (imported as `tools`).
# The helper prints train/test accuracy, a classification report and a parameter dict;
# presumably that dict holds the selected hyper-parameters — TODO confirm in casestudy_tools.
log_reg_model = tools.get_logistic_regression_model()

Train accuracy: 0.8115196708665466
Test accuracy: 0.8167091645417729
             precision    recall  f1-score   support

          0       0.84      0.94      0.89      5015
          1       0.71      0.44      0.54      1652

avg / total       0.80      0.82      0.80      6667

{'C': 1e-06}


In [18]:
# Wrap the three base models in a single soft-voting ensemble; with
# voting='soft' the class is chosen from the estimators' predicted probabilities.
ensemble_voting_model = VotingClassifier(
    voting='soft',
    estimators=[
        ('dt', dt_model),
        ('lr', log_reg_model),
        ('nn', nn_model),
    ],
)

In [31]:
# Build a fresh stratified train/test split of the case-study data and
# standardise the features for the models that are fed scaled inputs.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the preprocessed data frame from the project helper.
df = tools.preprocess()

# ORGYN is the prediction target.
target_dataset = df['ORGYN']
# Remove ORGYN from the inputs so the target cannot act as a (false) predictor.
dataset = df.drop(['ORGYN'], axis=1)

# Random state 10 is kept consistent throughout the case study.
random_state = 10
# Hold out 30% of the rows as the test set.
test_size = 0.3

# Convert the feature frame to a plain numpy array.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated and then removed
# in pandas 1.0; it returns the same array and works on older pandas too.
dataset_matrix = dataset.values

# Split into train and test sets, stratified on the target so both parts keep
# the same class proportions.
dataset_train, dataset_test, target_dataset_train, target_dataset_test = train_test_split(
    dataset_matrix,
    target_dataset,
    test_size=test_size,
    stratify=target_dataset,
    random_state=random_state,
)

# Fit the scaler on the training rows only and reuse its statistics for the test
# rows. StandardScaler ignores a target argument, so none is passed.
scaler = StandardScaler()
dataset_train_scaled = scaler.fit_transform(dataset_train)
dataset_test_scaled = scaler.transform(dataset_test)


In [38]:
# Metrics for this cell; accuracy_score is used by the per-model comparison cell.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
)

# Train the voting classifier on the standardised training split.
ensemble_voting_model.fit(dataset_train_scaled, target_dataset_train)

# Test-set outputs of the ensemble: class probabilities, hard labels, and the
# ROC AUC computed from probability column 1.
y_pred_proba_ensemble = ensemble_voting_model.predict_proba(dataset_test_scaled)
y_pred_ensemble = ensemble_voting_model.predict(dataset_test_scaled)
roc_index_ensemble = roc_auc_score(
    target_dataset_test,
    y_pred_proba_ensemble[:, 1],
)

# Report accuracy on both splits, then the ROC score and the per-class report.
print(
    "Ensemble soft train accuracy:",
    ensemble_voting_model.score(dataset_train_scaled, target_dataset_train),
)
print(
    "Ensemble soft test accuracy:",
    ensemble_voting_model.score(dataset_test_scaled, target_dataset_test),
)
print("ROC score of soft voting classifier:", roc_index_ensemble)
print("Classification Report for Ensemble:")
print(classification_report(target_dataset_test, y_pred_ensemble))



Ensemble soft train accuracy: 0.8176266392388789
Ensemble soft test accuracy: 0.8215089245537723
ROC score of soft voting classifier: 0.8263013622570544
Classification Report for Ensemble:
             precision    recall  f1-score   support

          0       0.84      0.95      0.89      5015
          1       0.74      0.44      0.55      1652

avg / total       0.81      0.82      0.80      6667



  if diff:
  if diff:
  if diff:


<h2>Task 5.2</h2>

In [36]:

# Hard class predictions of each base model on the held-out split.
# The decision tree receives the unscaled matrix, the other two the standardised one.
# NOTE(review): presumably this mirrors how casestudy_tools trained each model — confirm.
y_pred_dt = dt_model.predict(dataset_test)
y_pred_log_reg = log_reg_model.predict(dataset_test_scaled)
y_pred_nn = nn_model.predict(dataset_test_scaled)

# One accuracy line per model; y_pred_ensemble comes from the ensemble evaluation cell.
for accuracy_header, model_predictions in (
    ("Accuracy score on test for Decision Tree:", y_pred_dt),
    ("Accuracy score on test for logistic regression:", y_pred_log_reg),
    ("Accuracy score on test for Neural Networks:", y_pred_nn),
    ("Accuracy score on test for Ensemble:", y_pred_ensemble),
):
    print(accuracy_header, accuracy_score(target_dataset_test, model_predictions))

Accuracy score on test for Decision Tree: 0.8158092095395231
Accuracy score on test for Neural Networks: 0.8189590520473976
Accuracy score on test for Ensemble: 0.8215089245537723


In [39]:
# Print a header followed by the per-class precision/recall report for every
# model, in the order: decision tree, logistic regression, neural network, ensemble.
for report_header, model_predictions in (
    ("Classification Report for Decision Tree:", y_pred_dt),
    ("Classification Report for logistic regression:", y_pred_log_reg),
    ("Classification Report for Neural Networks:", y_pred_nn),
    ("Classification Report for Ensemble:", y_pred_ensemble),
):
    print(report_header)
    print(classification_report(target_dataset_test, model_predictions))

Classification Report for Decision Tree:
             precision    recall  f1-score   support

          0       0.83      0.95      0.89      5015
          1       0.72      0.42      0.53      1652

avg / total       0.80      0.82      0.80      6667

Classification Report for Neural Networks:
             precision    recall  f1-score   support

          0       0.84      0.94      0.89      5015
          1       0.72      0.44      0.55      1652

avg / total       0.81      0.82      0.80      6667

Classification Report for Ensemble:
             precision    recall  f1-score   support

          0       0.84      0.95      0.89      5015
          1       0.74      0.44      0.55      1652

avg / total       0.81      0.82      0.80      6667



In [43]:

# For each model: predicted class probabilities on the test split, then the
# ROC AUC computed from probability column 1.

# Decision tree (fed the unscaled matrix).
y_pred_proba_dt = dt_model.predict_proba(dataset_test)
roc_index_dt = roc_auc_score(target_dataset_test, y_pred_proba_dt[:, 1])

# Logistic regression (fed the standardised matrix).
y_pred_proba_log_reg = log_reg_model.predict_proba(dataset_test_scaled)
roc_index_log_reg = roc_auc_score(target_dataset_test, y_pred_proba_log_reg[:, 1])

# Neural network (fed the standardised matrix).
y_pred_proba_nn = nn_model.predict_proba(dataset_test_scaled)
roc_index_nn = roc_auc_score(target_dataset_test, y_pred_proba_nn[:, 1])

# Voting ensemble (fed the standardised matrix).
y_pred_proba_ensemble = ensemble_voting_model.predict_proba(dataset_test_scaled)
roc_index_ensemble = roc_auc_score(target_dataset_test, y_pred_proba_ensemble[:, 1])

# Side-by-side summary of the four ROC indexes.
for roc_header, roc_value in (
    ("ROC index on test for DT:", roc_index_dt),
    ("ROC index on test for logistic regression:", roc_index_log_reg),
    ("ROC index on test for NN:", roc_index_nn),
    ("ROC index on test for Ensemble:", roc_index_ensemble),
):
    print(roc_header, roc_value)

ROC index on test for DT: 0.8185891478108048
ROC index on test for NN: 0.8170440856606934
ROC index on test for Ensemble: 0.8263013622570544


In [45]:
from sklearn.metrics import roc_curve

# roc_curve yields (false positive rates, true positive rates, thresholds);
# one triple is kept per model, always scored on probability column 1.
fpr_dt, tpr_dt, thresholds_dt = roc_curve(
    y_true=target_dataset_test,
    y_score=y_pred_proba_dt[:, 1],
)
fpr_log_reg, tpr_log_reg, thresholds_log_reg = roc_curve(
    y_true=target_dataset_test,
    y_score=y_pred_proba_log_reg[:, 1],
)
fpr_nn, tpr_nn, thresholds_nn = roc_curve(
    y_true=target_dataset_test,
    y_score=y_pred_proba_nn[:, 1],
)
fpr_ensemble, tpr_ensemble, thresholds_ensemble = roc_curve(
    y_true=target_dataset_test,
    y_score=y_pred_proba_ensemble[:, 1],
)

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

# Sets the color to white.
params = {"ytick.color" : "w",
          "xtick.color" : "w",
          "axes.labelcolor" : "w",
          "axes.edgecolor" : "w"}
plt.rcParams.update(params)

plt.plot(fpr_dt, tpr_dt, label='ROC Curve for DT {:.3f}'.format(roc_index_dt), color='red', lw=0.5)
plt.plot(fpr_log_reg, tpr_log_reg, label='ROC Curve for Log reg {:.3f}'.format(roc_index_log_reg), color='green', lw=0.5)
plt.plot(fpr_nn, tpr_nn, label='ROC Curve for NN {:.3f}'.format(roc_index_nn), color='darkorange', lw=0.5)
plt.plot(fpr_ensemble, tpr_ensemble, label='ROC Curve for Ensemble {:.3f}'.format(roc_index_ensemble), color='yellow', lw=0.5)

plt.plot([0, 1], [0, 1], color='navy', lw=0.5, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC chart for predicting organic purchase', color="w")
plt.legend(loc="lower right")
plt.show()

NameError: name 'fpr_dt' is not defined