In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the spam data set from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer="newSpamData.csv")
data.head(n=5)

In [None]:
# Remove the leftover index column written by a previous DataFrame.to_csv call.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')
data.head()

In [None]:
# Separate the target column ('spam') from the feature columns.
target_column = 'spam'
X = data.drop(columns=[target_column])
y = data[target_column]

# Report the shape of the feature matrix and of the target vector, one per line.
for shape in (X.shape, y.shape):
    print(shape)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified split so both partitions keep the spam / non-spam ratio.
# A fixed random_state makes the split (and every score, plot and the alpha
# chosen further down) reproducible across kernel restarts.
x_train,x_test,y_train,y_test = train_test_split(X,y,stratify=y,random_state=0)
print(x_train.shape)
print(x_test.shape)

First, fit a plain decision tree without any tuning and check the results.

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Baseline: an unconstrained decision tree with a fixed seed.
# fit() returns the estimator itself, so construction and training can be chained.
dtclf = DecisionTreeClassifier(random_state=0).fit(x_train, y_train)

# Hard class predictions on both partitions, used by the evaluation cells below.
y_train_pred, y_test_pred = dtclf.predict(x_train), dtclf.predict(x_test)

Visualizing the decision tree

In [None]:
from sklearn import tree

plt.figure(figsize=(20,20))
# The names must match the columns the tree was trained on (X) one-for-one.
# data.columns also contains the target 'spam', which is not a feature; a plain
# list is passed because some scikit-learn versions reject a pandas Index here.
features = list(X.columns)
# Class names are given in ascending label order.
classes = ['Not spam','spam']
tree.plot_tree(dtclf,feature_names=features,class_names=classes,filled=True)
plt.show()

In [None]:
from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

# A ROC curve is traced by sweeping a threshold over continuous scores. Hard
# 0/1 predictions give a single operating point, so use the predicted
# probability of the positive class instead. Column 1 of predict_proba
# corresponds to dtclf.classes_[1] (presumably spam == 1 — confirm against the data).
y_test_score = dtclf.predict_proba(x_test)[:, 1]

# False-positive and true-positive rates for the different threshold values
fpr,tpr,thresholds = roc_curve(y_test,y_test_score)

# Area under the curve
roc_auc = auc(fpr,tpr)

# Plot the ROC curve together with the chance diagonal
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Decision Tree Classifier')
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# helper function- to get the confusion matrix
def plot_confusionmatrix(y_train_pred,y_train,dom):
    """Print a heading and draw the confusion matrix as an annotated heatmap.

    Parameters
    ----------
    y_train_pred : array-like
        Predicted labels.
    y_train : array-like
        True labels.
    dom : str
        Name of the data partition (e.g. 'Train' or 'Test'), used in the heading.

    Notes
    -----
    Tick labels come from the notebook-level ``classes`` list.
    """
    print(f'{dom} Confusion matrix')
    # sklearn's signature is confusion_matrix(y_true, y_pred): rows are the
    # actual classes and columns the predicted ones. Passing the arguments the
    # other way round transposes the matrix (swapping false positives and
    # false negatives).
    cf = confusion_matrix(y_train,y_train_pred)
    ax = sns.heatmap(cf,annot=True,yticklabels=classes
                     ,xticklabels=classes,cmap='Blues', fmt='g')
    # Label the axes so the orientation of the matrix is unambiguous.
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.tight_layout()
    plt.show()

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy on each partition (accuracy is symmetric in its two arguments).
train_accuracy = accuracy_score(y_train_pred, y_train)
test_accuracy = accuracy_score(y_test_pred, y_test)
print(f'Train score {train_accuracy}')
print(f'Test score {test_accuracy}')

# One confusion-matrix heatmap per partition, train first.
for split_name, split_pred, split_true in (
    ('Train', y_train_pred, y_train),
    ('Test', y_test_pred, y_test),
):
    plot_confusionmatrix(split_pred, split_true, dom=split_name)

In [None]:
# confusion_matrix expects (y_true, y_pred); rows = actual, columns = predicted.
print(confusion_matrix(y_test,y_test_pred))

# Pre pruning

Here we stop the growth of the tree at an early stage by setting constraints.
A grid search over the parameters is performed and the optimal values are chosen.

The following parameters are controlled:
- maximum depth of the tree
- minimum number of samples needed to split an internal node
- minimum number of samples needed to be at a leaf node

In [None]:
from sklearn.model_selection import GridSearchCV

# Pre-pruning hyper-parameter grid: tree depth, minimum samples required to
# split an internal node, and minimum samples required at a leaf.
params = {'max_depth': [2,4,6,8,10,12],
          'min_samples_split': [2,3,4],
          'min_samples_leaf': [1,2]}

# Seed the estimator so the search result is reproducible; an unseeded tree
# breaks ties between equally good splits at random, which can change the
# selected parameters from run to run.
dtclf = DecisionTreeClassifier(random_state=0)
gcv = GridSearchCV(estimator=dtclf,param_grid=params)
gcv.fit(x_train,y_train)

In [None]:
# GridSearchCV (refit=True by default) has already refitted the best
# parameter combination on the full training set, so best_estimator_ is ready
# to use. Fitting it again is redundant and, for an unseeded tree, could yield
# a different model from the one the search selected.
model_dtc = gcv.best_estimator_

y_train_pred = model_dtc.predict(x_train)
y_test_pred = model_dtc.predict(x_test)

print(f'Train score {accuracy_score(y_train_pred,y_train)}')
print(f'Test score {accuracy_score(y_test_pred,y_test)}')
plot_confusionmatrix(y_train_pred,y_train,dom='Train')
plot_confusionmatrix(y_test_pred,y_test,dom='Test')

In [None]:
# confusion_matrix expects (y_true, y_pred); rows = actual, columns = predicted.
print(confusion_matrix(y_test,y_test_pred))

In [None]:
plt.figure(figsize=(20,20))
# The names must match the columns the tree was trained on (X) one-for-one.
# data.columns also contains the target 'spam', which is not a feature; a plain
# list is passed because some scikit-learn versions reject a pandas Index here.
features = list(X.columns)
# Class names are given in ascending label order.
classes = ['No spam','spam']
tree.plot_tree(model_dtc,feature_names=features,class_names=classes,filled=True)
plt.show()

After pruning there is an improvement in test accuracy.

# Post pruning
For further improvement, let's apply cost-complexity pruning as a post-pruning technique to avoid overfitting, since decision trees are prone to overfitting.

## Cost Complexity pruning

In [None]:
# Compute the cost-complexity pruning path with the same seed (random_state=0)
# that the per-alpha models are fitted with, so the effective alphas belong to
# the tree that is actually pruned. The previous `dtclf` was an unseeded
# estimator left over from the grid search.
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(x_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities
print(ccp_alphas)

In [None]:
# Total leaf impurity as a function of the effective alpha along the pruning path.
# (The stray indentation that made this cell raise IndentationError is removed,
# as is the empty plt.plot() call, which drew nothing.)
plt.scatter(ccp_alphas,impurities)
# A labelled line gives plt.legend() an entry to show.
plt.plot(ccp_alphas,impurities,drawstyle="steps-post",label="total leaf impurity")
plt.xlabel("cost effective alpha values")
plt.ylabel("total leaf impurity")
plt.legend()
plt.title("Total impurity of the leaves vs cost effective alpha- training set")
plt.show()

In [None]:
# Fit one seeded tree per effective alpha and collect the fitted models in order.
# fit() returns the estimator itself, so each list element is the trained model.
model_list = [
    tree.DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(x_train, y_train)
    for alpha in ccp_alphas
]

The last model and its alpha value are removed, because that tree is trivial (a single node).

In [None]:
# Drop the last alpha and its model. Slicing from `path` rather than from the
# already-trimmed variables keeps this cell idempotent: re-running it does not
# remove one more element each time.
ccp_alphas = path.ccp_alphas[:-1]
model_list = model_list[:len(ccp_alphas)]

# Read the size of EACH pruned model. Reading dtclf.tree_ here instead gave
# the same value for every alpha, i.e. two flat lines.
# The name `tree` is deliberately not reassigned: it refers to the sklearn.tree
# module used by other cells.
node_counts = [model.tree_.node_count for model in model_list]
depth = [model.tree_.max_depth for model in model_list]

plt.scatter(ccp_alphas,node_counts)
plt.scatter(ccp_alphas,depth)
plt.plot(ccp_alphas,node_counts,label='no of nodes')
plt.plot(ccp_alphas,depth,label='depth')
plt.xlabel('alpha')
plt.title('Number of nodes and depth vs alpha')
plt.legend()
plt.show()

Here the observations are not clear enough

In [None]:
# Accuracy of every pruned model on the training and test partitions.
train_acc, test_acc = [], []
for pruned_model in model_list:
    y_train_pred = pruned_model.predict(x_train)
    y_test_pred = pruned_model.predict(x_test)
    train_acc += [accuracy_score(y_train_pred, y_train)]
    test_acc += [accuracy_score(y_test_pred, y_test)]

# Markers first (train, then test), then the step lines in the same order.
for accuracies in (train_acc, test_acc):
    plt.scatter(ccp_alphas, accuracies)
for accuracies, curve_label in ((train_acc, 'train_accuracy'), (test_acc, 'test_accuracy')):
    plt.plot(ccp_alphas, accuracies, label=curve_label, drawstyle="steps-post")
plt.legend()
plt.title('Accuracy vs alpha')
plt.show()

We can choose alpha = 0.01.

In [None]:

# Final post-pruned model, using the alpha selected from the accuracy plot.
clf_ = DecisionTreeClassifier(ccp_alpha=0.01, random_state=42)
clf_.fit(x_train, y_train)

# Hard class predictions on both partitions.
y_train_pred, y_test_pred = clf_.predict(x_train), clf_.predict(x_test)

train_accuracy = accuracy_score(y_train_pred, y_train)
test_accuracy = accuracy_score(y_test_pred, y_test)
print(f'Train score {train_accuracy}')
print(f'Test score {test_accuracy}')

# One confusion-matrix heatmap per partition, train first.
for split_name, split_pred, split_true in (
    ('Train', y_train_pred, y_train),
    ('Test', y_test_pred, y_test),
):
    plot_confusionmatrix(split_pred, split_true, dom=split_name)

Overfitting is not happening, and the performance on the test data has improved.

In [None]:
# Preview the first five rows to recall the column layout.
data.head(n=5)

In [None]:
# Feature names must match the training matrix X column-for-column; taking
# data.columns[:-1] is only right if the target happens to be the last column.
features = list(X.columns)
classes = ['No spam','Spam']

from sklearn.tree import export_graphviz
import graphviz

# Export and evaluate the cost-complexity-pruned model fitted above (clf_).
# Fitting a fresh DecisionTreeClassifier() here would produce an unpruned,
# unseeded tree and discard the pruning work. `dtclf` is kept as the name
# because the following cells call dtclf.predict.
dtclf = clf_

dot_data = export_graphviz(dtclf, out_file=None, feature_names=features, class_names=classes)
graph = graphviz.Source(dot_data)
graph.format = 'jpg' # set the output format to JPG
graph.render("Spam- non spam decision tree") # writes the DOT source and a JPG image of the tree

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Test-set predictions of the model held in `dtclf`.
predictions = dtclf.predict(x_test)

# Per-class precision/recall/F1 table, then the raw confusion matrix
# (rows = actual classes, columns = predicted classes).
report_text = classification_report(y_true=y_test, y_pred=predictions)
test_confusion = confusion_matrix(y_true=y_test, y_pred=predictions)
print(report_text)
print(test_confusion)

Here the size of the tree has been reduced.

In [None]:
from sklearn.metrics import precision_score
from sklearn import metrics

accuracy_dtc = metrics.accuracy_score(y_test,predictions)
prec_dtc = precision_score(y_test, predictions)
# Sensitivity: recall of the positive class, i.e. how well the model recognises positives.
Sensitivity_recall_dtc = metrics.recall_score(y_test, predictions)
# Specificity: recall of the NEGATIVE class, i.e. how well the model recognises negatives.
# Without pos_label=0 this was just a second copy of the sensitivity.
# Assumes the negative (non-spam) class is encoded as 0 — confirm against the data.
Specificity_dtc = metrics.recall_score(y_test, predictions, pos_label=0)
F_score_dtc = metrics.f1_score(y_test,predictions)

print("accuracy of Decision Tree  model: ",accuracy_dtc)
print("Precision of Decision Tree  model: ",prec_dtc)
print("Sensitivity of Decision Tree  model: ",Sensitivity_recall_dtc)
print("Specificity of Decision Tree model: ",Specificity_dtc)
print("F1 score of Decision Tree  model: ",F_score_dtc)