In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
!pip install --upgrade scikit-learn==0.20.3
!pip install pydotplus
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from matplotlib import pyplot


    
def plot_trees(model):
    """Draw each fitted tree of an ensemble, showing one figure per tree.

    Parameters
    ----------
    model : fitted ensemble
        Must expose ``estimators_`` as a sequence whose items are indexable;
        element ``[0]`` of each item is the tree that gets drawn.
        (presumably a gradient-boosting model -- confirm against the caller)

    Notes
    -----
    Uses the notebook-level names ``tree`` (``from sklearn import tree``) and
    ``pyplot``; both must be imported before this function is called.
    NOTE(review): ``tree.plot_tree`` is not available in every scikit-learn
    release -- confirm that the version pinned by this notebook provides it.
    """
    # Read the trees from the argument. The previous version looked up a
    # notebook global named `gradient_boosting_model` and ignored `model`.
    for stage in model.estimators_:
        tree.plot_tree(stage[0])
        pyplot.show()
        
def plot_regressor(model, features, labels):
    """Scatter the observations and overlay the model's prediction curve.

    The curve is evaluated on 1000 evenly spaced points between 0 and 85,
    passed to ``model.predict`` as a single-column 2-D array.

    Parameters
    ----------
    model : object with a ``predict`` method
    features, labels : array-likes forwarded unchanged to ``pyplot.scatter``
    """
    grid = np.linspace(0, 85, num=1000)
    pyplot.scatter(features, labels)
    # predict() is given one column, so the 1-D grid becomes shape (1000, 1).
    curve = model.predict(grid.reshape(-1, 1))
    pyplot.plot(grid, curve)
    pyplot.xlabel("Age")
    pyplot.ylabel("Days per week")
    pyplot.show()
    
from matplotlib import pyplot as plt


In [2]:
# Some functions to plot our points and draw the lines
def plot_points(features, labels, fix_margins=True):
    """Scatter two-column points, styled by their 0/1 label.

    Rows whose label equals 1 are drawn as cyan triangles and rows whose label
    equals 0 as red squares. The legend names them 'Spam' and 'Ham' in that
    order, so the label-1 group must be drawn first.

    Parameters
    ----------
    features : array-like with two columns (x = column 0, y = column 1)
    labels : array-like of 0/1 values, one per row of ``features``
    fix_margins : bool, default True
        When true, both axes are clamped to the range 0..11.
    """
    points = np.array(features)
    classes = np.array(labels)

    if fix_margins:
        pyplot.xlim(0, 11)
        pyplot.ylim(0, 11)

    # (label value, fill colour, marker) -- label 1 first to match the legend.
    for class_value, fill, shape in ((1, 'cyan', '^'), (0, 'red', 's')):
        members = points[classes == class_value]
        pyplot.scatter(members[:, 0],
                       members[:, 1],
                       s=100,
                       color=fill,
                       edgecolor='k',
                       marker=shape)

    pyplot.xlabel('Lottery')
    pyplot.ylabel('Sale')
    pyplot.legend(['Spam', 'Ham'])

def plot_model(X, y, model, fix_margins=True, plot_step=0.01):
    """Plot the labelled points together with the model's decision regions.

    The model is evaluated on a regular grid over the plotting area; the
    predictions are drawn as translucent filled contours (red / blue bands
    between the levels -1, 0 and 1) with black boundary lines on top.

    Parameters
    ----------
    X : array-like with two columns
    y : array-like of 0/1 labels, one per row of ``X``
    model : object whose ``predict`` accepts an (n, 2) array and returns one
        value per row
    fix_margins : bool, default True
        When true, the grid spans 0..12 on both axes; otherwise it spans the
        data range padded by 1 on each side. The flag is also forwarded to
        ``plot_points`` so that ``fix_margins=False`` no longer leaves the
        axes clamped by the scatter step.
    plot_step : float, default 0.01
        Spacing between grid points.
    """
    X = np.array(X)
    y = np.array(y)
    # Forward the flag: previously plot_points always ran with its default
    # (fixed limits), which silently overrode fix_margins=False.
    plot_points(X, y, fix_margins=fix_margins)

    if fix_margins:
        x_min, x_max = 0, 12
        y_min, y_max = 0, 12
    else:
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    # One prediction per grid node, folded back into the grid's 2-D shape.
    Z = np.asarray(model.predict(np.c_[xx.ravel(), yy.ravel()])).reshape(xx.shape)
    pyplot.contourf(xx, yy, Z, colors=['red', 'blue'], alpha=0.2, levels=range(-1, 2))
    pyplot.contour(xx, yy, Z, colors='k', linewidths=3)
    pyplot.show()

def display_tree(dt):
    """Render a fitted decision tree as a PNG image for inline display.

    Parameters
    ----------
    dt : fitted tree estimator accepted by ``sklearn.tree.export_graphviz``

    Returns
    -------
    IPython.display.Image
        PNG produced by pydotplus from the exported DOT source.
    """
    from IPython.display import Image
    from sklearn.tree import export_graphviz
    import pydotplus

    # With out_file=None, export_graphviz returns the DOT source as a string.
    # That removes the need for the StringIO buffer previously imported from
    # sklearn.externals.six, a vendored module missing from newer releases.
    dot_source = export_graphviz(dt, out_file=None,
                                 filled=True, rounded=True,
                                 special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_source)
    return Image(graph.create_png())

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

np.random.seed(0)

# Spam Email dataset: one row per email, laid out as [Lottery, Sale, Spam].
# The third value is the 0/1 label that the plots and classifiers use.
emails = np.array([
    [7, 8, 1], [3, 2, 0], [8, 4, 1],
    [2, 6, 0], [6, 5, 1], [9, 6, 1],
    [8, 5, 0], [7, 1, 0], [1, 9, 1],
    [4, 7, 0], [1, 3, 0], [3, 10, 1],
    [2, 2, 1], [9, 3, 0], [5, 3, 0],
    [10, 1, 0], [5, 9, 1], [10, 8, 1],
])
column_names = ["Lottery", "Sale", "Spam"]
spam_dataset = pd.DataFrame(emails, columns=column_names)
spam_dataset
        


# **Decision Tree Classifier**

**Max Depth = 2**

In [4]:
# Split the dataset into the two input columns and the 0/1 target.
features = spam_dataset[['Lottery', 'Sale']]
labels = spam_dataset['Spam']
plot_points(features, labels)

# Decision Tree
dt_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
dt_clf.fit(features, labels)
# A bare expression in the middle of a cell is evaluated and then discarded,
# so the score has to be printed to be visible. It is measured on the same
# rows the tree was fitted on.
print("Training accuracy:", dt_clf.score(features, labels))

# Draw decision tree (last expression of the cell, so the image is displayed)
display_tree(dt_clf)



In [5]:
# Decision tree as a map: scatter the emails and shade the feature plane by
# the class dt_clf predicts at each grid point.
plot_model(features, labels, dt_clf)

**Max Depth = 4**

In [6]:
# Refit the same data with a deeper tree (at most 4 levels).
dt_clf = DecisionTreeClassifier(max_depth=4, random_state=42)
dt_clf.fit(features, labels)
# Print the score: a bare mid-cell expression would be silently discarded.
# It is measured on the rows the tree was fitted on.
print("Training accuracy:", dt_clf.score(features, labels))

# Draw decision tree (last expression of the cell, so the image is displayed)
display_tree(dt_clf)

In [7]:
# Decision tree as a map: scatter the emails and shade the feature plane by
# the class dt_clf predicts at each grid point.
plot_model(features, labels, dt_clf)

**Max Depth = 6**

In [8]:
# Refit the same data with a tree allowed to grow up to 6 levels.
dt_clf = DecisionTreeClassifier(max_depth=6, random_state=42)
dt_clf.fit(features, labels)
# Print the score: a bare mid-cell expression would be silently discarded.
# It is measured on the rows the tree was fitted on.
print("Training accuracy:", dt_clf.score(features, labels))

# Draw decision tree (last expression of the cell, so the image is displayed)
display_tree(dt_clf)

In [9]:
# Decision tree as a map: scatter the emails and shade the feature plane by
# the class dt_clf predicts at each grid point.
plot_model(features, labels, dt_clf)

## From the above plots we can see:

* max_depth 2: many misclassified points
* max_depth 4: 2 misclassified points
* max_depth 6: 1 misclassified point

# **Random Forest Classifier**

`n_estimators`: int, default=100.
The number of trees in the forest.

In [10]:
# Training a Random Forest: 10 trees, each limited to a single split level
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(random_state=0, n_estimators=10, max_depth=1)
rf_clf.fit(features, labels)
# Print the score: a bare mid-cell expression would be silently discarded.
# It is measured on the rows the forest was fitted on.
print("Training accuracy:", rf_clf.score(features, labels))

# Predictions on the fitted rows; later cells read y_pred.
y_pred = rf_clf.predict(features)

# plot
plot_model(features, labels, rf_clf)

#Plotting the points
plot_points(features, labels)

In [11]:
from sklearn import metrics
# NOTE: despite its name, y_test is the full label column the models were
# fitted on, so the figures below are training-set errors.
y_test = labels
# Assuming labels and predictions are both 0/1, each absolute error equals its
# squared error, so MAE and MSE both report the misclassification rate.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [13]:
import seaborn as sns
# Overlay the density curves of the actual labels (yellow) and the predicted
# labels (green) on one axes; hist=False draws the curve without a histogram.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; confirm
# that the installed version still provides it.
plt.figure(figsize=(5, 5))
ax = sns.distplot(y_test, hist=False, color="y", label="Actual Value")
# Passing ax=ax draws the second curve onto the axes returned by the first call.
sns.distplot(y_pred, hist=False, color="g", label="Predicted Values" , ax=ax)
plt.title('Random Forest Classifier: Actual Vs Fitted')
plt.show()
plt.close()

In [14]:
# Larger forest: 20 trees, each allowed two split levels.
rf_clf = RandomForestClassifier(random_state=0, n_estimators=20, max_depth=2)
rf_clf.fit(features, labels)
# Print the score: a bare mid-cell expression would be silently discarded.
# It is measured on the rows the forest was fitted on.
print("Training accuracy:", rf_clf.score(features, labels))
y_pred = rf_clf.predict(features)
# plot
plot_model(features, labels, rf_clf)

In [15]:
# Make predictions (on the same rows the forest was fitted on)
y_pred = rf_clf.predict(features)
# Performance metrics
# With 0/1 labels and predictions, each absolute difference is 1 for a
# misclassified row and 0 for a correct one.
errors = abs(y_pred - labels)

# The mean of those 0/1 errors is the misclassification rate. It has no
# physical unit, so the stray 'degrees.' suffix is gone.
print('Average absolute error:', round(np.mean(errors), 2))
# A true MAPE would divide each error by its label, but the label is 0 for
# every non-spam row, which yields NaN or infinity and wrecks the accuracy.
# The variable name is kept for compatibility; it now holds the percentage of
# misclassified rows.
mean_absolute_percentage_error = 100 * np.mean(errors)
# Calculate and display accuracy
accuracy = 100 - mean_absolute_percentage_error
print('Accuracy: ', round(accuracy, 2), '%.')

# **Adaboost Classifier**

`n_estimators`: int, default=50.
The maximum number of estimators at which boosting is terminated. In case of a perfect fit, the learning procedure is stopped early.

In [16]:
# Training a AdaBoost
!pip install scipy
from sklearn.ensemble import AdaBoostClassifier

# Boosting with up to 100 estimators (default base learner).
ab_clf = AdaBoostClassifier(n_estimators=100, random_state=0)
ab_clf.fit(features, labels)

# Print the score: a bare mid-cell expression would be silently discarded.
# It is measured on the rows the model was fitted on.
print("Training accuracy:", ab_clf.score(features, labels))

plot_model(features, labels, ab_clf)
plot_points(features, labels)

In [19]:
# Make predictions (on the same rows the model was fitted on)
y_pred = ab_clf.predict(features)

# Performance metrics
# With 0/1 labels and predictions, each absolute difference is 1 for a
# misclassified row and 0 for a correct one.
errors = abs(y_pred - labels)

# The mean of those 0/1 errors is the misclassification rate. It has no
# physical unit, so the stray 'degrees.' suffix is gone.
print('Average absolute error:', round(np.mean(errors), 2))
# A true MAPE would divide each error by its label, but the label is 0 for
# every non-spam row, which yields NaN or infinity and wrecks the accuracy.
# The variable name is kept for compatibility; it now holds the percentage of
# misclassified rows.
mape = 100 * np.mean(errors)
# Calculate and display accuracy
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')

In [20]:
import seaborn as sns
# Overlay the density curves of the actual labels (red) and the predicted
# labels (green) on one axes; hist=False draws the curve without a histogram.
# y_test and y_pred are notebook globals set by earlier cells.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; confirm
# that the installed version still provides it.
plt.figure(figsize=(5, 5))
ax = sns.distplot(y_test, hist=False, color="r", label="Actual Value")
# Passing ax=ax draws the second curve onto the axes returned by the first call.
sns.distplot(y_pred, hist=False, color="g", label="Fitted Values" , ax=ax)
plt.title('AdaBoost Classifications: Actual vs Fitted')
plt.show()
plt.close()

In [21]:
# Aliases for the current predictions and the true labels.
y_pred_nb = y_pred
ytest = labels
from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# Each metric compares the true labels with the hard 0/1 predictions.
acc = accuracy_score(ytest, y_pred_nb)
prec = precision_score(ytest, y_pred_nb)
rec = recall_score(ytest, y_pred_nb)
f1 = f1_score(ytest, y_pred_nb)
roc = roc_auc_score(ytest, y_pred_nb)

# One-row summary table; the key order fixes the column order.
model = pd.DataFrame({
    'Model': ['Adaboost'],
    'Accuracy': [acc],
    'Precision': [prec],
    'Recall': [rec],
    'F1 Score': [f1],
    'ROC': [roc],
})
model

In [22]:
from sklearn.ensemble import AdaBoostClassifier

# Boost the current dt_clf (the most recently fitted decision tree in this
# notebook) instead of the default base learner, using up to 150 estimators.
ab_clf = AdaBoostClassifier(base_estimator=dt_clf, n_estimators=150, random_state=0)
ab_clf.fit(features, labels)
# Single-sample prediction for Lottery=8, Sale=4. This overwrites the y_pred
# array from earlier cells with a one-element result.
y_pred = ab_clf.predict([[8,4]])
# Accuracy on the rows the model was fitted on.
score  = ab_clf.score(features, labels)
print("Predicted Value\n", y_pred)
print(score)
plot_model(features, labels, ab_clf)

From the above results, we can see that the "AdaBoost Classifier" gives the best results. Note that every score above was measured on the same data the models were trained on.