# Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing Dataset

In [2]:
# Load the CSV, then slice it column-wise: every column except the last one
# forms the feature matrix X, and the last column is the target vector y.
dataset = pd.read_csv('Social_Network_Ads.csv')
X = dataset.iloc[:, :-1].to_numpy()
y = dataset.iloc[:, -1].to_numpy()

In [3]:
# Preview the first five rows to sanity-check the column layout.
dataset.head(5)

Unnamed: 0,Age,EstimatedSalary,Purchased
0,19,19000,0
1,35,20000,0
2,26,43000,0
3,27,57000,0
4,19,76000,0


# Train and Test Split

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the partition
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# Feature Scaling

In [5]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# scaler to both partitions so the test rows never influence the statistics.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# Random Forest: Training and Getting Accuracies

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees split on entropy; the seed is fixed so the
# reported accuracies are reproducible.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Mean accuracy on the data the forest was fitted on vs. the held-out rows.
print(f'Accuracy of RF classifier on training set: {classifier.score(X_train, y_train):.2f}')
print(f'Accuracy of RF classifier on test set: {classifier.score(X_test, y_test):.2f}')

Accuracy of RF classifier on training set: 0.98
Accuracy of RF classifier on test set: 0.91


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Same 10-tree forest as the previous cell, but splitting on Gini impurity
# instead of entropy so the two criteria can be compared.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    random_state=0,
).fit(X_train, y_train)

print(f'Accuracy of RF classifier on training set: {classifier.score(X_train, y_train):.2f}')
print(f'Accuracy of RF classifier on test set: {classifier.score(X_test, y_test):.2f}')

Accuracy of RF classifier on training set: 0.98
Accuracy of RF classifier on test set: 0.92


In [8]:
from sklearn.ensemble import RandomForestClassifier

# Sweep the forest size from 1 to 19 trees (Gini criterion) and report the
# training and test accuracy for each size.
for i in range(1, 20):
    classifier = RandomForestClassifier(n_estimators=i, criterion='gini', random_state=0)
    classifier.fit(X_train, y_train)
    print(f'n_estimators = {i:.0f}, Accuracy of RF classifier with gini index on training set: '
          f'{classifier.score(X_train, y_train):.2f}')
    print(f'n_estimators = {i:.0f}, Accuracy of RF classifier with gini index on test set: '
          f'{classifier.score(X_test, y_test):.2f}')

n_estimators = 1, Accuracy of RF classifier with gini index on training set: 0.93
n_estimators = 1, Accuracy of RF classifier with gini index on test set: 0.89
n_estimators = 2, Accuracy of RF classifier with gini index on training set: 0.95
n_estimators = 2, Accuracy of RF classifier with gini index on test set: 0.88
n_estimators = 3, Accuracy of RF classifier with gini index on training set: 0.96
n_estimators = 3, Accuracy of RF classifier with gini index on test set: 0.92
n_estimators = 4, Accuracy of RF classifier with gini index on training set: 0.96
n_estimators = 4, Accuracy of RF classifier with gini index on test set: 0.93
n_estimators = 5, Accuracy of RF classifier with gini index on training set: 0.97
n_estimators = 5, Accuracy of RF classifier with gini index on test set: 0.92
n_estimators = 6, Accuracy of RF classifier with gini index on training set: 0.98
n_estimators = 6, Accuracy of RF classifier with gini index on test set: 0.93
n_estimators = 7, Accuracy of RF classif

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Sweep the forest size from 1 to 19 trees (entropy criterion) and report the
# training and test accuracy for each size.
for i in range(1, 20):
    classifier = RandomForestClassifier(n_estimators=i, criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)
    print(f'n_estimators = {i:.0f}, Accuracy of RF classifier with entropy index on training set: '
          f'{classifier.score(X_train, y_train):.2f}')
    print(f'n_estimators = {i:.0f}, Accuracy of RF classifier with entropy index on test set: '
          f'{classifier.score(X_test, y_test):.2f}')

n_estimators = 1, Accuracy of RF classifier with entropy index on training set: 0.93
n_estimators = 1, Accuracy of RF classifier with entropy index on test set: 0.88
n_estimators = 2, Accuracy of RF classifier with entropy index on training set: 0.95
n_estimators = 2, Accuracy of RF classifier with entropy index on test set: 0.86
n_estimators = 3, Accuracy of RF classifier with entropy index on training set: 0.96
n_estimators = 3, Accuracy of RF classifier with entropy index on test set: 0.91
n_estimators = 4, Accuracy of RF classifier with entropy index on training set: 0.96
n_estimators = 4, Accuracy of RF classifier with entropy index on test set: 0.91
n_estimators = 5, Accuracy of RF classifier with entropy index on training set: 0.96
n_estimators = 5, Accuracy of RF classifier with entropy index on test set: 0.91
n_estimators = 6, Accuracy of RF classifier with entropy index on training set: 0.97
n_estimators = 6, Accuracy of RF classifier with entropy index on test set: 0.90
n_es

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Refit the 18-tree Gini forest on its own; the confusion-matrix and
# single-sample prediction cells below use this `classifier`.
classifier = RandomForestClassifier(n_estimators=18, criterion='gini', random_state=0)
classifier.fit(X_train, y_train)

# The tree count is read back from the estimator so the printed value cannot
# drift from the one the model was built with.
print(f'n_estimators = {classifier.n_estimators:.0f}, Accuracy of RF classifier with gini index on training set: '
      f'{classifier.score(X_train, y_train):.2f}')
print(f'n_estimators = {classifier.n_estimators:.0f}, Accuracy of RF classifier with gini index on test set: '
      f'{classifier.score(X_test, y_test):.2f}')

n_estimators = 18, Accuracy of RF classifier with gini index on training set: 0.99
n_estimators = 18, Accuracy of RF classifier with gini index on test set: 0.93


In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict the test rows with the current `classifier` and tabulate the hits
# and misses: rows are the true classes, columns are the predicted classes.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[64  4]
 [ 3 29]]


In [12]:
# Classify one hand-written sample (feature order as in the dataset preview:
# Age, EstimatedSalary). The raw values go through the already fitted scaler
# `sc` first because the classifier was trained on scaled features.
print(
    classifier.predict(
        sc.transform([[30, 87000]])
    )
)

[0]


# Gradient Boosted Decision Trees

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

# Carve a validation split out of the 75% training partition.
#
# The partition is rebuilt from X / y (same test_size and seed as the original
# split, scaled with the already fitted `sc`) instead of being read from the
# current X_train / y_train. Splitting X_train in place is self-referential:
# every re-run of the cell would split the already shrunken data again and
# silently change every score below. Rebuilding it makes the cell idempotent
# while producing the same arrays as a first run of the notebook.
full_split = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_train_full = sc.transform(full_split[0])
y_train_full = full_split[2]

# NOTE: X_train / X_test / y_train / y_test are rebound here and are what the
# following GBDT cells use. From this point on, "test" means this validation
# split of the training partition, not the hold-out set the random forest
# cells were scored on.
X_train, X_test, y_train, y_test = train_test_split(X_train_full, y_train_full, random_state = 0)

# Baseline: gradient boosting with the library's default hyperparameters.
clf = GradientBoostingClassifier(random_state = 0)
clf.fit(X_train, y_train)

print('Accuracy of GBDT classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of GBDT classifier on test set: {:.2f}\n'
     .format(clf.score(X_test, y_test)))

# Second configuration: a much smaller learning rate and shallower trees than
# the default model above.
clf = GradientBoostingClassifier(learning_rate = 0.01, max_depth = 2, random_state = 0)
clf.fit(X_train, y_train)

print('Accuracy of GBDT classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of GBDT classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of GBDT classifier on training set: 0.99
Accuracy of GBDT classifier on test set: 0.81

Accuracy of GBDT classifier on training set: 0.95
Accuracy of GBDT classifier on test set: 0.80


In [14]:
# Exhaustive sweep over learning_rate (0.01 .. 0.09) x max_depth (1 .. 9),
# printing the training and test accuracy of every combination.
# Both lines now print the learning rate with the same two-decimal precision;
# previously the test line used three decimals ("0.010" vs "0.01"), which made
# the paired lines harder to match up.
for i in (0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09):
    for j in range(1, 10):
        clf = GradientBoostingClassifier(learning_rate = i, max_depth = j, random_state = 0)
        clf.fit(X_train, y_train)
        print('learning_rate = {:.2f}, max_depth = {:.0f}, Accuracy of GBDT classifier on training set: {:.2f}'
             .format(i, j, clf.score(X_train, y_train)))
        print('learning_rate = {:.2f}, max_depth = {:.0f}, Accuracy of GBDT classifier on test set: {:.2f}'
             .format(i, j, clf.score(X_test, y_test)))

learning_rate = 0.01, max_depth = 1, Accuracy of GBDT classifier on training set: 0.72
learning_rate = 0.010, max_depth = 1, Accuracy of GBDT classifier on test set: 0.63
learning_rate = 0.01, max_depth = 2, Accuracy of GBDT classifier on training set: 0.95
learning_rate = 0.010, max_depth = 2, Accuracy of GBDT classifier on test set: 0.80
learning_rate = 0.01, max_depth = 3, Accuracy of GBDT classifier on training set: 0.96
learning_rate = 0.010, max_depth = 3, Accuracy of GBDT classifier on test set: 0.79
learning_rate = 0.01, max_depth = 4, Accuracy of GBDT classifier on training set: 0.98
learning_rate = 0.010, max_depth = 4, Accuracy of GBDT classifier on test set: 0.77
learning_rate = 0.01, max_depth = 5, Accuracy of GBDT classifier on training set: 0.99
learning_rate = 0.010, max_depth = 5, Accuracy of GBDT classifier on test set: 0.75
learning_rate = 0.01, max_depth = 6, Accuracy of GBDT classifier on training set: 0.99
learning_rate = 0.010, max_depth = 6, Accuracy of GBDT cla

learning_rate = 0.06, max_depth = 5, Accuracy of GBDT classifier on training set: 1.00
learning_rate = 0.060, max_depth = 5, Accuracy of GBDT classifier on test set: 0.77
learning_rate = 0.06, max_depth = 6, Accuracy of GBDT classifier on training set: 1.00
learning_rate = 0.060, max_depth = 6, Accuracy of GBDT classifier on test set: 0.75
learning_rate = 0.06, max_depth = 7, Accuracy of GBDT classifier on training set: 1.00
learning_rate = 0.060, max_depth = 7, Accuracy of GBDT classifier on test set: 0.76
learning_rate = 0.06, max_depth = 8, Accuracy of GBDT classifier on training set: 1.00
learning_rate = 0.060, max_depth = 8, Accuracy of GBDT classifier on test set: 0.75
learning_rate = 0.06, max_depth = 9, Accuracy of GBDT classifier on training set: 1.00
learning_rate = 0.060, max_depth = 9, Accuracy of GBDT classifier on test set: 0.76
learning_rate = 0.07, max_depth = 1, Accuracy of GBDT classifier on training set: 0.96
learning_rate = 0.070, max_depth = 1, Accuracy of GBDT cla

In [15]:
# Refit the configuration picked from the sweep (learning_rate 0.05, depth 3)
# and report its accuracy on both splits.
clf = GradientBoostingClassifier(learning_rate = 0.05, max_depth = 3, random_state = 0)
clf.fit(X_train, y_train)

# The hyperparameters are read back from the estimator instead of being
# repeated as literals, so the report cannot drift from the fitted model.
# Both lines use the same two-decimal precision for the learning rate
# (the test line previously used three decimals).
print('learning_rate = {:.2f}, max_depth = {:.0f}, Accuracy of GBDT classifier on training set: {:.2f}'
      .format(clf.learning_rate, clf.max_depth, clf.score(X_train, y_train)))
print('learning_rate = {:.2f}, max_depth = {:.0f}, Accuracy of GBDT classifier on test set: {:.2f}'
      .format(clf.learning_rate, clf.max_depth, clf.score(X_test, y_test)))

learning_rate = 0.05, max_depth = 3, Accuracy of GBDT classifier on training set: 0.98
learning_rate = 0.050, max_depth = 3, Accuracy of GBDT classifier on test set: 0.80
