In [1]:
# This project is classifying customers according to age and estimated salary to those who might or might not purchase the product
# It uses Decision tree classifier 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [3]:
dataset = pd.read_csv('Social_Network_Ads.csv').drop('User ID', axis=1)
dataset.head()

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [15]:
X = dataset.iloc[:,[1,2]].values
y = dataset.iloc[:,-1].values

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [17]:
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)



In [18]:
#Checking the p-value through OLS regression
import statsmodels.formula.api as sm
# Add column of ones at the first column of the equation to account for x0= 1
#for the constant number in the linear regression equation
#since the stats model library doesnt take account for b0
#Training the regressor_OLS on all the data set
X_opt= X_train[:,[0,1]]
regressor_OLS=sm.OLS(endog=y_train,exog=X_opt).fit() 
regressor_OLS.summary()


0,1,2,3
Dep. Variable:,y,R-squared:,0.289
Model:,OLS,Adj. R-squared:,0.284
Method:,Least Squares,F-statistic:,60.61
Date:,"Mon, 17 Sep 2018",Prob (F-statistic):,8.209999999999999e-23
Time:,15:49:13,Log-Likelihood:,-223.99
No. Observations:,300,AIC:,452.0
Df Residuals:,298,BIC:,459.4
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.2824,0.030,9.463,0.000,0.224,0.341
x2,0.1286,0.030,4.307,0.000,0.070,0.187

0,1,2,3
Omnibus:,16.553,Durbin-Watson:,0.961
Prob(Omnibus):,0.0,Jarque-Bera (JB):,7.354
Skew:,0.12,Prob(JB):,0.0253
Kurtosis:,2.272,Cond. No.,1.14


In [20]:
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [22]:
y_pred = classifier.predict(X_test)

In [24]:
from sklearn.metrics import confusion_matrix, classification_report

print('Confusion Matrix: \n', confusion_matrix(y_test, y_pred),'\n')
print('Classification Report: \n', classification_report(y_test, y_pred))

Confusion Matrix: 
 [[59  8]
 [ 7 26]] 

Classification Report: 
              precision    recall  f1-score   support

          0       0.89      0.88      0.89        67
          1       0.76      0.79      0.78        33

avg / total       0.85      0.85      0.85       100



In [None]:
# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i),  edgecolors='black', label = j)
plt.title('Decision Tree Classification (Training set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

In [None]:
# Visualising the Test set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha = 0.75, cmap = ListedColormap(('red', 'green')))
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'green'))(i),  edgecolors='black',label = j)
plt.title('Decision Tree Classification (Test set)')
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()