<a href="https://colab.research.google.com/github/Karan1928/Python-for-DS/blob/main/DecisionTreeCompany.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
import pydotplus

In [4]:
company = pd.read_csv('Company_Data.csv')
company

Unnamed: 0,Sales,CompPrice,Income,Advertising,Population,Price,ShelveLoc,Age,Education,Urban,US
0,9.50,138,73,11,276,120,Bad,42,17,Yes,Yes
1,11.22,111,48,16,260,83,Good,65,10,Yes,Yes
2,10.06,113,35,10,269,80,Medium,59,12,Yes,Yes
3,7.40,117,100,4,466,97,Medium,55,14,Yes,Yes
4,4.15,141,64,3,340,128,Bad,38,13,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...
395,12.57,138,108,17,203,128,Good,33,14,Yes,Yes
396,6.14,139,23,3,37,120,Medium,55,11,No,Yes
397,7.41,162,26,12,368,159,Medium,40,18,Yes,Yes
398,5.94,100,79,7,284,95,Bad,50,12,Yes,Yes


In [5]:
#Checking for maximum and minimum values to decide what will be the cut off point
company["Sales"].min()
company["Sales"].max()
company["Sales"].value_counts()

7.80     4
6.67     3
8.77     3
9.32     3
5.87     3
        ..
8.89     1
13.39    1
9.14     1
5.07     1
9.50     1
Name: Sales, Length: 336, dtype: int64

In [6]:
# median
np.median(company["Sales"])
company["sales"]= "<=7.49"
company.loc[company["Sales"]>=7.49,"sales"]=">=7.49"

In [7]:
company["sales"].unique()
company["sales"].value_counts()

>=7.49    201
<=7.49    199
Name: sales, dtype: int64

In [8]:
#Dropping Sales column
company.drop(["Sales"],axis=1,inplace = True)

In [9]:
# Company data has no null values
company.isnull().sum()

CompPrice      0
Income         0
Advertising    0
Population     0
Price          0
ShelveLoc      0
Age            0
Education      0
Urban          0
US             0
sales          0
dtype: int64

In [10]:
company.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   CompPrice    400 non-null    int64 
 1   Income       400 non-null    int64 
 2   Advertising  400 non-null    int64 
 3   Population   400 non-null    int64 
 4   Price        400 non-null    int64 
 5   ShelveLoc    400 non-null    object
 6   Age          400 non-null    int64 
 7   Education    400 non-null    int64 
 8   Urban        400 non-null    object
 9   US           400 non-null    object
 10  sales        400 non-null    object
dtypes: int64(7), object(4)
memory usage: 34.5+ KB


In [11]:
le = preprocessing.LabelEncoder()

In [12]:
for column_name in company.columns:
    if company[column_name].dtype == object:
        company[column_name] = le.fit_transform(company[column_name])
    else:
        pass

In [13]:
features = company.iloc[:,0:10] 
labels = company.iloc[:,10]

In [14]:
x_train,x_test,y_train,y_test = train_test_split(features,labels,test_size=0.3,stratify = labels)

In [15]:
y_train.value_counts()
y_test.value_counts()

1    60
0    60
Name: sales, dtype: int64

In [16]:
model = DT(criterion='entropy') 
model.fit(x_train,y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [17]:
#prediction on Training data
pred_train = pd.DataFrame(model.predict(x_train))

In [18]:
#Finding Accuracy for train data
acc_train = accuracy_score(y_train,pred_train)

In [19]:
print(acc_train)

1.0


In [20]:
# Confusion matrix
confusion_mat = pd.DataFrame(confusion_matrix(y_train,pred_train,))

In [21]:
#prediction on test data
pred_test = pd.DataFrame(model.predict(x_test))

In [22]:
print(pred_test)

     0
0    1
1    1
2    1
3    1
4    1
..  ..
115  1
116  0
117  1
118  1
119  0

[120 rows x 1 columns]


In [23]:
#Accuracy on test data
acc_test = accuracy_score(y_test,pred_test)

In [24]:
print(acc_test)

0.7


In [25]:
#Confusion matrix
confusion_test = pd.DataFrame(confusion_matrix(y_test,pred_test))

In [26]:
#Visualization of decision trees
colnames = list(company.columns)
predictors = colnames[:10]
target = colnames[10]

In [27]:
dot_data = StringIO()

In [28]:
export_graphviz(model,out_file = dot_data, filled =True, rounded = True, feature_names =predictors,class_names = target, impurity = False )
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

In [29]:
#Creating the pdf file of decision tree
graph.write_pdf('company.pdf')


True

In [31]:
#Creating a png file of the decsion tree
graph.write_png('company.png')

True