# CLASSIFICATION WITH DECISION TREE

We will be using the wine quality data set for these exercises. This data set contains various chemical properties of wine, such as acidity, sugar, pH, and alcohol. It also contains a quality metric (3-9, with higher being better) and a color (red or white). The name of the file is Wine_Quality_Data.csv.

In [None]:
# Importing common libraries 
import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings 
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Lift pandas' display truncation so every column and row of a frame is rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

### 1. IMPORT DATA

In [None]:
# Load the wine-quality table from the notebook's local data directory.
data = pd.read_csv(filepath_or_buffer="data/Wine_Quality_Data.csv")

In [None]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [None]:
# Inspect the dtype of every column (only `color` is non-numeric).
data.dtypes

fixed_acidity           float64
volatile_acidity        float64
citric_acid             float64
residual_sugar          float64
chlorides               float64
free_sulfur_dioxide     float64
total_sulfur_dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
color                    object
dtype: object

In [None]:
# (number of rows, number of columns)
data.shape

(6497, 13)

In [None]:
# Class balance of the target: how many wines of each color.
data.color.value_counts()

white    4898
red      1599
Name: color, dtype: int64

In [None]:
#data

In [None]:
# Separate the predictors (every column except `color`) from the target (`color`).
X = data.drop(columns='color')
y = data['color']

### SPLIT TRAIN AND TEST USING STRATIFIEDSHUFFLESPLIT

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Hold out 30% of the rows for testing while preserving the red/white ratio
# of `y` in both parts. n_splits=1 means the splitter yields a single
# (train, test) pair of index arrays.
strat_shuff_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# Take that single pair from the generator returned by split()
train_idx, test_idx = next(strat_shuff_split.split(X, y))

# split() returns *positional* indices, so select with .iloc rather than the
# label-based .loc; the two only coincide when the frame has a default
# RangeIndex. The original row labels are kept on the resulting objects.
X_train = X.iloc[train_idx]
y_train = y.iloc[train_idx]

X_test = X.iloc[test_idx]
y_test = y.iloc[test_idx]

In [None]:
# Check the proportion of each wine color (red / white) in the training set;
# the stratified split should make it match the test set's proportions.
y_train.value_counts(normalize=True).sort_index()

red      0.246096
white    0.753904
Name: color, dtype: float64

In [None]:
# Proportion of each wine color in the test set.
y_test.value_counts(normalize=True).sort_index()

red      0.246154
white    0.753846
Name: color, dtype: float64

### CREATE DT MODEL

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Grow an unconstrained tree on the training split (seeded for repeatability).
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

In [None]:
# Size of the unpruned tree: (number of nodes, maximum depth).
dt.tree_.node_count, dt.tree_.max_depth

(137, 17)

In [None]:
# Predict the color of every held-out wine with the unpruned tree.
y_pred=dt.predict(X_test)

In [None]:
# Side-by-side table of true and predicted colors, indexed like y_test.
df_dt = pd.DataFrame(data=dict(Actual=y_test, Predicted=y_pred.flatten()))
df_dt

Unnamed: 0,Actual,Predicted
5102,white,white
3094,white,white
1360,red,red
1336,red,red
5447,white,white
2474,white,white
3712,white,white
1158,red,red
1148,red,red
991,red,red


In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the unpruned tree on the test set.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

         red       0.95      0.98      0.97       480
       white       0.99      0.98      0.99      1470

    accuracy                           0.98      1950
   macro avg       0.97      0.98      0.98      1950
weighted avg       0.98      0.98      0.98      1950



### GRIDSEARCHCV TO PRUNE TREE

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: every other depth from 1 up to the unpruned tree's depth,
# crossed with every possible number of features considered per split.
param_grid = dict(
    max_depth=range(1, dt.tree_.max_depth + 1, 2),
    max_features=range(1, dt.feature_importances_.size + 1),
)

# Cross-validated accuracy search over the grid, using all available cores.
GR = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

In [None]:
# The tree configuration selected by the grid search.
GR.best_estimator_

In [None]:
# Node count of the selected tree, for comparison with the unpruned tree.
GR.best_estimator_.tree_.node_count

In [None]:
# Predict on the held-out set with the grid-searched model.
y_best_pred=GR.predict(X_test)

In [None]:
# True vs. predicted colors for the tuned tree, indexed like y_test.
df_gr = pd.DataFrame(data=dict(Actual=y_test, Predicted=y_best_pred.flatten()))
df_gr

In [None]:
# Per-class precision / recall / F1 for the tuned tree on the test set.
report = classification_report(y_true=y_test, y_pred=y_best_pred)
print(report)

### VISUALIZE TREE AND PRUNED TREE
This activity requires an additional command line program (GraphViz) and Python library (PyDotPlus). GraphViz can be installed with a package manager on Linux and Mac. For PyDotPlus, either `pip` or `conda` (`conda install -c conda-forge pydotplus`) can be used to install the library.

Once these programs are installed:

* Create a visualization of the decision tree, where wine color was predicted and the number of features and/or splits are not limited.


In [None]:
from io import StringIO
from IPython.display import Image, display

from sklearn.tree import export_graphviz

# PyDotPlus is optional: record whether it is importable so the plotting
# cells can skip themselves instead of failing.
try:
    import pydotplus
    pydotplus_installed = True

except ImportError:
    # Catch only a missing package; a bare `except:` would also swallow
    # KeyboardInterrupt and unrelated errors and misreport them as "not installed".
    print('PyDotPlus must be installed to execute the remainder of the cells associated with this question.')
    print('Please see the instructions for this question for details.')
    pydotplus_installed = False

In [None]:
if not pydotplus_installed:
    print('This cell not executed because PyDotPlus could not be loaded.')

else:
    # Export the unpruned tree as DOT text into an in-memory buffer
    dot_data = StringIO()
    export_graphviz(dt, out_file=dot_data, filled=True)

    # DOT text -> graph -> PNG file on disk
    filename = 'wine_tree.png'
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(filename)

    # Show the saved image inline
    img = Image(filename=filename)
    display(img)

In [None]:
if not pydotplus_installed:
    print('This cell not executed because PyDotPlus could not be loaded.')

else:
    # Export the grid-searched (pruned) tree as DOT text into an in-memory buffer
    dot_data = StringIO()
    export_graphviz(GR.best_estimator_, out_file=dot_data, filled=True)

    # DOT text -> graph -> PNG file on disk
    filename = 'wine_tree_prune.png'
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(filename)

    # Show the saved image inline
    img = Image(filename=filename)
    display(img)

### ENSEMBLING A DECISION TREE IN RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier
# `criterion` is the per-tree measure of split quality: 'gini' is Gini impurity,
# while 'entropy' and 'log_loss' both use Shannon information gain.
forest = RandomForestClassifier(
    n_estimators=25,
    criterion='gini',
    random_state=1,
    n_jobs=2,
)

In [None]:
# Train the forest on the same training split used for the single tree.
forest.fit(X_train, y_train)

In [None]:
# Predict wine color on the held-out set with the forest.
y_test_rf=forest.predict(X_test)

In [None]:
# True vs. predicted colors for the random forest, indexed like y_test.
df_ensemble = pd.DataFrame(data=dict(Actual=y_test, Predicted=y_test_rf.flatten()))
df_ensemble

In [None]:
from sklearn.metrics import confusion_matrix
# Print the confusion matrix first, then the per-class report, for the forest.
print(confusion_matrix(y_test, y_test_rf))
report = classification_report(y_true=y_test, y_pred=y_test_rf)
print(report)

### DECISION TREE IN REGRESSION

In [None]:
# Look at the table again before re-encoding it for the regression task.
data.head()

In [None]:
#convert object to numerical category
# Encode the color label numerically (white -> 0, red -> 1) so it can be used
# as a regression feature.
# `np.int` was removed in NumPy 1.24 (it was only an alias of the builtin),
# so cast with the builtin `int` instead.
# replace() leaves already-encoded values untouched, so re-running this cell is safe.
data['color'] = data['color'].replace({'white': 0, 'red': 1}).astype(int)

In [None]:
#let's predict the alcohol level
X_reg=data.drop('alcohol',axis=1)
y_reg=data.alcohol

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing, using all features.
# A fixed random_state makes the shuffle, and therefore every metric computed
# downstream, reproducible across kernel restarts.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.3, random_state=42
)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit an unconstrained regression tree. random_state is fixed because the
# fit is not deterministic without it, so an unseeded tree can differ between runs.
dtr = DecisionTreeRegressor(random_state=42).fit(X_train_reg, y_train_reg)

In [None]:
# Predict the alcohol level of the held-out rows.
y_pred_reg=dtr.predict(X_test_reg)

In [None]:
#FUNCTION FOR EVALUATION
import sklearn.metrics as metrics
#evaluation metric
def evaluate(y_actual,y_predict):
    """Print standard regression metrics for a set of predictions.

    Prints, in order: R squared, mean absolute error, mean squared error
    and root mean squared error.

    Parameters
    ----------
    y_actual : array-like
        Ground-truth target values.
    y_predict : array-like
        Predicted target values, aligned with ``y_actual``.

    Returns
    -------
    None
        The metrics are written to stdout only.
    """
    # Compute the MSE once and reuse it for the RMSE line.
    mse = metrics.mean_squared_error(y_actual,y_predict)

    print('R Squared : ',metrics.r2_score(y_actual,y_predict))
    print('Mean Absolute Error : ',metrics.mean_absolute_error(y_actual,y_predict))
    print('Mean Squared Error : ',mse)
    print('Root Mean Squared Error : ',np.sqrt(mse))

In [None]:
# Report the regression metrics for the tree's predictions on the held-out set.
evaluate(y_test_reg,y_pred_reg)

In [None]:
# Actual vs. predicted alcohol level for each held-out row, indexed like y_test_reg.
df = pd.DataFrame(data=dict(Actual=y_test_reg, Predicted=y_pred_reg.flatten()))
df

In [None]:
# Keep only the first 25 rows so the individual bars stay readable.
df = df.head(25)

# Grouped bars (actual next to predicted) with a light green major grid.
df.plot.bar(figsize=(16, 5), title='Actual vs Predict of Alcohol Level')
plt.grid(color='green', linestyle='-', linewidth='0.5', which='major')
plt.show()