In [36]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn import tree
from sklearn.preprocessing import StandardScaler

# Import dataset

In [37]:
# Read the glass CSV from the notebook's input directory into a DataFrame.
df=pd.read_csv('../input/glass/glass.csv')

# Exploratory data analysis

In [38]:
# Preview the first rows of the loaded frame.
df.head()

In [39]:
# (number of rows, number of columns) of the dataset.
df.shape

In [40]:
# Per-column dtype and non-null count summary.
df.info()

##### Missing values in variables

In [41]:
# Count of missing entries in each column.
df.isna().sum()

In [42]:
# Number of samples for each distinct value of the `Type` column.
df['Type'].value_counts()

In [43]:
# Class balance of `Type`: one bar per distinct value, drawn on the current axes.
count_ax = plt.gca()
sns.countplot(data=df, x="Type", palette='bright', ax=count_ax)
plt.show()

In [44]:
# Share of each `Type` value as a pie chart. The counts are computed once and
# reused for both the wedge sizes and the wedge labels.
type_counts = df['Type'].value_counts()

plt.figure(figsize=(6, 8))
plt.pie(type_counts.values, labels=type_counts.index, autopct='%1.2f%%')
plt.show()

# Frequency distribution of values in variables

In [45]:
# Labels of the numeric columns, selected through the public dtype-based API
# instead of the private `DataFrame._get_numeric_data()` helper, which is not
# part of pandas' supported interface.
num_cols=df.select_dtypes(include='number').columns

print(num_cols)

In [46]:
# For every numeric column draw two panels side by side:
#   left  - histogram of raw counts
#   right - density histogram with a KDE curve and a fitted normal curve
# `sns.distplot` is deprecated, so the right panel is built from
# `sns.histplot` plus an explicit `scipy.stats.norm` fit.
for col in num_cols:
    fig, axes=plt.subplots(figsize=(8, 6), nrows=1, ncols=2)
    sns.histplot(df[col], ax=axes[0])

    # Drop missing values so the normal fit cannot be poisoned by NaNs.
    observed=df[col].dropna()
    sns.histplot(observed, ax=axes[1], stat='density', kde=True)

    # Fit a normal distribution and evaluate its pdf over the visible x-range.
    mu, sigma=stats.norm.fit(observed)
    x_lo, x_hi=axes[1].get_xlim()
    grid=[x_lo + (x_hi - x_lo) * step / 199 for step in range(200)]
    axes[1].plot(grid, stats.norm.pdf(grid, mu, sigma), color='black')
    # Plotting the curve must not widen the axis chosen by the histogram.
    axes[1].set_xlim(x_lo, x_hi)

    plt.show()

In [47]:
# Summary statistics, transposed so that each variable is a row.
df.describe().T

In [48]:
# Pairwise correlation between the columns, rendered as an annotated heatmap.
cor = df.corr()

plt.figure(figsize=(9, 6))
sns.heatmap(cor, annot=True, cmap='PiYG')
plt.show()

### Declare feature vector and target variable

In [49]:
# Feature names: every column label of the frame except the target `Type`.
columns = df.columns.tolist()
columns.remove('Type')

In [50]:
# Target vector is the `Type` column; the feature matrix is everything else.
Y = df['Type']
X = df.drop(columns=['Type'])

##### Split data into separate training and test set

In [51]:
# Hold out 20% of the rows for testing. `stratify=Y` keeps the proportions of
# the `Type` classes the same in both splits, so rare classes are not dropped
# from either side by chance; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test=train_test_split(
    X, Y, test_size=0.20, random_state=42, stratify=Y
)

### Scaler

In [52]:
# Standardise the features. The scaler is fitted on the training split only
# and that same fitted transform is then applied to both splits.
sc = StandardScaler().fit(X_train)

X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [53]:
# Wrap the scaled splits back into DataFrames labelled with the feature names.
X_train, X_test = (
    pd.DataFrame(scaled, columns=columns) for scaled in (X_train, X_test)
)

# Model Building

##### Decision Tree Classifier with criterion gini index

In [54]:
# Baseline decision tree using the Gini criterion. A fixed `random_state`
# makes the fitted tree, and every score derived from it, reproducible:
# without a seed scikit-learn may pick differently among equally good splits
# from one run to the next.
dt_gini=DecisionTreeClassifier(criterion='gini', random_state=42)
dt_gini.fit(X_train, Y_train)

In [55]:
# Quick look at the structure of the already-fitted `dt_gini`. The model is
# plotted as is: re-fitting it inside the plotting call would retrain it as a
# side effect and, for an unseeded tree, could silently replace the model that
# the following cells evaluate.
plt.figure(figsize=(12,8))

tree.plot_tree(dt_gini)
plt.show()

In [56]:
# Large, colour-filled rendering of the tree with readable labels.
# `class_names` must list the classes once each, in the order of
# `dt_gini.classes_`; passing the per-sample training labels instead would
# label the nodes with whatever samples happen to come first in `Y_train`.
# `feature_names` is given as a plain list because some scikit-learn versions
# reject a pandas Index here.
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(dt_gini,
                   feature_names=list(X.columns),
                   class_names=[str(label) for label in dt_gini.classes_],
                   filled=True)

##### Compare the train-set and test-set accuracy

In [57]:
# Predict on both splits with the baseline tree, then print the accuracy of
# each split.
Y_train_pred = dt_gini.predict(X_train)
Y_test_pred = dt_gini.predict(X_test)

for split_label, truth, predicted in (
    ('Training-set accuracy score', Y_train, Y_train_pred),
    ('Testing-set accuracy score', Y_test, Y_test_pred),
):
    print(split_label, accuracy_score(truth, predicted))

In [58]:
# Per-class classification report on the training split.
print(classification_report(Y_train, Y_train_pred))

In [59]:
# Per-class classification report on the held-out test split.
print(classification_report(Y_test, Y_test_pred))

### Overfitting and underfitting

In [60]:
# Sweep the maximum tree depth from 1 to 20 and record train/test accuracy at
# each depth, to see where the model starts to overfit.
values=list(range(1, 21))
train_scores, test_scores=[], []

for i in values:
    # Seeded so that the resulting curve is identical on every run.
    dt=DecisionTreeClassifier(max_depth=i, random_state=42)
    dt.fit(X_train, Y_train)

    Y_tr_pred=dt.predict(X_train)
    Y_te_pred=dt.predict(X_test)

    train_scores.append(accuracy_score(Y_train, Y_tr_pred))
    test_scores.append(accuracy_score(Y_test, Y_te_pred))

In [61]:
# Train vs. test accuracy as a function of tree depth. Axis labels and a title
# are set so the figure can be read on its own.
plt.plot(values, train_scores, '-o', color='red', label='Train score')
plt.plot(values, test_scores, '-o', color='blue', label='Test score')
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.title('Decision tree accuracy vs. maximum depth')
plt.legend()
plt.show()

# Model Tuning

In [62]:
# Hyper-parameter grid explored by the search: tree depth, minimum leaf size
# and split criterion.
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100],
    'criterion': ["gini", "entropy"]
}

# Base estimator for the search. It is seeded so that differences between
# grid points come from the hyper-parameters and not from random tie-breaking,
# and so that the selected model is reproducible.
dt=DecisionTreeClassifier(random_state=42)

In [63]:
# Exhaustive search over `params`, scored by accuracy with 4-fold
# cross-validation; `n_jobs=-1` and `verbose=1` are passed through unchanged.
grid_search = GridSearchCV(
    dt,
    params,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
    verbose=1,
)

In [64]:
# Run the cross-validated search on the training split only.
grid_search.fit(X_train, Y_train)

In [65]:
# First rows of the search results, one row per parameter combination.
pd.DataFrame(grid_search.cv_results_).head()

In [66]:
# Show the estimator the search selected as best.
print(grid_search.best_estimator_)

In [67]:
# `GridSearchCV` uses `refit=True` by default, so `best_estimator_` has already
# been re-trained on the whole training split with the winning parameters.
# It is used as is: fitting it again is redundant and, for an unseeded tree,
# could yield a different model from the one the search selected.
dt_grid=grid_search.best_estimator_
dt_grid

In [68]:
# Predict with the tuned tree and report its accuracy on both splits.
# The scores must be computed from the predictions made right here
# (`Y_train_pred` / `Y_test_pred`), not from `Y_tr_pred` / `Y_te_pred`, which
# hold the predictions of the last tree of the depth sweep.
Y_train_pred=dt_grid.predict(X_train)
Y_test_pred=dt_grid.predict(X_test)

print('Training-set accuracy score', accuracy_score(Y_train, Y_train_pred))
print('Testing-set accuracy score', accuracy_score(Y_test, Y_test_pred))

In [69]:
# Per-class classification report of the tuned model on the training split.
print(classification_report(Y_train, Y_train_pred))

In [70]:
# Per-class classification report of the tuned model on the test split.
print(classification_report(Y_test, Y_test_pred))