In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import xgboost
import cv2
import imblearn

# Data Loading

In [2]:
df=pd.read_csv('../input/glass/glass.csv')

In [3]:
df

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,7
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,7
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,7
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,7


In [4]:
df.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type'], dtype='object')

In [5]:
df.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.518365,13.40785,2.684533,1.444907,72.650935,0.497056,8.956963,0.175047,0.057009,2.780374
std,0.003037,0.816604,1.442408,0.49927,0.774546,0.652192,1.423153,0.497219,0.097439,2.103739
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.516523,12.9075,2.115,1.19,72.28,0.1225,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.555,8.6,0.0,0.0,2.0
75%,1.519157,13.825,3.6,1.63,73.0875,0.61,9.1725,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


# Identifing Missing Values

In [6]:
np.sum(df.isnull())

RI      0
Na      0
Mg      0
Al      0
Si      0
K       0
Ca      0
Ba      0
Fe      0
Type    0
dtype: int64

# Identifing No of Classes

In [7]:
df['Type'].unique()

array([1, 2, 3, 5, 6, 7])

In [8]:
X=df.drop('Type', axis=1)
#X=X.drop('RI', axis=1)
X

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0
...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0


In [9]:
Y=df['Type']
Y

0      1
1      1
2      1
3      1
4      1
      ..
209    7
210    7
211    7
212    7
213    7
Name: Type, Length: 214, dtype: int64

# Handling imbalanced data using SMOTETomek

In [10]:
from imblearn.combine import SMOTETomek

In [11]:
smt=SMOTETomek(random_state=42)
x_res, y_res= smt.fit_sample(X,Y)

In [12]:
from sklearn.model_selection import train_test_split


x_train, x_test, y_train, y_test= train_test_split(x_res,y_res, random_state=42, test_size=0.2, stratify=y_res)
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))

347
87
347
87


In [13]:
for x in set(X):
    print('{}....{}'.format(x,len(X[X==x])))

Ca....214
Ba....214
Al....214
Mg....214
K....214
Si....214
Na....214
RI....214
Fe....214


In [14]:
for y in set(Y):
    print('{}....{}'.format(y,len(Y[Y==y])))

1....70
2....76
3....17
5....13
6....9
7....29


In [15]:
parms={
    'n_estimators':[100],
    'max_depth':[7],
    'learning_rate':[1],
    'gamma':[0.1,1,0.5,0],
    'subsample':[0.7],
    'colsample_bylevel':[0.1,0.3],
    'colsample_bytree':[0.3,0.5,0.7],
    'min_child_weight':[0.1,0.3],
    'reg_lambda':[0,1,0.5]
}

#'colsample_bylevel': 0.1,
# 'colsample_bytree': 0.3,
# 'gamma': 0.1,
# 'learning_rate': 1,
# 'max_depth': 7,
# 'min_child_weight': 0.1,
# 'n_estimators': 100,
# 'reg_lambda': 1,
# 'subsample': 0.7

In [16]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score, make_scorer

xg=xgboost.XGBClassifier()
#xg.fit(x_train, y_train)
f1=make_scorer(f1_score, average='macro')
grids=GridSearchCV(xg, param_grid=parms, cv=10, n_jobs=-1, scoring=f1)
grids.fit(x_train, y_train)

GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_job...
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             n_jobs=-1,
             param_grid={'colsample_bylevel': [0

In [17]:
from sklearn.metrics import confusion_matrix

y_train_pred=grids.predict(x_train)
print(confusion_matrix(y_train, y_train_pred))
print()
print()
from sklearn.metrics import classification_report

print(classification_report(y_train, y_train_pred))

[[54  0  0  0  0  0]
 [ 0 57  0  0  0  0]
 [ 0  0 54  0  0  0]
 [ 0  0  0 61  0  0]
 [ 0  0  0  0 61  0]
 [ 0  0  0  0  0 60]]


              precision    recall  f1-score   support

           1       1.00      1.00      1.00        54
           2       1.00      1.00      1.00        57
           3       1.00      1.00      1.00        54
           5       1.00      1.00      1.00        61
           6       1.00      1.00      1.00        61
           7       1.00      1.00      1.00        60

    accuracy                           1.00       347
   macro avg       1.00      1.00      1.00       347
weighted avg       1.00      1.00      1.00       347



In [18]:
y_test_pred=grids.predict(x_test)
print(confusion_matrix(y_test, y_test_pred))
print()
print()
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_pred))

[[ 8  3  2  0  0  0]
 [ 3  9  1  2  0  0]
 [ 3  1 10  0  0  0]
 [ 0  0  0 14  0  1]
 [ 0  0  0  0 15  0]
 [ 0  0  0  0  0 15]]


              precision    recall  f1-score   support

           1       0.57      0.62      0.59        13
           2       0.69      0.60      0.64        15
           3       0.77      0.71      0.74        14
           5       0.88      0.93      0.90        15
           6       1.00      1.00      1.00        15
           7       0.94      1.00      0.97        15

    accuracy                           0.82        87
   macro avg       0.81      0.81      0.81        87
weighted avg       0.81      0.82      0.81        87



In [19]:
grids.best_score_

0.8972234247234248

In [20]:
grids.best_params_

{'colsample_bylevel': 0.1,
 'colsample_bytree': 0.3,
 'gamma': 0,
 'learning_rate': 1,
 'max_depth': 7,
 'min_child_weight': 0.1,
 'n_estimators': 100,
 'reg_lambda': 1,
 'subsample': 0.7}

# Handling imbalanced data using OVERSAMPLING

In [21]:
from imblearn.over_sampling import RandomOverSampler

In [22]:
rs=RandomOverSampler()

In [23]:
x_res, y_res=rs.fit_sample(X,Y)

In [24]:
from sklearn.model_selection import train_test_split


x_train, x_test, y_train, y_test= train_test_split(x_res,y_res, random_state=42, test_size=0.2, stratify=y_res)
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))

364
92
364
92


In [25]:
for x in set(X):
    print('{}....{}'.format(x,len(X[X==x])))

Ca....214
Ba....214
Al....214
Mg....214
K....214
Si....214
Na....214
RI....214
Fe....214


In [26]:
for y in set(Y):
    print('{}....{}'.format(y,len(Y[Y==y])))

1....70
2....76
3....17
5....13
6....9
7....29


In [27]:
parms={
    'n_estimators':[100],
    'max_depth':[7],
    'learning_rate':[1,0.1,0.5],
    'gamma':[0.1,1,0.5],
    'subsample':[0.7],
    'colsample_bylevel':[0.1],
    'colsample_bytree':[0.3,0.7,0.5],
    'min_child_weight':[0.3,0.7,0.5,1],
    'reg_lambda':[0,1,0.5]
}

#'colsample_bylevel': 0.1,
# 'colsample_bytree': 0.3,
# 'gamma': 0.1,
# 'learning_rate': 1,
# 'max_depth': 7,
# 'min_child_weight': 0.3,
# 'n_estimators': 100,
# 'reg_lambda': 0,
# 'subsample': 0.7}

In [28]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score, make_scorer

xg=xgboost.XGBClassifier()
#xg.fit(x_train, y_train)
f1=make_scorer(f1_score, average='macro')
grids=GridSearchCV(xg, param_grid=parms, cv=10, n_jobs=-1, scoring=f1)
grids.fit(x_train, y_train)

GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_job...
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameters=None,
                                     verbosity=None),
             n_jobs=-1,
             param_grid={'colsample_bylevel': [0

In [29]:
from sklearn.metrics import confusion_matrix

y_train_pred=grids.predict(x_train)
print(confusion_matrix(y_train, y_train_pred))
print()
print()
from sklearn.metrics import classification_report

print(classification_report(y_train, y_train_pred))

[[61  0  0  0  0  0]
 [ 0 61  0  0  0  0]
 [ 0  0 61  0  0  0]
 [ 0  0  0 60  0  0]
 [ 0  0  0  0 60  0]
 [ 0  0  0  0  0 61]]


              precision    recall  f1-score   support

           1       1.00      1.00      1.00        61
           2       1.00      1.00      1.00        61
           3       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        60
           6       1.00      1.00      1.00        60
           7       1.00      1.00      1.00        61

    accuracy                           1.00       364
   macro avg       1.00      1.00      1.00       364
weighted avg       1.00      1.00      1.00       364



In [30]:
y_test_pred=grids.predict(x_test)
print(confusion_matrix(y_test, y_test_pred))
print()
print()
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_pred))

[[12  1  2  0  0  0]
 [ 2 11  0  1  0  1]
 [ 0  0 15  0  0  0]
 [ 0  0  0 16  0  0]
 [ 0  0  0  0 16  0]
 [ 0  0  0  0  0 15]]


              precision    recall  f1-score   support

           1       0.86      0.80      0.83        15
           2       0.92      0.73      0.81        15
           3       0.88      1.00      0.94        15
           5       0.94      1.00      0.97        16
           6       1.00      1.00      1.00        16
           7       0.94      1.00      0.97        15

    accuracy                           0.92        92
   macro avg       0.92      0.92      0.92        92
weighted avg       0.92      0.92      0.92        92



In [31]:
grids.best_score_

0.9358330558330559

In [32]:
grids.best_params_

{'colsample_bylevel': 0.1,
 'colsample_bytree': 0.5,
 'gamma': 0.5,
 'learning_rate': 0.1,
 'max_depth': 7,
 'min_child_weight': 0.7,
 'n_estimators': 100,
 'reg_lambda': 0,
 'subsample': 0.7}

#Final Model

In [33]:
xg=xgboost.XGBClassifier(colsample_bylevel= 0.1,
                          colsample_bytree= 0.3,
                          gamma= 0.1,
                          learning_rate= 0.1,
                          max_depth= 7,
                          min_child_weight= 0.3,
                          n_estimators= 100,
                          reg_lambda= 0,
                          subsample= 0.7)
xg.fit(x_train, y_train)
y_train_pred=xg.predict(x_train)

In [34]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_train, y_train_pred))
print()
print()
from sklearn.metrics import classification_report

print(classification_report(y_train, y_train_pred))

[[61  0  0  0  0  0]
 [ 0 61  0  0  0  0]
 [ 0  0 61  0  0  0]
 [ 0  0  0 60  0  0]
 [ 0  0  0  0 60  0]
 [ 0  0  0  0  0 61]]


              precision    recall  f1-score   support

           1       1.00      1.00      1.00        61
           2       1.00      1.00      1.00        61
           3       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        60
           6       1.00      1.00      1.00        60
           7       1.00      1.00      1.00        61

    accuracy                           1.00       364
   macro avg       1.00      1.00      1.00       364
weighted avg       1.00      1.00      1.00       364



In [35]:
from sklearn.model_selection import cross_val_predict

y_train_pred= cross_val_predict(xg, x_train, y_train,cv=5)

In [36]:
print(confusion_matrix(y_train, y_train_pred))
print()
print()
from sklearn.metrics import classification_report

print(classification_report(y_train, y_train_pred))

[[50  7  4  0  0  0]
 [11 45  0  4  1  0]
 [ 4  1 56  0  0  0]
 [ 0  0  0 58  2  0]
 [ 0  0  0  0 60  0]
 [ 0  0  0  0  0 61]]


              precision    recall  f1-score   support

           1       0.77      0.82      0.79        61
           2       0.85      0.74      0.79        61
           3       0.93      0.92      0.93        61
           5       0.94      0.97      0.95        60
           6       0.95      1.00      0.98        60
           7       1.00      1.00      1.00        61

    accuracy                           0.91       364
   macro avg       0.91      0.91      0.91       364
weighted avg       0.91      0.91      0.91       364



In [37]:
y_test_pred=xg.predict(x_test)
print(confusion_matrix(y_test, y_test_pred))
print()
print()
from sklearn.metrics import classification_report

print(classification_report(y_test, y_test_pred))

[[12  2  1  0  0  0]
 [ 2 11  0  2  0  0]
 [ 0  0 15  0  0  0]
 [ 0  0  0 16  0  0]
 [ 0  0  0  0 16  0]
 [ 0  0  0  0  0 15]]


              precision    recall  f1-score   support

           1       0.86      0.80      0.83        15
           2       0.85      0.73      0.79        15
           3       0.94      1.00      0.97        15
           5       0.89      1.00      0.94        16
           6       1.00      1.00      1.00        16
           7       1.00      1.00      1.00        15

    accuracy                           0.92        92
   macro avg       0.92      0.92      0.92        92
weighted avg       0.92      0.92      0.92        92

