In [1]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [27]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV,train_test_split,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')


In [3]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
glass_identification = fetch_ucirepo(id=42) 
  
# data (as pandas dataframes) 
X = glass_identification.data.features 
y = glass_identification.data.targets 

In [4]:
X.info

<bound method DataFrame.info of           RI     Na    Mg    Al     Si     K    Ca    Ba   Fe
0    1.52101  13.64  4.49  1.10  71.78  0.06  8.75  0.00  0.0
1    1.51761  13.89  3.60  1.36  72.73  0.48  7.83  0.00  0.0
2    1.51618  13.53  3.55  1.54  72.99  0.39  7.78  0.00  0.0
3    1.51766  13.21  3.69  1.29  72.61  0.57  8.22  0.00  0.0
4    1.51742  13.27  3.62  1.24  73.08  0.55  8.07  0.00  0.0
..       ...    ...   ...   ...    ...   ...   ...   ...  ...
209  1.51623  14.14  0.00  2.88  72.61  0.08  9.18  1.06  0.0
210  1.51685  14.92  0.00  1.99  73.06  0.00  8.40  1.59  0.0
211  1.52065  14.36  0.00  2.02  73.42  0.00  8.44  1.64  0.0
212  1.51651  14.38  0.00  1.94  73.61  0.00  8.48  1.57  0.0
213  1.51711  14.23  0.00  2.08  73.36  0.00  8.62  1.67  0.0

[214 rows x 9 columns]>

In [5]:
y['Type_of_glass'].value_counts()

Type_of_glass
2    76
1    70
7    29
3    17
5    13
6     9
Name: count, dtype: int64

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=24,test_size=0.3,stratify=y['Type_of_glass'])
print(y_train['Type_of_glass'].value_counts(normalize=True)*100)
print(y_test['Type_of_glass'].value_counts(normalize=True)*100)

Type_of_glass
2    35.570470
1    32.885906
7    13.422819
3     8.053691
5     6.040268
6     4.026846
Name: proportion, dtype: float64
Type_of_glass
2    35.384615
1    32.307692
7    13.846154
3     7.692308
5     6.153846
6     4.615385
Name: proportion, dtype: float64


In [7]:
lr = LogisticRegression(random_state=24)

In [8]:
kfold = StratifiedKFold(n_splits=5,random_state=24,shuffle=True)

In [9]:
params = {'solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'],'multi_class':['ovr','multinomial'],'C':np.linspace(0.001,10,20)}

In [10]:
gcv = GridSearchCV(lr,param_grid=params,cv=kfold)

In [11]:
gcv.fit(X,y['Type_of_glass'])

In [12]:
gcv.best_params_

{'C': 4.211105263157895, 'multi_class': 'ovr', 'solver': 'newton-cg'}

In [13]:
gcv.best_score_

0.6499446290143964

In [14]:
pd_cv = pd.DataFrame(gcv.cv_results_)
pd_cv.shape

(240, 16)

##### Best Model with Parameters

In [15]:
lr_best = LogisticRegression(random_state=24,C=4.211105263157895,multi_class='ovr',solver='newton-cg')

In [16]:
lr_best.fit(X,y)

##### unlabelled data

In [17]:
tst = pd.read_csv('tst_Glass.csv')

##### Inferencing

In [18]:
y_pred_prob = lr_best.predict_proba(tst)

In [19]:
y_pred_prob.shape

(6, 6)

In [20]:
pd_probs = pd.DataFrame(y_pred_prob,columns=['1','2','3','5','6','7'])
pd_probs

Unnamed: 0,1,2,3,5,6,7
0,2.5e-05,0.615218,0.024276,0.001093,5.805016e-05,0.35933
1,0.414678,0.010982,0.02135,7.3e-05,0.3957238,0.157193
2,0.820931,0.000349,0.000782,0.055205,1.281796e-09,0.122732
3,0.129409,0.000165,1.5e-05,0.380308,2.551941e-13,0.490103
4,0.92692,0.024626,0.011297,0.009805,1.321584e-09,0.027352
5,3.7e-05,0.018272,0.003463,0.031061,1.830827e-08,0.947166


In [39]:
predictions = lr_best.predict(tst)

In [40]:
predictions

array([2, 1, 1, 7, 1, 7], dtype=int64)

##### Simpler method 

In [41]:
best_model = gcv.best_estimator_

In [42]:
best_model.predict_proba(tst)

array([[2.48844460e-05, 6.15218132e-01, 2.42757047e-02, 1.09289469e-03,
        5.80501619e-05, 3.59330334e-01],
       [4.14677714e-01, 1.09822046e-02, 2.13503733e-02, 7.26242320e-05,
        3.95723808e-01, 1.57193275e-01],
       [8.20931197e-01, 3.49401009e-04, 7.82255221e-04, 5.52053068e-02,
        1.28179599e-09, 1.22731839e-01],
       [1.29408600e-01, 1.64712946e-04, 1.50056744e-05, 3.80308284e-01,
        2.55194112e-13, 4.90103398e-01],
       [9.26919780e-01, 2.46255976e-02, 1.12969398e-02, 9.80523286e-03,
        1.32158405e-09, 2.73524486e-02],
       [3.68655132e-05, 1.82724983e-02, 3.46333209e-03, 3.10611262e-02,
        1.83082656e-08, 9.47166160e-01]])

In [43]:
best_model.predict(tst)

array([2, 1, 1, 7, 1, 7], dtype=int64)

In [44]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=24,test_size=0.3)

In [45]:
lr.fit(X_train,y_train)

In [46]:
y_pred = lr.predict(X_test)

In [47]:
print(classification_report(y_test['Type_of_glass'],y_pred))

              precision    recall  f1-score   support

           1       0.50      0.69      0.58        16
           2       0.44      0.63      0.52        19
           3       0.00      0.00      0.00         8
           5       0.67      0.29      0.40         7
           6       0.00      0.00      0.00         3
           7       0.83      0.83      0.83        12

    accuracy                           0.54        65
   macro avg       0.41      0.41      0.39        65
weighted avg       0.48      0.54      0.49        65



##### F1 score

In [48]:
gcv = GridSearchCV(lr,param_grid=params,cv=kfold,scoring='f1_macro')

In [49]:
gcv.fit(X,y['Type_of_glass'])

In [50]:
gcv.best_params_

{'C': 8.947473684210527, 'multi_class': 'ovr', 'solver': 'newton-cg'}

In [51]:
gcv.best_score_

0.526898505456462

In [53]:
lr_best = LogisticRegression(random_state=24,C=8.947473684210527,multi_class='ovr',solver='newton-cg')

In [54]:
lr_best.fit(X,y)

In [55]:
y_pred_prob = lr_best.predict_proba(tst)

In [56]:
y_pred_prob.shape

(6, 6)

In [57]:
predictions = lr_best.predict(tst)

In [58]:
predictions

array([2, 6, 1, 1, 1, 7], dtype=int64)