In [1]:
import sklearn

In [2]:
sklearn.__version__

'1.5.2'

In [3]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV,train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings('ignore')


In [4]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [5]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
car_evaluation = fetch_ucirepo(id=19) 
  
# data (as pandas dataframes) 
X = car_evaluation.data.features 
y = car_evaluation.data.targets 
  
# metadata 
print(car_evaluation.metadata) 
  
# variable information 
print(car_evaluation.variables) 


{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

In [6]:
X.info

<bound method DataFrame.info of      buying  maint  doors persons lug_boot safety
0     vhigh  vhigh      2       2    small    low
1     vhigh  vhigh      2       2    small    med
2     vhigh  vhigh      2       2    small   high
3     vhigh  vhigh      2       2      med    low
4     vhigh  vhigh      2       2      med    med
...     ...    ...    ...     ...      ...    ...
1723    low    low  5more    more      med    med
1724    low    low  5more    more      med   high
1725    low    low  5more    more      big    low
1726    low    low  5more    more      big    med
1727    low    low  5more    more      big   high

[1728 rows x 6 columns]>

In [7]:
y['class'].value_counts()

class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [8]:
#Converting to binary classification

In [9]:
y['class'] = np.where(y['class'] == 'unacc',0,1)

In [10]:
y['class'].value_counts()

class
0    1210
1     518
Name: count, dtype: int64

In [11]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=24,test_size=0.3,stratify=y['class'])

In [12]:
# y_train['class'].value_counts()

In [13]:
# y_test['class'].value_counts()

In [14]:
ohe = OneHotEncoder(drop='first',handle_unknown = 'ignore',sparse_output= False).set_output(transform = 'pandas')
X_trn_ohe = ohe.fit_transform(X_train)
X_tst_ohe = ohe.transform(X_test)

In [15]:
lr = LogisticRegression()
lr.fit(X_trn_ohe,y_train['class'])
y_pred = lr.predict(X_tst_ohe)

In [16]:
accuracy_score(y_test['class'],y_pred)

0.9633911368015414

In [17]:
#Finding percentage

In [18]:
y['class'].value_counts(normalize=True)*100

class
0    70.023148
1    29.976852
Name: proportion, dtype: float64

In [19]:
y_train['class'].value_counts(normalize=True)*100

class
0    70.057899
1    29.942101
Name: proportion, dtype: float64

In [20]:
y_test['class'].value_counts(normalize=True)*100

class
0    69.942197
1    30.057803
Name: proportion, dtype: float64

In [21]:
518 *70.02/100

362.7036

In [22]:
518-362


156

# solver


In [23]:
lr = LogisticRegression(solver='newton-cg')
lr.fit(X_trn_ohe,y_train['class'])
y_pred = lr.predict(X_tst_ohe)
accuracy_score(y_test['class'],y_pred)

0.9633911368015414

In [24]:
lr = LogisticRegression(solver='lbfgs',penalty='l2')
lr.fit(X_trn_ohe,y_train['class'])
y_pred = lr.predict(X_tst_ohe)
accuracy_score(y_test['class'],y_pred)

0.9633911368015414

#### K-Folds CV / StratifiedKFold

In [25]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [26]:
ohe = OneHotEncoder(drop='first',handle_unknown = 'ignore',sparse_output= False)
lr = LogisticRegression(random_state=24)
kfold = StratifiedKFold(n_splits=5,random_state=24,shuffle=True)
pipe = Pipeline([('OHE',ohe),('LR',lr)])
params = {'LR__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']}
gcv = GridSearchCV(pipe,param_grid=params,cv=kfold)
gcv.fit(X,y)

In [27]:
print(gcv.best_params_)
print(gcv.best_score_)

{'LR__solver': 'lbfgs'}
0.9444483538577533


In [28]:
ohe = OneHotEncoder(drop='first',handle_unknown = 'ignore',sparse_output= False).set_output(transform = 'pandas')
lr = LogisticRegression(multi_class='multinomial',random_state=24)
kfold = StratifiedKFold(n_splits=5,random_state=24,shuffle=True)
pipe = Pipeline([('OHE',ohe),('LR',lr)])
params = {'LR__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']}
gcv = GridSearchCV(pipe,param_grid=params,cv=kfold)
gcv.fit(X,y)

In [29]:
print(gcv.best_params_)
print(gcv.best_score_)

{'LR__solver': 'lbfgs'}
0.9496590433107146


In [30]:
ohe = OneHotEncoder(drop='first',handle_unknown = 'ignore',sparse_output= False).set_output(transform = 'pandas')
lr = LogisticRegression(multi_class='ovr',random_state=24)
kfold = StratifiedKFold(n_splits=5,random_state=24,shuffle=True)
pipe = Pipeline([('OHE',ohe),('LR',lr)])
params = {'LR__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga']}
gcv = GridSearchCV(pipe,param_grid=params,cv=kfold)
gcv.fit(X,y)

In [31]:
print(gcv.best_params_)
print(gcv.best_score_)

{'LR__solver': 'lbfgs'}
0.9444483538577533


##### To check if OVR is better or multinomial for a particular problem

In [32]:
ohe = OneHotEncoder(drop='first',handle_unknown = 'ignore',sparse_output= False)
lr = LogisticRegression(random_state=24)
kfold = StratifiedKFold(n_splits=5,random_state=24,shuffle=True)
pipe = Pipeline([('OHE',ohe),('LR',lr)])
params = {'LR__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'],'LR__multi_class':['ovr','multinomial']}
gcv = GridSearchCV(pipe,param_grid=params,cv=kfold)
gcv.fit(X,y)

In [33]:
print(gcv.best_params_)
print(gcv.best_score_)

{'LR__multi_class': 'multinomial', 'LR__solver': 'lbfgs'}
0.9496590433107146


In [34]:
ohe = OneHotEncoder(drop='first',handle_unknown = 'ignore',sparse_output= False)
lr = LogisticRegression(random_state=24)
kfold = StratifiedKFold(n_splits=5,random_state=24,shuffle=True)
pipe = Pipeline([('OHE',ohe),('LR',lr)])
params = {'LR__solver':['lbfgs','liblinear','newton-cg','newton-cholesky','sag','saga'],'LR__multi_class':['ovr','multinomial'],'LR__C':np.linspace(0.001,10,20)}
gcv = GridSearchCV(pipe,param_grid=params,cv=kfold)
gcv.fit(X,y)

In [35]:
print(gcv.best_params_)
print(gcv.best_score_)

{'LR__C': 2.6323157894736844, 'LR__multi_class': 'ovr', 'LR__solver': 'lbfgs'}
0.9502370779927956


In [36]:
# for checking the fit

In [37]:
pd_cv = pd.DataFrame(gcv.cv_results_)

In [38]:
pd_cv.shape

(240, 16)