In [25]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.compose import make_column_transformer 
from sklearn.compose import make_column_selector
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss,accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier



In [26]:
hr = pd.read_csv("HR_comma_sep.csv")
X = hr.drop('left', axis=1)
y = hr['left']


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
        random_state=24, test_size=0.3, stratify=y)

In [28]:
ohe = OneHotEncoder(sparse_output=False, drop='first').set_output(transform='pandas')
scaler_mm = MinMaxScaler()
scaler_std = StandardScaler()
ct = make_column_transformer(('passthrough', make_column_selector(dtype_exclude=object)  ),
                             (ohe, make_column_selector(dtype_include=object) ),
                            verbose_feature_names_out=False).set_output(transform='pandas')
knn = KNeighborsClassifier(n_neighbors=2)
lr = LogisticRegression(random_state=24)
pipe = Pipeline([('CT',ct),('SCL',scaler_mm),('KNN',knn)])


In [29]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))


0.957768392976217


In [30]:
y_pred_prob = pipe.predict_proba(X_test)
print(log_loss(y_test,y_pred_prob))

1.0251614446423376


In [53]:

#### K-FOLDS

kfold = StratifiedKFold(n_splits=4, random_state=24, 
                        shuffle=True)
params = {'KNN__n_neighbors':np.arange(1,11),
          'KNN__metric':['cityblock','minkowski','manhattan','haversine'],
          'SCL':[scaler_mm, scaler_std, None]}
pipe = Pipeline([('CT',ct),('SCL',scaler_mm),('KNN',knn)])

gcv = GridSearchCV(pipe, param_grid=params,
                   scoring='neg_log_loss',
                   cv=kfold, verbose=3)
gcv.fit(X,y)
print(gcv.best_params_)
print(gcv.best_score_)

pd_cv = pd.DataFrame(gcv.cv_results_)
print(pd_cv.shape)

Fitting 4 folds for each of 120 candidates, totalling 480 fits
[CV 1/4] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=nan total time=   0.0s
[CV 2/4] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=nan total time=   0.0s
[CV 3/4] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=nan total time=   0.0s
[CV 4/4] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=MinMaxScaler();, score=nan total time=   0.0s
[CV 1/4] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=nan total time=   0.0s
[CV 2/4] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=nan total time=   0.0s
[CV 3/4] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=nan total time=   0.0s
[CV 4/4] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=StandardScaler();, score=nan total time=   0.0s
[CV 1/4] END KNN__metric=cityblock, KNN__n_neighbors=1, SCL=None;, score=nan tota

ValueError: 
All the 480 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
480 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\sklearn\pipeline.py", line 406, in _fit
    X, fitted_transformer = fit_transform_one_cached(
                            ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\sklearn\pipeline.py", line 1310, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\sklearn\compose\_column_transformer.py", line 968, in fit_transform
    self._validate_column_callables(X)
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\sklearn\compose\_column_transformer.py", line 534, in _validate_column_callables
    columns = columns(X)
              ^^^^^^^^^^
  File "C:\Users\DAI.STUDENTSDC\AppData\Local\Anaconda3\Lib\site-packages\sklearn\compose\_column_transformer.py", line 1543, in __call__
    raise ValueError(
ValueError: make_column_selector can only be applied to pandas dataframes


## KNN (eg.2)

In [31]:
# Import required libraries
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [37]:
# Dummy data create karte hain (features and labels)
X = np.array([[1, 2], [2, 3], [3, 4], [5, 6], [8, 9], [2, 1], [6, 7], [7, 8]])
y = np.array([0, 0, 0, 1, 1, 0, 1, 1])

In [38]:
# Data ko training aur testing ke liye split karte hain
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [39]:
# KNN model banate hain aur 'k' ka value set karte hain (yahaan k=3 lete hain)
knn = KNeighborsClassifier(n_neighbors=3)

In [40]:
# Model ko train karo (fit)
knn.fit(X_train, y_train)

In [41]:
# Test data par prediction karo
y_pred = knn.predict(X_test)

In [42]:
# Accuracy check karo
accuracy = accuracy_score(y_test, y_pred)
print(f"Model ki accuracy: {accuracy * 100}%")

Model ki accuracy: 0.0%


In [43]:
X_train

array([[7, 8],
       [3, 4],
       [8, 9],
       [5, 6],
       [6, 7]])

In [44]:
X_test

array([[2, 3],
       [2, 1],
       [1, 2]])

In [45]:
y_train

array([1, 0, 1, 1, 1])

In [46]:
y_test

array([0, 0, 0])