In [89]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.datasets import  load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [90]:
X, y = load_iris(return_X_y=True, as_frame=True)
df = X
df['target'] = y
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 6.0 KB


In [91]:
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['target'], random_state=2024)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 40 to 6
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  120 non-null    float64
 1   sepal width (cm)   120 non-null    float64
 2   petal length (cm)  120 non-null    float64
 3   petal width (cm)   120 non-null    float64
 4   target             120 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 5.6 KB


In [92]:
df_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal length (cm),120.0,5.8525,0.80742,4.3,5.1,5.8,6.4,7.7
sepal width (cm),120.0,3.084167,0.428206,2.0,2.8,3.0,3.4,4.4
petal length (cm),120.0,3.775,1.750642,1.0,1.6,4.35,5.1,6.9
petal width (cm),120.0,1.214167,0.766888,0.1,0.3,1.3,1.8,2.5
target,120.0,1.0,0.81992,0.0,0.0,1.0,2.0,2.0


In [93]:
X_train = df_train.drop(['target'], axis = 1).reset_index(drop=True)
y_train = df_train['target']
X_test  = df_test.drop(['target'], axis = 1).reset_index(drop=True)
y_test =  df_test['target']

In [94]:
num_cols = X_train.select_dtypes('number').columns

In [95]:
num_proc  = Pipeline([
    ('scaler', MinMaxScaler())
])
processor = ColumnTransformer([
    ('num', num_proc, num_cols)
])

In [96]:
clf_knn = Pipeline([
    ('proc', processor),
    ('knn', KNeighborsClassifier())
])

In [97]:
clf_knn.fit(X_train, y_train)

In [98]:
y_pred = clf_knn.predict(X_test)

In [99]:
print(f'accuracy_score: {accuracy_score(y_test, y_pred)}')

accuracy_score: 0.9666666666666667


In [100]:
url = 'https://raw.githubusercontent.com/selva86/' + \
'datasets/master/orange_juice_withmissing.csv'

In [101]:
df_raw = pd.read_csv(url)
df_raw

Unnamed: 0,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,CH,237,1.0,1.75,1.99,0.00,0.00,0.0,0.0,0.500000,1.99,1.75,0.24,No,0.000000,0.000000,0.24,1.0
1,CH,239,1.0,1.75,1.99,0.00,0.30,0.0,1.0,0.600000,1.69,1.75,-0.06,No,0.150754,0.000000,0.24,1.0
2,CH,245,1.0,1.86,2.09,0.17,0.00,0.0,0.0,0.680000,2.09,1.69,0.40,No,0.000000,0.091398,0.23,1.0
3,MM,227,1.0,1.69,1.69,0.00,0.00,0.0,0.0,0.400000,1.69,1.69,0.00,No,0.000000,0.000000,0.00,1.0
4,CH,228,7.0,1.69,1.69,0.00,0.00,0.0,0.0,0.956535,1.69,1.69,0.00,Yes,0.000000,0.000000,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,CH,252,7.0,1.86,2.09,0.10,0.00,0.0,0.0,0.587822,2.09,1.76,0.33,Yes,0.000000,0.053763,0.23,0.0
1066,CH,256,7.0,1.86,2.18,0.00,0.00,0.0,0.0,0.670258,2.18,1.86,0.32,Yes,0.000000,0.000000,0.32,0.0
1067,MM,257,7.0,1.86,2.18,0.00,0.00,0.0,0.0,0.736206,2.18,1.86,0.32,Yes,0.000000,0.000000,0.32,0.0
1068,CH,261,7.0,1.86,2.13,0.00,0.24,0.0,0.0,0.588965,1.89,1.86,0.03,Yes,0.112676,0.000000,0.27,0.0


In [102]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Purchase        1070 non-null   object 
 1   WeekofPurchase  1070 non-null   int64  
 2   StoreID         1069 non-null   float64
 3   PriceCH         1069 non-null   float64
 4   PriceMM         1066 non-null   float64
 5   DiscCH          1068 non-null   float64
 6   DiscMM          1066 non-null   float64
 7   SpecialCH       1068 non-null   float64
 8   SpecialMM       1065 non-null   float64
 9   LoyalCH         1065 non-null   float64
 10  SalePriceMM     1065 non-null   float64
 11  SalePriceCH     1069 non-null   float64
 12  PriceDiff       1069 non-null   float64
 13  Store7          1070 non-null   object 
 14  PctDiscMM       1065 non-null   float64
 15  PctDiscCH       1068 non-null   float64
 16  ListPriceDiff   1070 non-null   float64
 17  STORE           1068 non-null   f

In [103]:
display(df_raw.sample(15, random_state=2024))

Unnamed: 0,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
963,MM,229,1.0,1.69,1.69,0.0,0.0,0.0,0.0,0.68,1.69,1.69,0.0,No,0.0,0.0,0.0,1.0
219,CH,272,7.0,1.86,2.13,0.0,0.0,0.0,0.0,0.989732,2.13,1.86,0.27,Yes,0.0,0.0,0.27,0.0
770,MM,229,2.0,1.69,1.69,0.0,0.0,0.0,0.0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,2.0
556,MM,275,1.0,1.96,2.13,0.0,0.74,0.0,1.0,0.477037,1.39,1.96,-0.57,No,0.347418,0.0,0.17,1.0
793,MM,257,1.0,1.76,2.18,0.0,0.0,0.0,0.0,0.083886,2.18,1.76,0.42,No,0.0,0.0,0.42,1.0
952,MM,267,2.0,1.86,2.18,0.0,0.4,0.0,1.0,0.003817,1.78,1.86,-0.08,No,0.183486,0.0,0.32,2.0
1,CH,239,1.0,1.75,1.99,0.0,0.3,0.0,1.0,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1.0
833,MM,241,3.0,1.79,2.23,0.0,0.0,0.0,0.0,0.2048,2.23,1.79,0.44,No,0.0,0.0,0.44,3.0
824,CH,232,4.0,1.79,2.09,0.0,0.0,0.0,0.0,0.867041,2.09,1.79,0.3,No,0.0,0.0,0.3,4.0
138,CH,278,2.0,1.99,2.18,0.0,0.0,0.0,0.0,0.988272,2.18,1.99,0.19,No,0.0,0.0,0.19,2.0


In [104]:
df_interim = (
    df_raw
    .copy()
    .set_axis(
        df_raw.columns.str.replace(' ','_')
        .str.replace(r'\W','',regex=True)
        .str.lower()
        .str.slice(0,40), axis=1
    )
    .drop(['store'], axis=1)
    .drop_duplicates()
    .rename(columns={'purchase':'target'})
    .astype({'storeid':'category',
             'target':'category',
             'store7':'category'}) 
)
    

In [105]:
df_interim

Unnamed: 0,target,weekofpurchase,storeid,pricech,pricemm,discch,discmm,specialch,specialmm,loyalch,salepricemm,salepricech,pricediff,store7,pctdiscmm,pctdiscch,listpricediff
0,CH,237,1.0,1.75,1.99,0.00,0.00,0.0,0.0,0.500000,1.99,1.75,0.24,No,0.000000,0.000000,0.24
1,CH,239,1.0,1.75,1.99,0.00,0.30,0.0,1.0,0.600000,1.69,1.75,-0.06,No,0.150754,0.000000,0.24
2,CH,245,1.0,1.86,2.09,0.17,0.00,0.0,0.0,0.680000,2.09,1.69,0.40,No,0.000000,0.091398,0.23
3,MM,227,1.0,1.69,1.69,0.00,0.00,0.0,0.0,0.400000,1.69,1.69,0.00,No,0.000000,0.000000,0.00
4,CH,228,7.0,1.69,1.69,0.00,0.00,0.0,0.0,0.956535,1.69,1.69,0.00,Yes,0.000000,0.000000,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1065,CH,252,7.0,1.86,2.09,0.10,0.00,0.0,0.0,0.587822,2.09,1.76,0.33,Yes,0.000000,0.053763,0.23
1066,CH,256,7.0,1.86,2.18,0.00,0.00,0.0,0.0,0.670258,2.18,1.86,0.32,Yes,0.000000,0.000000,0.32
1067,MM,257,7.0,1.86,2.18,0.00,0.00,0.0,0.0,0.736206,2.18,1.86,0.32,Yes,0.000000,0.000000,0.32
1068,CH,261,7.0,1.86,2.13,0.00,0.24,0.0,0.0,0.588965,1.89,1.86,0.03,Yes,0.112676,0.000000,0.27


In [106]:
df_train, df_test = train_test_split(df, 
                                     random_state=2024, 
                                     test_size=0.20, 
                                     stratify=df['target'])
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 120 entries, 40 to 6
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  120 non-null    float64
 1   sepal width (cm)   120 non-null    float64
 2   petal length (cm)  120 non-null    float64
 3   petal width (cm)   120 non-null    float64
 4   target             120 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 5.6 KB


In [107]:
display(df_train.describe(include="number").T)
display(df_train.describe(include="category").T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
sepal length (cm),120.0,5.8525,0.80742,4.3,5.1,5.8,6.4,7.7
sepal width (cm),120.0,3.084167,0.428206,2.0,2.8,3.0,3.4,4.4
petal length (cm),120.0,3.775,1.750642,1.0,1.6,4.35,5.1,6.9
petal width (cm),120.0,1.214167,0.766888,0.1,0.3,1.3,1.8,2.5
target,120.0,1.0,0.81992,0.0,0.0,1.0,2.0,2.0


ValueError: No objects to concatenate