In [3]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OneHotEncoder,OrdinalEncoder
from sklearn.model_selection import GridSearchCV,cross_val_score,train_test_split
import numpy as np

In [4]:
# Load the raw penguin table from the CSV file in the working directory.
df = pd.read_csv("penguins_size.csv")

In [5]:
# Summarise dtypes and non-null counts per column to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [6]:
# The species label is the prediction target; all remaining columns are features.
y = df["species"]
X = df.drop(columns="species")

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Feature groups used by the column transformers below:
# numeric measurements first, then the categorical (string) columns.
numcols = [
    "culmen_length_mm",
    "culmen_depth_mm",
    "flipper_length_mm",
    "body_mass_g",
]
catcols = ["island", "sex"]

In [10]:
# Column-wise imputation: NaNs in the numeric columns are replaced by the
# column median, NaNs in the categorical columns by the most frequent value.
# The transformers are listed numeric-first, matching the numcols + catcols
# order used when the columns are re-labelled downstream.
imp = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(missing_values=np.nan, strategy="median"), numcols),
        ("cat", SimpleImputer(missing_values=np.nan, strategy="most_frequent"), catcols),
    ]
)

In [11]:
# Feature preprocessing: scale the numeric columns to [0, 1] and one-hot
# encode the categorical columns.
#
# handle_unknown="ignore" makes the encoder emit an all-zero row for a
# category that was not present when it was fitted. With the default
# ("error"), transform/predict raises ValueError as soon as the test split
# (or a CV fold) contains a category value that the training rows did not.
pre = ColumnTransformer(
    transformers=[
        ("numcol", MinMaxScaler(), numcols),
        ("catcol", OneHotEncoder(handle_unknown="ignore"), catcols),
    ]
)

In [12]:
# Combined column order (numeric first, then categorical) that is passed to
# ColumnExtractor in the pipelines.
[*numcols, *catcols]

['culmen_length_mm',
 'culmen_depth_mm',
 'flipper_length_mm',
 'body_mass_g',
 'island',
 'sex']

In [13]:
from imblearn.over_sampling import SMOTE

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.base import BaseEstimator, TransformerMixin
class ColumnExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that turns its input into a pandas DataFrame.

    Used between the imputing ColumnTransformer and the preprocessing
    ColumnTransformer so that the second one can select columns by name.

    Parameters
    ----------
    columns : list of str
        Passed as the ``columns`` argument of ``pd.DataFrame`` in
        ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, *_):
        """Learn nothing; any positional arguments are ignored. Returns self."""
        return self

    def transform(self, X, *_):
        """Return ``pd.DataFrame(X, columns=self.columns)``.

        Extra positional arguments are accepted and ignored.
        """
        as_frame = pd.DataFrame(X, columns=self.columns)
        return as_frame

# Baseline model pipeline:
#   impute     -> fill missing values per column group
#   ce         -> wrap the imputed values in a DataFrame with column names
#   preprocess -> scale numeric columns, one-hot encode categorical columns
#   model      -> logistic-regression classifier
pipe = Pipeline(
    steps=[
        ("impute", imp),
        ("ce", ColumnExtractor(numcols + catcols)),
        ("preprocess", pre),
        ("model", LogisticRegression()),
    ]
)

In [17]:
# Same pipeline as `pipe`, with SMOTE oversampling before the classifier.
#
# SMOTE is a sampler: it implements fit_resample, not transform. An
# sklearn.pipeline.Pipeline rejects it as an intermediate step (TypeError at
# fit time), so the imbalanced-learn Pipeline is used here. It accepts
# samplers and applies them only while fitting, never at predict time.
# The alias keeps the sklearn `Pipeline` name untouched for the other cells.
from imblearn.pipeline import Pipeline as ImbPipeline

ppipe = ImbPipeline(
    steps=[
        ("impute", imp),
        ("ce", ColumnExtractor(numcols + catcols)),
        ("preprocess", pre),
        # Seeded so the synthetic samples are reproducible, like the train/test split.
        ("smot", SMOTE(random_state=0)),
        ("model", LogisticRegression()),
    ]
)

In [None]:
# Fit the standalone imputer on the training features only, so no statistics
# are learned from the test split.
imp.fit(X_train)

In [None]:
# Store the imputed training data under its own name instead of overwriting
# X_train. `pipe` has `imp` as its first step and selects columns by name, so
# pipe.fit / pipe.predict need X_train to remain the original DataFrame;
# imp.transform returns a plain array without column names. A separate name
# also keeps this cell safe to re-run.
X_train_imputed = imp.transform(X_train)

In [None]:
# Store the imputed test data under its own name instead of overwriting
# X_test. `pipe` has `imp` as its first step and selects columns by name, so
# pipe.predict needs X_test to remain the original DataFrame; imp.transform
# returns a plain array without column names. A separate name also keeps this
# cell safe to re-run.
X_test_imputed = imp.transform(X_test)

In [19]:
# Fit every pipeline step (imputer, preprocessing, classifier) on the training split.
pipe.fit(X_train, y_train)

In [None]:
# Predicted species for the rows the pipeline was trained on.
y_pred_train1 = pipe.predict(X_train)

In [None]:
# Display the predicted training labels.
y_pred_train1

array(['Adelie', 'Gentoo', 'Gentoo', 'Adelie', 'Chinstrap', 'Gentoo',
       'Adelie', 'Adelie', 'Gentoo', 'Chinstrap', 'Chinstrap', 'Adelie',
       'Chinstrap', 'Adelie', 'Adelie', 'Adelie', 'Gentoo', 'Adelie',
       'Gentoo', 'Adelie', 'Adelie', 'Gentoo', 'Gentoo', 'Adelie',
       'Chinstrap', 'Gentoo', 'Gentoo', 'Chinstrap', 'Gentoo', 'Adelie',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Gentoo', 'Chinstrap',
       'Gentoo', 'Gentoo', 'Gentoo', 'Chinstrap', 'Gentoo', 'Gentoo',
       'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie', 'Adelie',
       'Chinstrap', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie',
       'Chinstrap', 'Adelie', 'Gentoo', 'Gentoo', 'Gentoo', 'Chinstrap',
       'Gentoo', 'Adelie', 'Adelie', 'Chinstrap', 'Adelie', 'Gentoo',
       'Gentoo', 'Adelie', 'Adelie', 'Gentoo', 'Gentoo', 'Adelie',
       'Chinstrap', 'Chinstrap', 'Gentoo', 'Gentoo', 'Chinstrap',
       'Chinstrap', 'Chinstrap', 'Gentoo', 'Adelie', 'Gentoo',
       'Chinstrap', 'Chinstrap', 'Chin

In [None]:
# Predictions from the fitted pipeline for both splits.
y_pred_train = pipe.predict(X_train)
y_pred_test = pipe.predict(X_test)

In [None]:
from sklearn.metrics import classification_report


# Per-class precision/recall/F1 on the training split. These scores describe
# the fit to the training data, not performance on held-out data.
print(classification_report(y_true=y_train, y_pred=y_pred_train))

              precision    recall  f1-score   support

      Adelie       1.00      1.00      1.00       109
   Chinstrap       1.00      1.00      1.00        59
      Gentoo       1.00      1.00      1.00        99

    accuracy                           1.00       267
   macro avg       1.00      1.00      1.00       267
weighted avg       1.00      1.00      1.00       267

