# Feature Engineering II

After this encounter you should be able to
- understand and apply transformer classes to your datasets,
- understand and apply pipeline classes to your datasets.

(The concepts are the same as in the intro to FE; what differs is how we transform the data.)

In [1]:
# stuff you know already
import pandas as pd

In [2]:
# new stuff !!
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


### 1. Get Data

In [3]:
# Load the cleaned penguins dataset from a CSV file given by a relative path.
df = pd.read_csv("all_penguins_clean.csv")

In [4]:
# Summarise the columns: dtypes and non-null counts (reveals where values are missing).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   studyName            344 non-null    object 
 1   Sample Number        344 non-null    int64  
 2   Species              344 non-null    object 
 3   Region               344 non-null    object 
 4   Island               344 non-null    object 
 5   Stage                344 non-null    object 
 6   Individual ID        344 non-null    object 
 7   Clutch Completion    344 non-null    object 
 8   Date Egg             344 non-null    object 
 9   Culmen Length (mm)   342 non-null    float64
 10  Culmen Depth (mm)    342 non-null    float64
 11  Flipper Length (mm)  342 non-null    float64
 12  Body Mass (g)        342 non-null    float64
 13  Real ID              344 non-null    object 
 14  Sex                  334 non-null    object 
dtypes: float64(4), int64(1), object(10)
memo

In [5]:
# Show the frame as the cell output (the rendered table is abbreviated to its first and last rows).
df

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Real ID,Sex
0,PAL0708,1,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,A_0,MALE
1,PAL0708,2,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,A_1,FEMALE
2,PAL0708,3,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,A_2,FEMALE
3,PAL0708,4,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,11/16/07,,,,,A_3,
4,PAL0708,5,Adelie,Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,11/16/07,36.7,19.3,193.0,3450.0,A_4,FEMALE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,PAL0910,120,Gentoo,Anvers,Biscoe,"Adult, 1 Egg Stage",N38A2,No,12/1/09,,,,,G_339,
340,PAL0910,121,Gentoo,Anvers,Biscoe,"Adult, 1 Egg Stage",N39A1,Yes,11/22/09,46.8,14.3,215.0,4850.0,G_340,FEMALE
341,PAL0910,122,Gentoo,Anvers,Biscoe,"Adult, 1 Egg Stage",N39A2,Yes,11/22/09,50.4,15.7,222.0,5750.0,G_341,MALE
342,PAL0910,123,Gentoo,Anvers,Biscoe,"Adult, 1 Egg Stage",N43A1,Yes,11/22/09,45.2,14.8,212.0,5200.0,G_342,FEMALE


In [6]:
# Remove the two penguins, identified by their "Real ID", whose measurement
# columns are empty in the table displayed above.
excluded_ids = ["A_3", "G_339"]
keep_mask = ~df["Real ID"].isin(excluded_ids)
df = df[keep_mask]

In [7]:
# Count the missing values that remain in each column after the filtering step.
df.isna().sum()

studyName              0
Sample Number          0
Species                0
Region                 0
Island                 0
Stage                  0
Individual ID          0
Clutch Completion      0
Date Egg               0
Culmen Length (mm)     0
Culmen Depth (mm)      0
Flipper Length (mm)    0
Body Mass (g)          0
Real ID                0
Sex                    8
dtype: int64

### 2. Train-Test Split

In [8]:
# Features: two categorical and two numeric columns. Target: the species label.
feature_columns = ["Region", "Sex", "Flipper Length (mm)", "Body Mass (g)"]
target_column = "Species"

X = df[feature_columns]
y = df[target_column]

In [9]:
# Hold out a test set; the fixed seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

# Display the shapes of all four pieces as the cell output.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((256, 4), (86, 4), (256,), (86,))

### 3. Explore the Data

### 4. Feature Engineering

In [10]:
# Group the feature columns by data type.
categoricals = [
    "Region",
    "Sex",
]
numeric = [
    "Flipper Length (mm)",
    "Body Mass (g)",
]

4.1 Use Column Transformers to transform cols of different data types

In [11]:
# ColumnTransformer takes a list of (name, transformer, columns) tuples and
# applies each transformer only to the columns listed for it.
initial_transformation = ColumnTransformer(
    [
        # Fill missing "Sex" values with the most frequent category; the values stay strings.
        ("imputer", SimpleImputer(strategy="most_frequent"), ["Sex"]),
        # `sparse_output` replaces the `sparse` argument, which was deprecated in
        # scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["Region"]),
        ("scaler", StandardScaler(), numeric),
    ]
)

In [12]:
# Fit the transformers on the training data and transform it in one step.
X_train_FE = initial_transformation.fit_transform(X_train)


In [13]:
# Apply the already-fitted transformers to the test set (transform only, no refit).
# Note in the output that the "Sex" column is imputed but still holds strings.
initial_transformation.transform(X_test)

array([['FEMALE', 1.0, 0.5695209023045531, 0.7542782882268587],
       ['MALE', 1.0, -0.15224816200220725, -0.5495598068372238],
       ['FEMALE', 1.0, -0.7296634134476155, -0.39055516109770155],
       ['MALE', 1.0, -0.36877888129423536, 0.2454634218603875],
       ['MALE', 1.0, -0.4409557877249114, 0.11825970526876968],
       ['FEMALE', 1.0, 0.497343995873877, 0.11825970526876968],
       ['MALE', 1.0, -0.00789434914085519, 0.11825970526876968],
       ['FEMALE', 1.0, -0.7296634134476155, -0.6449625942809372],
       ['MALE', 1.0, 1.5078206859033414, 1.962713595847228],
       ['MALE', 1.0, -1.3070786648930237, -0.8357681691683638],
       ['MALE', 1.0, -0.15224816200220725, -0.2633514445060837],
       ['MALE', 1.0, 1.0025823408886092, 1.5175005877765655],
       ['FEMALE', 1.0, 0.5695209023045531, 0.4998708550436231],
       ['MALE', 1.0, 1.0025823408886092, 1.3902968711849477],
       ['FEMALE', 1.0, -0.2244250684328833, -0.4859579485414149],
       ['MALE', 1.0, -0.3687788812942

In [14]:
# Instantiate the classifier without overriding any of its default settings.
clf = LogisticRegression()

In [15]:
# clf.fit(X_train_FE, y_train)  # doesn't work yet: the data needs further feature engineering before a linear model can use it

- Fitting a linear model on this output raises an error, because the model needs purely numeric data and the imputed "Sex" column still contains strings.
- This means that, in addition to imputation, we need to transform the "Sex" column further (i.e. one-hot encode it).

=> Using the `Pipeline` class of sklearn we can apply both steps in one go.

4.2 Use pipelines for applying transformations in steps

In [16]:
# A Pipeline takes a list of (name, transformer) tuples and applies them in order.
# No columns are named here; the enclosing ColumnTransformer selects them.
pipeline_sex = Pipeline(
    [
        # Step 1: fill missing values with the most frequent category.
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # Step 2: one-hot encode the imputed values; drop="if_binary" keeps a single
        # column for a two-category feature. `sparse_output` replaces the `sparse`
        # argument, which was deprecated in scikit-learn 1.2 and removed in 1.4.
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop="if_binary")),
    ]
)

In [17]:
# Final preprocessing: every feature column ends up numeric, so the result can
# be passed to a linear model.
final_transformation = ColumnTransformer(
    [
        # "Sex" goes through the two-step pipeline (impute, then one-hot encode).
        ("transform twice Sex-col", pipeline_sex, ["Sex"]),
        # `sparse_output` replaces the `sparse` argument, which was deprecated in
        # scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["Region"]),
        ("scaler", StandardScaler(), numeric),
    ]
)

In [18]:
# Fit the final transformers on the training data and transform it.
# This overwrites the X_train_FE produced earlier by initial_transformation.
X_train_FE = final_transformation.fit_transform(X_train)

In [19]:
# The test set is only transformed, using what was fitted on the training data.
X_test_FE = final_transformation.transform(X_test)

In [20]:
# Train the classifier on the engineered training features.
clf.fit(X_train_FE, y_train)

LogisticRegression()

In [25]:
# Predicted class probabilities for each test sample (one column per class).
clf.predict_proba(X_test_FE)

array([[4.96623617e-03, 2.32063731e-02, 9.71827391e-01],
       [4.88157670e-01, 5.07972636e-01, 3.86969427e-03],
       [6.93176652e-01, 2.84660918e-01, 2.21624303e-02],
       [6.65521064e-01, 3.21082574e-01, 1.33963626e-02],
       [6.93425689e-01, 2.99201731e-01, 7.37258017e-03],
       [2.76075681e-02, 1.39176928e-01, 8.33215504e-01],
       [4.55719814e-01, 5.09074731e-01, 3.52054549e-02],
       [6.81054969e-01, 3.07178672e-01, 1.17663589e-02],
       [1.44041875e-04, 2.27632584e-03, 9.97579632e-01],
       [9.16149562e-01, 8.38320902e-02, 1.83474311e-05],
       [5.12229440e-01, 4.79655182e-01, 8.11537879e-03],
       [4.15996331e-03, 2.55347240e-02, 9.70305313e-01],
       [8.93859628e-03, 4.58748015e-02, 9.45186602e-01],
       [5.58994634e-03, 3.59593468e-02, 9.58450707e-01],
       [3.92606361e-01, 5.06624063e-01, 1.00769576e-01],
       [6.19489320e-01, 3.77834132e-01, 2.67654828e-03],
       [7.91559303e-01, 2.08383066e-01, 5.76311245e-05],
       [5.03478638e-01, 4.81403

In [23]:
clf.score(X_train_FE, y_train) # note: score() makes the predictions and then scores them "under the hood", so there is no need to call clf.predict() first

0.828125

In [24]:
# Score the classifier on the held-out test set for comparison with the training score.
clf.score(X_test_FE, y_test)

0.8372093023255814