# Creating features with .assign() 

source: https://www.kaggle.com/code/ryanholbrook/creating-features/tutorial

## Import libraries

In [None]:
import numpy as np
import pandas as pd

## Load data

data source: https://www.kaggle.com/code/ryanholbrook/creating-features/data

In [None]:
# Load the Ames dataset from a CSV path relative to the working directory.
ames = pd.read_csv("data/ames.csv")

## Data

In [None]:
# Preview the first rows (head() defaults to five).
ames.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YearSold,SaleType,SaleCondition,SalePrice
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141.0,31770.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,0.0,No_Pool,No_Fence,,0.0,5,2010,WD,Normal,215000
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80.0,11622.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,0.0,No_Pool,Minimum_Privacy,,0.0,6,2010,WD,Normal,105000
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81.0,14267.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,0.0,No_Pool,No_Fence,Gar2,12500.0,6,2010,WD,Normal,172000
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93.0,11160.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,0.0,No_Pool,No_Fence,,0.0,4,2010,WD,Normal,244000
4,Two_Story_1946_and_Newer,Residential_Low_Density,74.0,13830.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,0.0,No_Pool,Minimum_Privacy,,0.0,3,2010,WD,Normal,189900


## `.assign()` vs dictionary-style (index) assignment

In [None]:
# Index-assignment approach: work on a copy so `ames` is left untouched,
# then add the new column to that copy in place.
ames_bad = ames.copy()
ames_bad["LogSalePrice"] = np.log(ames_bad["SalePrice"])
ames_bad.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YearSold,SaleType,SaleCondition,SalePrice,LogSalePrice
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141.0,31770.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Pool,No_Fence,,0.0,5,2010,WD,Normal,215000,12.278393
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80.0,11622.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,No_Pool,Minimum_Privacy,,0.0,6,2010,WD,Normal,105000,11.561716
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81.0,14267.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Pool,No_Fence,Gar2,12500.0,6,2010,WD,Normal,172000,12.05525
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93.0,11160.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,No_Pool,No_Fence,,0.0,4,2010,WD,Normal,244000,12.404924
4,Two_Story_1946_and_Newer,Residential_Low_Density,74.0,13830.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,No_Pool,Minimum_Privacy,,0.0,3,2010,WD,Normal,189900,12.154253


In [None]:
# Chained approach: .assign() returns a new frame carrying the extra column,
# so `ames` itself is not modified. The lambda receives the frame being
# assigned to.
ames.assign(LogSalePrice=lambda df: np.log(df["SalePrice"])).head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YearSold,SaleType,SaleCondition,SalePrice,LogSalePrice
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141.0,31770.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Pool,No_Fence,,0.0,5,2010,WD,Normal,215000,12.278393
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80.0,11622.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,No_Pool,Minimum_Privacy,,0.0,6,2010,WD,Normal,105000,11.561716
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81.0,14267.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,No_Pool,No_Fence,Gar2,12500.0,6,2010,WD,Normal,172000,12.05525
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93.0,11160.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,No_Pool,No_Fence,,0.0,4,2010,WD,Normal,244000,12.404924
4,Two_Story_1946_and_Newer,Residential_Low_Density,74.0,13830.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,No_Pool,Minimum_Privacy,,0.0,3,2010,WD,Normal,189900,12.154253


### Example: index-assignment method

In [None]:
def index_assigment_transform(raw_df):
    """Add engineered features to a copy of ``raw_df`` using index assignment.

    Each group of features is built in its own small frame by assigning
    columns with ``frame[name] = ...``; the groups are then joined onto the
    copy by index. ``raw_df`` itself is not modified.

    Returns the copy with these columns appended, in order: ``LivLotRatio``,
    ``Spaciousness``, ``TotalOutsideSF``, one ``Bldg_<type>`` column per
    ``BldgType`` value, ``PorchTypes``, ``MSClass`` and ``MedNhbdArea``.
    """
    base = raw_df.copy()
    porch_cols = [
        "WoodDeckSF",
        "OpenPorchSF",
        "EnclosedPorch",
        "Threeseasonporch",
        "ScreenPorch",
    ]

    # Ratios and sums of existing numeric columns.
    math_feats = pd.DataFrame()
    math_feats["LivLotRatio"] = base["GrLivArea"] / base["LotArea"]
    floor_area = base["FirstFlrSF"] + base["SecondFlrSF"]
    math_feats["Spaciousness"] = floor_area / base["TotRmsAbvGrd"]
    math_feats["TotalOutsideSF"] = (
        base["WoodDeckSF"]
        + base["OpenPorchSF"]
        + base["EnclosedPorch"]
        + base["Threeseasonporch"]
        + base["ScreenPorch"]
    )

    # One-hot building type, scaled row-wise by the living area.
    bldg_dummies = pd.get_dummies(base["BldgType"], prefix="Bldg")
    bldg_area = bldg_dummies.mul(base["GrLivArea"], axis=0)

    # Number of porch/deck columns holding a value above zero.
    porch_feats = pd.DataFrame()
    porch_feats["PorchTypes"] = base[porch_cols].gt(0.0).sum(axis=1)

    # Text before the first underscore of MSSubClass.
    class_feats = pd.DataFrame()
    class_feats["MSClass"] = base["MSSubClass"].str.split("_", n=1, expand=True)[0]

    # Per-neighborhood median living area, broadcast back onto every row.
    nhbd_feats = pd.DataFrame()
    nhbd_feats["MedNhbdArea"] = (
        base.groupby("Neighborhood")["GrLivArea"].transform("median")
    )

    return base.join([math_feats, bldg_area, porch_feats, class_feats, nhbd_feats])

# Show the first 10 rows of the index-assignment result.
index_assigment_transform(ames).head(10)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Spaciousness,TotalOutsideSF,Bldg_Duplex,Bldg_OneFam,Bldg_Twnhs,Bldg_TwnhsE,Bldg_TwoFmCon,PorchTypes,MSClass,MedNhbdArea
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141.0,31770.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,236.571429,272.0,0.0,1656.0,0.0,0.0,0.0,2,One,1200.0
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80.0,11622.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,179.2,260.0,0.0,896.0,0.0,0.0,0.0,2,One,1200.0
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81.0,14267.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,221.5,429.0,0.0,1329.0,0.0,0.0,0.0,2,One,1200.0
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93.0,11160.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,263.75,0.0,0.0,2110.0,0.0,0.0,0.0,0,One,1200.0
4,Two_Story_1946_and_Newer,Residential_Low_Density,74.0,13830.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,271.5,246.0,0.0,1629.0,0.0,0.0,0.0,2,Two,1560.0
5,Two_Story_1946_and_Newer,Residential_Low_Density,78.0,9978.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,229.142857,396.0,0.0,1604.0,0.0,0.0,0.0,2,Two,1560.0
6,One_Story_PUD_1946_and_Newer,Residential_Low_Density,41.0,4920.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,223.0,170.0,0.0,0.0,0.0,1338.0,0.0,1,One,1767.0
7,One_Story_PUD_1946_and_Newer,Residential_Low_Density,43.0,5005.0,Pave,No_Alley_Access,Slightly_Irregular,HLS,AllPub,Inside,...,256.0,226.0,0.0,0.0,0.0,1280.0,0.0,2,One,1767.0
8,One_Story_PUD_1946_and_Newer,Residential_Low_Density,39.0,5389.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,323.2,389.0,0.0,0.0,0.0,1616.0,0.0,2,One,1767.0
9,Two_Story_1946_and_Newer,Residential_Low_Density,60.0,7500.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,257.714286,200.0,0.0,1804.0,0.0,0.0,0.0,2,Two,1560.0


### Chain method

In [None]:
def chain_transform(raw_df):
    """Add engineered features to a copy of ``raw_df`` with chained ``.assign()``.

    Appends, in order: ``LivLotRatio``, ``Spaciousness``, ``TotalOutsideSF``,
    one ``Bldg_<type>`` column per ``BldgType`` value, ``PorchTypes``,
    ``MSClass`` and ``MedNhbdArea``. ``raw_df`` itself is not modified.

    The result is meant to match ``index_assigment_transform`` on the same
    input, including how missing values are handled.
    """
    porch_cols = [
        "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
        "Threeseasonporch", "ScreenPorch",
    ]

    X_new = (
        raw_df.copy()
        # mathematical transforms
        .assign(
            LivLotRatio=lambda df: df.GrLivArea / df.LotArea,
            Spaciousness=lambda df: (df.FirstFlrSF + df.SecondFlrSF) / df.TotRmsAbvGrd,
            # Explicit addition rather than .sum(axis=1): sum() skips NaN by
            # default, whereas `+` propagates it, which is what
            # index_assigment_transform does.
            TotalOutsideSF=lambda df: (
                df.WoodDeckSF
                + df.OpenPorchSF
                + df.EnclosedPorch
                + df.Threeseasonporch
                + df.ScreenPorch
            ),
        )
        # interaction with categorical
        # (built from raw_df because ** unpacking cannot take a lambda; the
        # chained frame has the same BldgType/GrLivArea values at this point)
        .assign(
            **pd.get_dummies(raw_df.BldgType, prefix="Bldg").mul(raw_df.GrLivArea, axis=0)
        )
        # count feature
        .assign(
            PorchTypes=lambda df: df[porch_cols].gt(0.0).sum(axis=1)
        )
        # string
        .assign(
            MSClass=lambda df: df.MSSubClass.str.split("_", n=1, expand=True)[0]
        )
        # groupby
        .assign(
            MedNhbdArea=lambda df: df.groupby("Neighborhood")["GrLivArea"].transform("median")
        )
    )

    return X_new

# Show the first 10 rows of the chained-assign result.
chain_transform(ames).head(10)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Spaciousness,TotalOutsideSF,Bldg_Duplex,Bldg_OneFam,Bldg_Twnhs,Bldg_TwnhsE,Bldg_TwoFmCon,PorchTypes,MSClass,MedNhbdArea
0,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,141.0,31770.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,236.571429,272.0,0.0,1656.0,0.0,0.0,0.0,2,One,1200.0
1,One_Story_1946_and_Newer_All_Styles,Residential_High_Density,80.0,11622.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,179.2,260.0,0.0,896.0,0.0,0.0,0.0,2,One,1200.0
2,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,81.0,14267.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Corner,...,221.5,429.0,0.0,1329.0,0.0,0.0,0.0,2,One,1200.0
3,One_Story_1946_and_Newer_All_Styles,Residential_Low_Density,93.0,11160.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Corner,...,263.75,0.0,0.0,2110.0,0.0,0.0,0.0,0,One,1200.0
4,Two_Story_1946_and_Newer,Residential_Low_Density,74.0,13830.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,271.5,246.0,0.0,1629.0,0.0,0.0,0.0,2,Two,1560.0
5,Two_Story_1946_and_Newer,Residential_Low_Density,78.0,9978.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,229.142857,396.0,0.0,1604.0,0.0,0.0,0.0,2,Two,1560.0
6,One_Story_PUD_1946_and_Newer,Residential_Low_Density,41.0,4920.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,223.0,170.0,0.0,0.0,0.0,1338.0,0.0,1,One,1767.0
7,One_Story_PUD_1946_and_Newer,Residential_Low_Density,43.0,5005.0,Pave,No_Alley_Access,Slightly_Irregular,HLS,AllPub,Inside,...,256.0,226.0,0.0,0.0,0.0,1280.0,0.0,2,One,1767.0
8,One_Story_PUD_1946_and_Newer,Residential_Low_Density,39.0,5389.0,Pave,No_Alley_Access,Slightly_Irregular,Lvl,AllPub,Inside,...,323.2,389.0,0.0,0.0,0.0,1616.0,0.0,2,One,1767.0
9,Two_Story_1946_and_Newer,Residential_Low_Density,60.0,7500.0,Pave,No_Alley_Access,Regular,Lvl,AllPub,Inside,...,257.714286,200.0,0.0,1804.0,0.0,0.0,0.0,2,Two,1560.0


In [None]:
# Both implementations must produce identical frames. assert_frame_equal
# raises an AssertionError describing what differs, instead of the bare
# AssertionError from `assert a.equals(b)`.
pd.testing.assert_frame_equal(
    index_assigment_transform(ames),
    chain_transform(ames),
    check_exact=True,
)

In [None]:
import timeit

def wrapper(func, *args, **kwargs):
    """Bind ``func`` to the given arguments and return a zero-argument callable.

    Calling the returned callable invokes ``func(*args, **kwargs)`` and
    returns its result; nothing is called at bind time.
    """
    return lambda: func(*args, **kwargs)


# Re-read the CSV into its own frame for the benchmark.
raw_df = pd.read_csv("data/ames.csv")

# timeit.timeit needs zero-argument callables, so bind the frame up front.
variable = wrapper(index_assigment_transform, raw_df)
chain = wrapper(chain_transform, raw_df)

# Total elapsed time for 100 calls of each implementation.
variable_seconds = timeit.timeit(variable, number=100)
chain_seconds = timeit.timeit(chain, number=100)

print(f'Variables took {variable_seconds:.2f} seconds')
print(f'Chaining took {chain_seconds:.2f} seconds')

Variables took 0.91 seconds
Chaining took 0.94 seconds


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a605a3e6-1564-47b2-94e7-842290ba7692' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>