In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the training data from data/train.csv; the first CSV column becomes the row index.
df = pd.read_csv("data/train.csv", index_col=[0])

In [3]:
# Named row filters as [label, boolean mask] pairs, one mask entry per row of `df`.
# The masks are built once here and handed to the filtering helper in a later cell.
r_masks = [
    # rows whose MSZoning value is exactly "RM"
    ["zoning", df["MSZoning"].eq("RM")],
    # rows whose LotConfig is one of the two listed values
    ["lotconfig", df["LotConfig"].isin(["Inside", "FR2"])],
    # rows with LotArea strictly between 8000 and 12000 (both bounds excluded)
    ["lotarea", df["LotArea"].gt(8000) & df["LotArea"].lt(12_000)],
]

In [4]:
def vectorised_filter(df, input_masks):
    """Apply several boolean row masks to ``df`` at once and report their impact.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    input_masks : iterable of (name, mask) pairs
        ``mask`` is a boolean array-like with one entry per row of ``df``
        (``True`` means "keep the row"). Masks are converted to plain boolean
        arrays, so they are applied by position, in the order given.

    Returns
    -------
    filtered_df : pandas.DataFrame
        The rows of ``df`` for which every mask is ``True``.
    stats : pandas.DataFrame
        One row per mask (indexed by name, in input order), rounded to two
        decimals, with columns:

        * ``absolute`` -- percent change in row count if this mask alone were
          applied to the full frame (negative means rows were removed);
        * ``relative`` -- percent change in row count versus the rows that
          survived all preceding masks (the first mask is compared with the
          full frame).

        A value is NaN when its baseline row count is zero.
    """
    # Materialise once so a generator can be passed and iterated safely.
    pairs = list(input_masks)
    n_rows = df.shape[0]

    # No filters: nothing is removed and there are no statistics to report.
    if not pairs:
        return df.copy(), pd.DataFrame(columns=["absolute", "relative"], dtype=float)

    # Names come from the argument, not from any notebook-level variable.
    names = [name for name, _ in pairs]

    # One column per filter, one row per row of df.
    stacked = np.column_stack([np.asarray(mask, dtype=bool) for _, mask in pairs])

    # Rows kept by each filter on its own.
    kept_alone = stacked.sum(axis=0)

    # Column k is the AND of filters 0..k, i.e. the rows still alive after step k.
    cumulative = np.logical_and.accumulate(stacked, axis=1)
    kept_so_far = cumulative.sum(axis=0)

    # Baseline for step k: full frame for the first filter, otherwise the
    # number of rows that survived steps 0..k-1.
    baseline = np.concatenate(([n_rows], kept_so_far[:-1]))

    # A zero baseline yields NaN; silence the accompanying numpy warning.
    with np.errstate(divide="ignore", invalid="ignore"):
        absolute = (kept_alone - n_rows) / n_rows * 100
        relative = (kept_so_far - baseline) / baseline * 100

    stats = pd.DataFrame({"absolute": absolute, "relative": relative}, index=names)

    # The last cumulative column is the conjunction of every filter.
    filtered_df = df.loc[cumulative[:, -1]]

    return filtered_df, stats.round(2)

In [5]:
# a: the rows of df that pass every mask; b: per-filter removal statistics (percent).
a, b = vectorised_filter(df, r_masks)

In [6]:
# Preview the first rows of the filtered frame.
a.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
64,70,RM,50.0,10300,Pave,,IR1,Bnk,AllPub,Inside,...,0,,GdPrv,,0,4,2010,WD,Normal,140000
78,50,RM,50.0,8635,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,1,2008,WD,Normal,127000
107,30,RM,60.0,10800,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,,,Shed,450,8,2007,WD,Normal,100000
145,90,RM,70.0,9100,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,11,2006,ConLI,Abnorml,125000
180,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,7,2007,WD,Normal,100000


In [7]:
# Show the per-filter statistics table (absolute and relative percent change in rows).
b.head()

Unnamed: 0,absolute,relative
zoning,-85.07,-85.07
lotconfig,-24.73,-18.35
lotarea,-51.44,-78.65
