# 12. Pre-processing pipelines

In [1]:
import warnings
import joblib

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, FunctionTransformer, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the prepared listings frame and discard the `price` column.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you trust.
df = pd.read_pickle("./pickles/001.dataframe.data-load.pkl").drop(columns="price")
df.head()

Unnamed: 0,listing_id,name,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,...,minimum_nights,maximum_nights,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable
0,281420,"Beautiful Flat in le Village Montmartre, Paris",1466919,2011-12-03,"Paris, Ile-de-France, France",,,,False,1,...,2,1125,100,10,10,10,10,10,10,False
1,3705183,39 mÃÂ² Paris (Sacre CÃâur),10328771,2013-11-29,"Paris, Ile-de-France, France",,,,False,1,...,2,1125,100,10,10,10,10,10,10,False
2,4082273,"Lovely apartment with Terrace, 60m2",19252768,2014-07-31,"Paris, Ile-de-France, France",,,,False,1,...,2,1125,100,10,10,10,10,10,10,False
3,4797344,Cosy studio (close to Eiffel tower),10668311,2013-12-17,"Paris, Ile-de-France, France",,,,False,1,...,2,1125,100,10,10,10,10,10,10,False
4,4823489,Close to Eiffel Tower - Beautiful flat : 2 rooms,24837558,2014-12-14,"Paris, Ile-de-France, France",,,,False,1,...,2,1125,100,10,10,10,10,10,10,False


## 12.1. MultiLabelBinarizer | `amenities`

In [3]:
# Amenity labels that get their own indicator column (alphabetical order,
# which is also the column order produced by the binarizer below).
classes = [
    "Air conditioning", "Bed linens", "Breakfast", "Cable TV",
    "Carbon monoxide alarm", "Coffee maker", "Cooking basics",
    "Dedicated workspace", "Dishes and silverware", "Dishwasher", "Dryer",
    "Elevator", "Essentials", "Extra pillows and blankets",
    "Fire extinguisher", "First aid kit", "Free parking on premises", "Free street parking",
    "Garden or backyard", "Gym",
    "Hair dryer", "Hangers", "Heating", "Host greets you", "Hot water",
    "Iron", "Kitchen",
    "Lock on bedroom door", "Long term stays allowed", "Luggage dropoff allowed",
    "Microwave", "Oven",
    "Paid parking off premises", "Patio or balcony", "Pool", "Private entrance",
    "Refrigerator",
    "Shampoo", "Smoke alarm", "Stove",
    "TV", "Washer", "Wifi",
]

In [4]:
# Binarizer restricted to the curated `classes` list, fitted on the amenities column.
mlb = MultiLabelBinarizer(classes=classes).fit(df["amenities"])
mlb

In [5]:
# Persist the fitted binarizer next to the other pickled artefacts.
joblib.dump(value=mlb, filename="./pickles/MultiLabelBinarizer.joblib")

['./pickles/MultiLabelBinarizer.joblib']

In [6]:
# Preview the binarized amenities, one indicator column per entry of `mlb.classes_`.
# All warnings are silenced for this cell only — presumably to hide the binarizer's
# warning about amenities that are not in `classes` (TODO confirm).
with warnings.catch_warnings():
    warnings.simplefilter(action="ignore")
    display(
        pd.DataFrame(
            data=mlb.transform(df["amenities"]),
            columns=mlb.classes_,
        )
    )

Unnamed: 0,Air conditioning,Bed linens,Breakfast,Cable TV,Carbon monoxide alarm,Coffee maker,Cooking basics,Dedicated workspace,Dishes and silverware,Dishwasher,...,Patio or balcony,Pool,Private entrance,Refrigerator,Shampoo,Smoke alarm,Stove,TV,Washer,Wifi
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279707,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,1,1
279708,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,1
279709,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,1
279710,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,1,1,0,1,1,1


## 12.2. OneHotEncoder | `room_type`

In [7]:
# Dense one-hot encoder for `room_type`; an unseen category at transform time raises an error.
enc_room_type = OneHotEncoder(sparse_output=False, handle_unknown="error").fit(df[["room_type"]])
enc_room_type

In [8]:
# Category labels learned for the single fitted column (`room_type`), as a plain Python list.
enc_room_type.categories_[0].tolist()

['Entire place', 'Hotel room', 'Private room', 'Shared room']

## 12.3. FunctionTransformer | `host_is_superhost` `host_has_profile_pic` `host_identity_verified` `instant_bookable`

In [9]:
# Names of every column whose dtype compares equal to 'bool'.
s_bool_columns = df.columns[(df.dtypes == 'bool').to_numpy()]
s_bool_columns.to_list()

['host_is_superhost',
 'host_has_profile_pic',
 'host_identity_verified',
 'instant_bookable']

In [10]:
# Cast the boolean flags to integers (False -> 0, True -> 1).
fun_tr = FunctionTransformer(func=lambda flags: flags.astype(int))

fun_tr.fit_transform(
    df[
        [
            "host_is_superhost",
            "host_has_profile_pic",
            "host_identity_verified",
            "instant_bookable",
        ]
    ]
)

Unnamed: 0,host_is_superhost,host_has_profile_pic,host_identity_verified,instant_bookable
0,0,1,0,0
1,0,1,1,0
2,0,1,0,0
3,0,1,1,0
4,0,1,0,0
...,...,...,...,...
279707,0,1,1,0
279708,0,1,1,0
279709,0,1,1,0
279710,0,1,1,0


## 12.4. ReviewScoresImputer & SimpleImputer | `review_scores_*`

### 12.4.1. ReviewScoresImputer

In [11]:
# All `review_scores_*` columns, overall rating first, then the individual aspects.
review_scores_columns = [
    f"review_scores_{aspect}"
    for aspect in (
        "rating",
        "accuracy",
        "cleanliness",
        "checkin",
        "communication",
        "location",
        "value",
    )
]

In [12]:
class ReviewScoresImputer(BaseEstimator, TransformerMixin):
    """Fill a listing's missing review scores from its own remaining scores.

    For every row that has at least one non-missing value among
    ``review_scores_columns``, each missing score is replaced by the floor of
    the row-wise mean of the available scores.  ``review_scores_rating`` is
    divided by 10 before averaging and the fill value is multiplied by 10 when
    written back to it, so it is treated as being on a ten-times larger scale
    than the other scores.  Rows in which every review score is missing are
    left untouched (a later imputer has to handle them).

    The transformer works on the frame it is given (``X``), not on any
    notebook-level DataFrame, and returns a modified copy instead of mutating
    its input.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with row-wise imputed review scores.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column listed in ``review_scores_columns``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X``; only missing review-score cells are changed.
        """
        X = X.copy()  # never write into the caller's frame

        # Work on a float copy so the rescaling below cannot leak into X.
        scores = X[review_scores_columns].astype(float)
        scores["review_scores_rating"] /= 10

        # Row mean skips missing values; it is NaN only when all scores are missing,
        # in which case filling with it below is a no-op.
        row_fill = np.floor(scores.mean(axis=1))

        for column in review_scores_columns:
            scale = 10 if column == "review_scores_rating" else 1
            X[column] = X[column].fillna(row_fill * scale)

        return X

### 12.4.2. SimpleImputer

In [13]:
# Column-mean imputer for whatever is still missing in the review-score columns.
imp = SimpleImputer(strategy='mean', missing_values=np.nan).fit(df[review_scores_columns])
imp

In [14]:
# Round the (mean-imputed) scores down to whole numbers, column by column.
fun_tr_after_simple = FunctionTransformer(func=lambda scores: scores.apply(np.floor))

fun_tr_after_simple.fit(df.loc[:, review_scores_columns])

## 12.5. FunctionTransformer | `latitude` `longitude`

In [15]:
# Latitude is bounded by +/-90 degrees, so dividing by 90 maps it into [-1, 1].
fun_tr_latitude = FunctionTransformer(func=lambda degrees: degrees / 90).fit(df["latitude"])
fun_tr_latitude

In [16]:
# Longitude is bounded by +/-180 degrees, so dividing by 180 maps it into [-1, 1].
fun_tr_longitude = FunctionTransformer(func=lambda degrees: degrees / 180).fit(df["longitude"])
fun_tr_longitude

## 12.6. SimpleImputer | `bedrooms` `host_total_listings_count`

In [17]:
# Column-mean imputer for the two count columns.
imp_bed_host = SimpleImputer(strategy='mean', missing_values=np.nan).fit(
    df[["host_total_listings_count", "bedrooms"]]
)
imp_bed_host

## 12.7. MinMaxScaler | `accommodates` `bedrooms` `host_total_listings_count`

In [18]:
# Min-max scaler fitted on the three count-like columns.
columns_mm_scaler = ["accommodates", "bedrooms", "host_total_listings_count"]
mm_scaler = MinMaxScaler().fit(df.loc[:, columns_mm_scaler])
mm_scaler

## 12.8. Pipeline

In [19]:
# Reminder of the raw input columns before assembling the pipeline.
df.head(n=5)

Unnamed: 0,listing_id,name,host_id,host_since,host_location,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_total_listings_count,...,minimum_nights,maximum_nights,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable
0,281420,"Beautiful Flat in le Village Montmartre, Paris",1466919,2011-12-03,"Paris, Ile-de-France, France",,,,False,1,...,2,1125,100,10,10,10,10,10,10,False
1,3705183,39 mÃÂ² Paris (Sacre CÃâur),10328771,2013-11-29,"Paris, Ile-de-France, France",,,,False,1,...,2,1125,100,10,10,10,10,10,10,False
2,4082273,"Lovely apartment with Terrace, 60m2",19252768,2014-07-31,"Paris, Ile-de-France, France",,,,False,1,...,2,1125,100,10,10,10,10,10,10,False
3,4797344,Cosy studio (close to Eiffel tower),10668311,2013-12-17,"Paris, Ile-de-France, France",,,,False,1,...,2,1125,100,10,10,10,10,10,10,False
4,4823489,Close to Eiffel Tower - Beautiful flat : 2 rooms,24837558,2014-12-14,"Paris, Ile-de-France, France",,,,False,1,...,2,1125,100,10,10,10,10,10,10,False


In [20]:
# Amenity labels used by MyMultiLabelBinarizer and LastToDataFrameTransformer below.
# NOTE(review): this is a verbatim repeat of the `classes` list in section 12.1;
# the two copies must be kept identical (ideally define the list only once).
classes = [
    "Air conditioning", "Bed linens", "Breakfast", "Cable TV",
    "Carbon monoxide alarm", "Coffee maker", "Cooking basics",
    "Dedicated workspace", "Dishes and silverware", "Dishwasher", "Dryer",
    "Elevator", "Essentials", "Extra pillows and blankets",
    "Fire extinguisher", "First aid kit", "Free parking on premises", "Free street parking",
    "Garden or backyard", "Gym",
    "Hair dryer", "Hangers", "Heating", "Host greets you", "Hot water",
    "Iron", "Kitchen",
    "Lock on bedroom door", "Long term stays allowed", "Luggage dropoff allowed",
    "Microwave", "Oven",
    "Paid parking off premises", "Patio or balcony", "Pool", "Private entrance",
    "Refrigerator",
    "Shampoo", "Smoke alarm", "Stove",
    "TV", "Washer", "Wifi",
]

In [21]:
# https://stackoverflow.com/a/68420872/13165967
class ColumnDropperTransformer():
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to remove in ``transform``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None):
        """Return a new frame without ``self.columns``; ``X`` itself is not modified."""
        return X.drop(columns=self.columns)

In [22]:
# https://stackoverflow.com/a/46619402/13165967
class MyMultiLabelBinarizer(BaseEstimator, TransformerMixin):
    """Adapter that lets ``MultiLabelBinarizer`` run on a one-column DataFrame.

    The wrapped encoder is restricted to the notebook-level ``classes`` list.
    Both ``fit`` and ``transform`` operate on the first column of ``x`` (the
    column of label collections), so the two methods see the same data.
    """

    def fit(self, x, y=0):
        """Create and fit the wrapped encoder on the first column of ``x``."""
        self.encoder = MultiLabelBinarizer(classes=classes)
        # Fit on the label column itself, not on the enclosing frame
        # (iterating a DataFrame yields its column names, not its rows).
        self.encoder.fit(x.iloc[:, 0])
        return self

    def transform(self, x, y=0):
        """Return the indicator matrix for the first column of ``x``."""
        # All warnings are suppressed while transforming — presumably to hide the
        # encoder's warning about labels that are not in `classes` (TODO confirm).
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return self.encoder.transform(x.iloc[:, 0])

In [23]:
class LastToDataFrameTransformer():
    """Final pipeline step: label the stacked feature array and reorder its columns.

    The incoming array is expected in the column order emitted by the
    ColumnTransformer of ``trans``.  Positions 17 and 18 are given an empty
    name and dropped; the remaining columns are returned in the fixed order
    defined in ``transform``.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a DataFrame with named, reordered columns."""
        review_scores = [
            "review_scores_rating",
            "review_scores_accuracy",
            "review_scores_cleanliness",
            "review_scores_checkin",
            "review_scores_communication",
            "review_scores_location",
            "review_scores_value",
        ]
        room_types = ["Entire place", "Hotel room", "Private room", "Shared room"]

        # Names in the order the columns arrive.
        incoming = [
            *review_scores,
            *room_types,
            "host_is_superhost",
            "host_has_profile_pic",
            "host_identity_verified",
            "instant_bookable",
            "latitude",
            "longitude",
            "",  # position 17: discarded below
            "",  # position 18: discarded below
            "accommodates",
            "bedrooms",
            "host_total_listings_count",
            *classes,
        ]

        # Names in the order the columns are handed on.
        outgoing = [
            "host_is_superhost",
            "host_total_listings_count",
            "host_has_profile_pic",
            "host_identity_verified",
            "latitude",
            "longitude",
            "accommodates",
            "bedrooms",
            *review_scores,
            "instant_bookable",
            *classes,
            *room_types,
        ]

        frame = pd.DataFrame(X, columns=incoming)
        unnamed = frame.columns[17:19]
        return frame.drop(columns=unnamed).reindex(columns=outgoing)

In [24]:
# End-to-end pre-processing pipeline:
#   1. drop the columns that are not used as features,
#   2. fill missing review scores row-wise (ReviewScoresImputer),
#   3. encode / scale / impute the remaining columns (ColumnTransformer),
#   4. turn the stacked array back into a named DataFrame (LastToDataFrameTransformer).
#
# Every ColumnTransformer branch reads the columns as they enter the ColumnTransformer;
# branches do NOT feed each other.  The "MinMaxScaler" branch therefore has to impute
# its own input, otherwise missing `bedrooms` / `host_total_listings_count` values stay
# NaN in the columns that LastToDataFrameTransformer keeps.  Mean imputation does not
# move a column's minimum or maximum, so non-missing values scale exactly as before.
#
# The order and width of the branches must not change: LastToDataFrameTransformer names
# the output columns by position and discards positions 17-18, i.e. the output of the
# "SimpleImputer mean" branch, which is kept only to preserve that layout.
#
# NOTE(review): the lambda-based FunctionTransformers cannot be pickled with the
# standard pickle module; replace them with named functions if `trans` must be persisted.
trans = Pipeline([
    ("ColumnDropperTransformer", ColumnDropperTransformer([
        "listing_id",
        "name",
        "host_id",
        "host_location",
        "host_since",
        "district",
        "host_response_time",
        "host_response_rate",
        "host_acceptance_rate",
        "neighbourhood",
        "city",
        "property_type",
        "minimum_nights",
        "maximum_nights",
    ])),
    ("ReviewScoresImputer", ReviewScoresImputer()),
    ("ColumnTransformer", ColumnTransformer(
        transformers=[
            ("SimpleImputer review_scores_*", Pipeline([
                ("SimpleImputer", SimpleImputer(missing_values=np.nan, strategy='mean')),
                # np.floor is passed directly; no lambda wrapper is needed.
                ("FunctionTransformer", FunctionTransformer(np.floor)),
            ]), [
                "review_scores_rating",
                "review_scores_accuracy",
                "review_scores_cleanliness",
                "review_scores_checkin",
                "review_scores_communication",
                "review_scores_location",
                "review_scores_value",
            ]),
            ("OneHotEncoder room_type", OneHotEncoder(handle_unknown="error", sparse_output=False), ["room_type"]),
            ("FunctionTransformer as int", FunctionTransformer(lambda x: x.astype(int)), ["host_is_superhost", "host_has_profile_pic", "host_identity_verified", "instant_bookable"]),
            ("FunctionTransformer latitude", FunctionTransformer(lambda x: x / 90), ["latitude"]),
            ("FunctionTransformer longitude", FunctionTransformer(lambda x: x / 180), ["longitude"]),
            # Positional placeholder (output columns 17-18, dropped by the last step).
            ("SimpleImputer mean", SimpleImputer(missing_values=np.nan, strategy='mean'), ["host_total_listings_count", "bedrooms"]),
            # Impute first, then scale, so the surviving columns contain no NaN.
            ("MinMaxScaler", Pipeline([
                ("SimpleImputer", SimpleImputer(missing_values=np.nan, strategy='mean')),
                ("MinMaxScaler", MinMaxScaler()),
            ]), ["accommodates", "bedrooms", "host_total_listings_count"]),
            ("MultiLabelBinarizer", MyMultiLabelBinarizer(), ["amenities"]),
        ],
        remainder="passthrough",
    )),
    ("LastToDataFrameTransformer", LastToDataFrameTransformer()),
])

In [25]:
# Fit the whole pipeline on the listings, then transform the same frame.
transformed = trans.fit(df).transform(df)

# Show every column but cap the preview at 11 rows.
with pd.option_context('display.max_rows', 11, 'display.max_columns', None):
    display(transformed)

Unnamed: 0,host_is_superhost,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,bedrooms,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,Air conditioning,Bed linens,Breakfast,Cable TV,Carbon monoxide alarm,Coffee maker,Cooking basics,Dedicated workspace,Dishes and silverware,Dishwasher,Dryer,Elevator,Essentials,Extra pillows and blankets,Fire extinguisher,First aid kit,Free parking on premises,Free street parking,Garden or backyard,Gym,Hair dryer,Hangers,Heating,Host greets you,Hot water,Iron,Kitchen,Lock on bedroom door,Long term stays allowed,Luggage dropoff allowed,Microwave,Oven,Paid parking off premises,Patio or balcony,Pool,Private entrance,Refrigerator,Shampoo,Smoke alarm,Stove,TV,Washer,Wifi,Entire place,Hotel room,Private room,Shared room
0,0.0,0.000138,1.0,0.0,0.543185,0.012963,0.125,0.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.000138,1.0,1.0,0.543180,0.013029,0.125,0.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
2,0.0,0.000138,1.0,0.0,0.543124,0.012873,0.125,0.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
3,0.0,0.000138,1.0,1.0,0.542730,0.012810,0.125,0.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.000138,1.0,0.0,0.542833,0.012610,0.125,0.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279707,0.0,0.000138,1.0,1.0,0.542522,0.012857,0.125,0.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
279708,0.0,0.000138,1.0,1.0,0.543257,0.012956,0.125,0.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
279709,0.0,0.000138,1.0,1.0,0.543189,0.013051,0.125,0.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
279710,0.0,0.000138,1.0,1.0,0.542965,0.013229,0.125,0.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
