## One Hot Encoding 
Here we convert the categorical features of the dataset into numeric features so that the dataset can be used with various machine learning models.

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Read the gzip-compressed, cleaned listings CSV; its first column becomes the row index.
df = pd.read_csv(
    'data/listings_cleaned.csv.gz',
    compression='gzip',
    index_col=0,
)

# Print the column overview (dtypes and non-null counts).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62122 entries, 56229 to 1307795865634995863
Data columns (total 55 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   name                                          62122 non-null  object 
 1   host_id                                       62122 non-null  int64  
 2   host_since                                    62120 non-null  object 
 3   host_location                                 45702 non-null  object 
 4   host_response_time                            54776 non-null  object 
 5   host_response_rate                            54776 non-null  float64
 6   host_acceptance_rate                          57194 non-null  float64
 7   host_is_superhost                             62122 non-null  bool   
 8   host_listings_count                           62120 non-null  float64
 9   host_total_listings_count                     62

In [9]:
# Summarise only the columns that are neither float64 nor int64,
# i.e. the ones that still need encoding or conversion.
(
    df
    .select_dtypes(exclude=['float64', 'int64'])
    .info()
)

<class 'pandas.core.frame.DataFrame'>
Index: 62122 entries, 56229 to 1307795865634995863
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   name                    62122 non-null  object
 1   host_since              62120 non-null  object
 2   host_location           45702 non-null  object
 3   host_response_time      54776 non-null  object
 4   host_is_superhost       62122 non-null  bool  
 5   host_verifications      62120 non-null  object
 6   host_has_profile_pic    62122 non-null  bool  
 7   host_identity_verified  62122 non-null  bool  
 8   neighbourhood_cleansed  62122 non-null  object
 9   property_type           62122 non-null  object
 10  room_type               62122 non-null  object
 11  bathrooms_text          62016 non-null  object
 12  amenities               62122 non-null  object
 13  first_review            47956 non-null  object
 14  last_review             47956 non-null  o

In [13]:
# True/false flag columns that should become integer features.
boolean_columns = [
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'instant_bookable',
]

# Cast all flag columns to an integer dtype in a single assignment.
flags_as_int = df[boolean_columns].astype(int)
df[boolean_columns] = flags_as_int

# Preview the converted flag columns.
df[boolean_columns].head()

Unnamed: 0_level_0,host_is_superhost,host_has_profile_pic,host_identity_verified,instant_bookable
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
56229,0,1,1,0
62970,1,1,1,1
63948,0,1,1,0
66772,0,1,1,0
69198,0,1,1,0


In [15]:
# Re-list the columns that are still non-numeric after the flag conversion.
# int32 is excluded as well as int64 -- presumably because astype(int) can
# produce int32 on some platforms; confirm against your environment.
(
    df
    .select_dtypes(exclude=['float64', 'int64', 'int32'])
    .info()
)

<class 'pandas.core.frame.DataFrame'>
Index: 62122 entries, 56229 to 1307795865634995863
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   name                    62122 non-null  object
 1   host_since              62120 non-null  object
 2   host_location           45702 non-null  object
 3   host_response_time      54776 non-null  object
 4   host_verifications      62120 non-null  object
 5   neighbourhood_cleansed  62122 non-null  object
 6   property_type           62122 non-null  object
 7   room_type               62122 non-null  object
 8   bathrooms_text          62016 non-null  object
 9   amenities               62122 non-null  object
 10  first_review            47956 non-null  object
 11  last_review             47956 non-null  object
dtypes: object(12)
memory usage: 6.2+ MB


In [28]:
from sklearn.preprocessing import MultiLabelBinarizer

# Guard for re-running this cell: the column is dropped at the end, so on a
# second run it is recreated with empty lists (which yields no new columns).
if 'host_verifications' not in df.columns:
    df['host_verifications'] = [[] for _ in range(len(df))]


def _parse_verifications(value):
    """Parse a stringified list such as "['email', 'phone']" into a list of labels.

    Non-string values (e.g. NaN) become an empty list. Empty tokens are
    discarded so that "[]" gives [] instead of [''], which would otherwise
    create an indicator column whose name is the empty string.
    """
    if not isinstance(value, str):
        return []
    tokens = value.strip("[]").replace("'", "").split(", ")
    return [token for token in tokens if token]


# Convert 'host_verifications' from its string form to a list of labels
df["host_verifications"] = df["host_verifications"].apply(_parse_verifications)

# Multi-hot encode the label lists. The encoded frame must reuse df's index:
# pd.concat(axis=1) aligns on index labels, and a default RangeIndex would not
# match the listing-id index, producing misaligned rows padded with NaN.
mlb = MultiLabelBinarizer()
verification_encoded = pd.DataFrame(
    mlb.fit_transform(df["host_verifications"]),
    columns=mlb.classes_,
    index=df.index,
)

# Add the encoded columns and drop the original list column
row_count_before = len(df)
df = pd.concat([df, verification_encoded], axis=1)
df = df.drop(columns=["host_verifications"])

# Sanity check: attaching indicator columns must never add or remove rows
assert len(df) == row_count_before, "row count changed while adding encoded columns"

# Spot-check that an existing column still lines up with the listing-id index
df['first_review'].head()

56229    2010-10-26
62970    2011-01-04
63948    2011-09-19
66772    2011-01-14
69198    2012-07-01
Name: first_review, dtype: object