# Feature Engineering for Dashboard Readiness




## Objective

* The objective of this stage is to create additional, meaningful features from the cleaned Airbnb datasets that make analysis and dashboard creation easier.

* These features will support clearer comparisons between Berlin and Bangkok and improve the usability of the data in visualization tools.


## Derived Pricing Categories



In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

BASE_PATH = "/content/drive/MyDrive/AlmaBetter/Module_4/data"

# Load cleaned datasets
berlin_listings = pd.read_csv(f"{BASE_PATH}/berlin/listings_clean.csv")
berlin_reviews = pd.read_csv(f"{BASE_PATH}/berlin/reviews_clean.csv")
berlin_neighbourhoods = pd.read_csv(f"{BASE_PATH}/berlin/neighbourhoods_clean.csv")

bangkok_listings = pd.read_csv(f"{BASE_PATH}/bangkok/listings_clean.csv")
bangkok_reviews = pd.read_csv(f"{BASE_PATH}/bangkok/reviews_clean.csv")
bangkok_neighbourhoods = pd.read_csv(f"{BASE_PATH}/bangkok/neighbourhoods_clean.csv")


In [9]:
# define price bins and labels
price_bins= [0,50,100,200,float('inf')]
price_labels= ["low","medium","premium","luxury"]
berlin_listings["price_category"]=pd.cut(berlin_listings["price"],bins=price_bins,labels=price_labels,right=False)
bangkok_listings["price_category"] = pd.cut(
    bangkok_listings["price"],
    bins=price_bins,
    labels=price_labels,
    right=False
)
print("berlin_price_category:", berlin_listings["price_category"])
print("bangkok_price_category:", bangkok_listings["price_category"])



berlin_price_category: 0       premium
1       premium
2        medium
3           low
4        luxury
         ...   
9259     medium
9260     medium
9261     medium
9262     medium
9263        low
Name: price_category, Length: 9264, dtype: category
Categories (4, object): ['low' < 'medium' < 'premium' < 'luxury']
bangkok_price_category: 0        luxury
1        luxury
2        luxury
3        luxury
4        luxury
          ...  
23268    luxury
23269    luxury
23270    luxury
23271    luxury
23272    luxury
Name: price_category, Length: 23273, dtype: category
Categories (4, object): ['low' < 'medium' < 'premium' < 'luxury']


## Host Classification



In [15]:
# Classify hosts as single-listing or multi-listing
berlin_listings["host_listing_count"] = (
    berlin_listings.groupby("host_id")["host_id"].transform("count")
)

berlin_listings["host_type"] = berlin_listings["host_listing_count"].apply(
    lambda x: "single" if x == 1 else "multi"
)

bangkok_listings["host_listing_count"] = (
    bangkok_listings.groupby("host_id")["host_id"].transform("count")
)

bangkok_listings["host_type"]= bangkok_listings["host_listing_count"].apply(
    lambda x: "single" if x == 1 else "multi"
)
print("berlin_host_type:", berlin_listings["host_type"])
print("bangkok_host_type:", bangkok_listings["host_type"])


berlin_host_type: 0       single
1       single
2        multi
3       single
4       single
         ...  
9259     multi
9260     multi
9261     multi
9262     multi
9263    single
Name: host_type, Length: 9264, dtype: object
bangkok_host_type: 0        single
1        single
2        single
3         multi
4        single
          ...  
23268     multi
23269     multi
23270     multi
23271     multi
23272     multi
Name: host_type, Length: 23273, dtype: object


## Aggregated Metrics



## Output Datasets