In [1]:
import pandas as pd
import numpy as np
import folium
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
import os

In [2]:
cwd = os.getcwd()
if os.path.basename(cwd) == "Notebooks":
    project_root = os.path.dirname(cwd)
else:
    project_root = cwd
path = os.path.join(project_root, "data", "Airbnb_DK_Processed_Data.csv")
df = pd.read_csv(path)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11579 entries, 0 to 11578
Data columns (total 52 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              11579 non-null  int64  
 1   name                            11579 non-null  object 
 2   description                     11353 non-null  object 
 3   host_id                         11579 non-null  int64  
 4   host_name                       11243 non-null  object 
 5   host_since                      11239 non-null  object 
 6   host_location                   9406 non-null   object 
 7   host_response_time              10120 non-null  object 
 8   host_response_rate              10120 non-null  object 
 9   host_acceptance_rate            10852 non-null  object 
 10  host_is_superhost               11490 non-null  object 
 11  host_listings_count             11239 non-null  float64
 12  host_total_listings_count       

**Decide on columns to use in A-Priori algo:**

In [4]:
cols = [
    "neighbourhood_cleansed",
    "property_type",
    "room_type",
    "host_is_superhost",
    "host_total_listings_count",
    "minimum_nights",
    "availability_365",
    "price",
    "estimated_occupancy_l365d",
    "estimated_revenue_l365d",
    "instant_bookable",
    "reviews_per_month",
    "dist_to_raadhus_km"
]
df_sub = df[cols].copy()

In [5]:
display(df_sub.head())

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,host_is_superhost,host_total_listings_count,minimum_nights,availability_365,price,estimated_occupancy_l365d,estimated_revenue_l365d,instant_bookable,reviews_per_month,dist_to_raadhus_km
0,Vesterbro-Kongens Enghave,Entire rental unit,Entire home/apt,f,1.0,7,18,900.0,0,0.0,f,0.15,1.033503
1,Vesterbro-Kongens Enghave,Entire rental unit,Entire home/apt,t,4.0,3,4,2282.0,18,41076.0,f,0.47,1.043656
2,sterbro,Entire rental unit,Entire home/apt,f,1.0,100,110,589.0,0,0.0,f,0.04,3.991337
3,Indre By,Entire condo,Entire home/apt,t,4.0,5,8,2223.0,130,288990.0,f,0.2,0.982367
4,Amager Vest,Entire condo,Entire home/apt,f,1.0,2,226,1760.0,120,211200.0,f,0.65,1.529996


**Handle missing values ["host_total_listings_count", "availability_365"] <- maybe add to preprocessing?**

In [6]:
df_sub = df_sub.dropna(subset=["host_total_listings_count", "availability_365"])
df_sub.fillna({"host_is_superhost": "f", "instant_bookable": "f"}, inplace=True)

**Create commercial label - Erstattes forhåbentlig af Fishers clustering label???**

Chatten har fundet på nedenstående som midlertidig løsning:

In [7]:
df_sub["is_commercial"] = (
    (df_sub["host_total_listings_count"] >= 2) 
)

In [8]:
# categorical_cols = [
#     "neighbourhood_cleansed",
#     "property_type",
#     "room_type",
#     "host_is_superhost",
#     "instant_bookable"
# ]

# df_encoded = pd.get_dummies(
#     df_sub,
#     columns=categorical_cols,
#     drop_first=False,   # keep ALL categories
#     dtype=int           # 0/1 integers
# )

# display(df_encoded.head())

In [9]:
# ex_cols = ["reviews_per_month", "dist_to_raadhus_km"]
# kmeans = KMeans(n_clusters=2, random_state=42)
# holder = df_encoded.drop(columns=ex_cols)
# df_sub["is_commercial"] = kmeans.fit_predict(holder)

# df_sub['is_commercial'] = df_sub['is_commercial'].map({0: 'is_commercial_False', 1: 'is_commercial_True'})
# df_sub['is_commercial'].value_counts()

Tænkte man også kunne prøve at indele det i tre kategorier: Casual, Side-hustle, Commercial-use 

Spørg de andre om det kan lade sig gøre og hvad de tænker om det

In [10]:
# Price bins (include_lowest=True to catch edge cases)
df_sub["price_bin"] = pd.cut(df_sub["price"], bins=[0, 800, 1500, df_sub["price"].max() + 1],
                             labels=["LowPrice", "MidPrice", "HighPrice"], include_lowest=True)

# Host listings count
df_sub["listings_bin"] = pd.cut(df_sub["host_total_listings_count"],
                                bins=[0, 1, 5, 101],
                                labels=["1listing", "2to5listings", "MoreThan5"], include_lowest=True)

# Availability (days bookable)
df_sub["avail_bin"] = pd.cut(df_sub["availability_365"], bins=[0, 100, 250, 366],
                             labels=["LowAvail", "MedAvail", "HighAvail"], include_lowest=True)

# Revenue (duplicates='drop' handles duplicate edges)
df_sub["revenue_bin"] = pd.qcut(df_sub["estimated_revenue_l365d"], q=3,
                                labels=["LowRevenue", "MedRevenue", "HighRevenue"], duplicates='drop')

# Occupancy
df_sub["occ_bin"] = pd.qcut(df_sub["estimated_occupancy_l365d"], q=3,
                            labels=["LowOcc", "MedOcc", "HighOcc"], duplicates='drop')

# Minimum nights
df_sub["minnights_bin"] = pd.cut(df_sub["minimum_nights"], bins=[0, 2, 6, 366],
                                 labels=["ShortStay", "MidStay", "LongStay"], include_lowest=True)

# Distance to city center
df_sub["dist_bin"] = pd.cut(df_sub["dist_to_raadhus_km"], bins=[0, 2, 5, 51],
                            labels=["Central", "MidDistance", "Outer"], include_lowest=True)

# Drop any remaining NaN values from binning
print(f"Rows before dropping NaN from bins: {len(df_sub)}")
df_sub = df_sub.dropna()
print(f"Rows after dropping NaN from bins: {len(df_sub)}")

Rows before dropping NaN from bins: 11239
Rows after dropping NaN from bins: 10969


In [11]:
df_sub = df_sub.drop(columns=['listings_bin'])


In [12]:
# Check for NaN values in df_sub
print("NaN counts in df_sub:")
print(df_sub.isnull().sum())
print(f"\nTotal rows: {len(df_sub)}")
print(f"Rows with any NaN: {df_sub.isnull().any(axis=1).sum()}")

# Show rows with NaN
if df_sub.isnull().any().any():
    print("\nSample rows with NaN:")
    display(df_sub[df_sub.isnull().any(axis=1)].head())

NaN counts in df_sub:
neighbourhood_cleansed       0
property_type                0
room_type                    0
host_is_superhost            0
host_total_listings_count    0
minimum_nights               0
availability_365             0
price                        0
estimated_occupancy_l365d    0
estimated_revenue_l365d      0
instant_bookable             0
reviews_per_month            0
dist_to_raadhus_km           0
is_commercial                0
price_bin                    0
avail_bin                    0
revenue_bin                  0
occ_bin                      0
minnights_bin                0
dist_bin                     0
dtype: int64

Total rows: 10969
Rows with any NaN: 0


In [13]:
df_final = df_sub[[
    "neighbourhood_cleansed",
    "property_type",
    "room_type",
    "host_is_superhost",
    "instant_bookable",
    "price_bin",
    # "listings_bin",
    "avail_bin",
    "revenue_bin",
    "occ_bin",
    "minnights_bin",
    "dist_bin",
    "is_commercial"
]]
df_final["host_is_superhost"] = df_final["host_is_superhost"].map({"t": "superhost_True", "f": "superhost_False"})
df_final["instant_bookable"] = df_final["instant_bookable"].map({"t": "instant_bookable_True", "f": "instant_bookable_False"})
df_final["is_commercial"] = df_final["is_commercial"].map({True: "is_commercial_True", False: "is_commercial_False"})

display(df_final.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["host_is_superhost"] = df_final["host_is_superhost"].map({"t": "superhost_True", "f": "superhost_False"})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["instant_bookable"] = df_final["instant_bookable"].map({"t": "instant_bookable_True", "f": "instant_bookable_False"})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,host_is_superhost,instant_bookable,price_bin,avail_bin,revenue_bin,occ_bin,minnights_bin,dist_bin,is_commercial
0,Vesterbro-Kongens Enghave,Entire rental unit,Entire home/apt,superhost_False,instant_bookable_False,MidPrice,LowAvail,LowRevenue,LowOcc,LongStay,Central,is_commercial_False
1,Vesterbro-Kongens Enghave,Entire rental unit,Entire home/apt,superhost_True,instant_bookable_False,HighPrice,LowAvail,MedRevenue,MedOcc,MidStay,Central,is_commercial_True
2,sterbro,Entire rental unit,Entire home/apt,superhost_False,instant_bookable_False,LowPrice,MedAvail,LowRevenue,LowOcc,LongStay,MidDistance,is_commercial_False
3,Indre By,Entire condo,Entire home/apt,superhost_True,instant_bookable_False,HighPrice,LowAvail,HighRevenue,HighOcc,MidStay,Central,is_commercial_True
4,Amager Vest,Entire condo,Entire home/apt,superhost_False,instant_bookable_False,HighPrice,MedAvail,HighRevenue,HighOcc,ShortStay,Central,is_commercial_False


**Converting into transaction format**

In [14]:
df_final = df_final.astype(str)
df_final = df_final.apply(lambda x: x.str.replace(" ", ""))  # clean spaces

In [15]:


transactions = df_final.values.tolist()
encoder = TransactionEncoder()
te_ary = encoder.fit(transactions).transform(transactions)
df_apriori = pd.DataFrame(te_ary, columns=encoder.columns_)


In [16]:
# Check what columns we have after encoding
print("Columns in df_apriori:")
print(df_apriori.columns.tolist())
print(f"\nDataFrame shape: {df_apriori.shape}")
print(f"\nSample of encoded data:")
display(df_apriori.head())

Columns in df_apriori:
['AmagerVest', 'Amagerst', 'Bispebjerg', 'Boat', 'Brnshj-Husum', 'Camper/RV', 'Casaparticular', 'Central', 'Entirebungalow', 'Entirecabin', 'Entirecondo', 'Entireguesthouse', 'Entireguestsuite', 'Entirehome', 'Entirehome/apt', 'Entireloft', 'Entireplace', 'Entirerentalunit', 'Entireservicedapartment', 'Entiretownhouse', 'Entirevacationhome', 'Entirevilla', 'Frederiksberg', 'HighAvail', 'HighOcc', 'HighPrice', 'HighRevenue', 'Hotelroom', 'Houseboat', 'Hut', 'IndreBy', 'LongStay', 'LowAvail', 'LowOcc', 'LowPrice', 'LowRevenue', 'MedAvail', 'MedOcc', 'MedRevenue', 'MidDistance', 'MidPrice', 'MidStay', 'Nrrebro', 'Outer', 'Privateroom', 'Privateroominbarn', 'Privateroominbedandbreakfast', 'Privateroominboat', 'Privateroominbungalow', 'Privateroomincabin', 'Privateroomincasaparticular', 'Privateroomincondo', 'Privateroominguesthouse', 'Privateroominguestsuite', 'Privateroominhome', 'Privateroominhostel', 'Privateroominloft', 'Privateroominrentalunit', 'Privateroominse

Unnamed: 0,AmagerVest,Amagerst,Bispebjerg,Boat,Brnshj-Husum,Camper/RV,Casaparticular,Central,Entirebungalow,Entirecabin,...,Valby,Vanlse,Vesterbro-KongensEnghave,instant_bookable_False,instant_bookable_True,is_commercial_False,is_commercial_True,sterbro,superhost_False,superhost_True
0,False,False,False,False,False,False,False,True,False,False,...,False,False,True,True,False,True,False,False,True,False
1,False,False,False,False,False,False,False,True,False,False,...,False,False,True,True,False,False,True,False,False,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,True,False,True,True,False
3,False,False,False,False,False,False,False,True,False,False,...,False,False,False,True,False,False,True,False,False,True
4,True,False,False,False,False,False,False,True,False,False,...,False,False,False,True,False,True,False,False,True,False


In [17]:
# Run Apriori
frequent_itemsets = apriori(df_apriori, min_support=0.05, use_colnames=True)
print(f"Found {len(frequent_itemsets)} frequent itemsets")

# Generate rules with lift > 1 (positive association)
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=3.0)
print(f"Found {len(rules)} rules")

if len(rules) > 0:
    # Show top rules
    print("\nTop 10 rules by lift:")
    display(rules.sort_values("lift", ascending=False)[["antecedents", "consequents", "support", "confidence", "lift"]].head(10))
    
    # Show rules predicting commercial
    commercial_rules = rules[rules["consequents"].astype(str).str.contains("is_commercial_True")]
    print(f"\nRules predicting commercial: {len(commercial_rules)}")
    if len(commercial_rules) > 0:
        display(commercial_rules.sort_values("confidence", ascending=False)[["antecedents", "support", "consequents", "confidence", "lift"]].head(10))
    else:
        print("No rules found predicting commercial. Try lowering min_threshold or min_support.")
else:
    print("No rules found! Try lowering min_support or min_threshold.")

Found 6899 frequent itemsets
Found 6322 rules

Top 10 rules by lift:


Unnamed: 0,antecedents,consequents,support,confidence,lift
2,(Privateroominrentalunit),(Privateroom),0.051144,1.0,11.261807
3,(Privateroom),(Privateroominrentalunit),0.051144,0.575975,11.261807
239,"(HighRevenue, Central)","(Entirehome/apt, HighOcc, IndreBy)",0.056888,0.340611,5.55151
236,"(Entirehome/apt, HighOcc, IndreBy)","(HighRevenue, Central)",0.056888,0.927192,5.55151
233,"(Entirehome/apt, HighOcc, Central)","(IndreBy, HighRevenue)",0.056888,0.476336,5.453996
242,"(IndreBy, HighRevenue)","(Entirehome/apt, HighOcc, Central)",0.056888,0.651357,5.453996
45,"(HighRevenue, Central)","(IndreBy, HighOcc)",0.061719,0.369541,5.419118
42,"(IndreBy, HighOcc)","(HighRevenue, Central)",0.061719,0.90508,5.419118
234,"(Entirehome/apt, HighRevenue, Central)","(IndreBy, HighOcc)",0.056888,0.367491,5.389052
241,"(IndreBy, HighOcc)","(Entirehome/apt, HighRevenue, Central)",0.056888,0.834225,5.389052



Rules predicting commercial: 36


Unnamed: 0,antecedents,support,consequents,confidence,lift
114,"(HighPrice, HighOcc)",0.052876,"(is_commercial_True, HighRevenue)",0.606061,3.753743
431,"(Entirehome/apt, HighPrice, HighOcc)",0.052329,"(is_commercial_True, HighRevenue)",0.603575,3.738349
435,"(HighPrice, HighOcc)",0.052329,"(Entirehome/apt, is_commercial_True, HighRevenue)",0.599791,4.23366
63,"(IndreBy, HighPrice)",0.051144,"(is_commercial_True, Central)",0.539942,3.018668
47,"(HighOcc, Central)",0.069286,"(is_commercial_True, HighRevenue)",0.512129,3.171963
248,"(Entirehome/apt, HighOcc, Central)",0.060717,"(is_commercial_True, HighRevenue)",0.508397,3.148846
166,"(HighRevenue, superhost_True)",0.059714,"(is_commercial_True, HighOcc)",0.459327,3.108177
149,"(ShortStay, HighRevenue)",0.06646,"(is_commercial_True, HighOcc)",0.453923,3.07161
251,"(HighOcc, Central)",0.060717,"(Entirehome/apt, is_commercial_True, HighRevenue)",0.448787,3.16779
366,"(HighOcc, Entirerentalunit)",0.06646,"(Entirehome/apt, is_commercial_True, HighRevenue)",0.426067,3.007416


# The following extracts the data from the relevant item sets given an index from the table above

In [18]:
# --- Function to extract rule features and generate subsets ---
def get_rule_subset(df, commercial_rules, rule_index):
    """
    Given a DataFrame, a rules DataFrame, and a rule index (sorted by confidence),
    this function extracts the rule's antecedents and consequents, and creates
    subsets of the original df corresponding to each feature in the antecedents.
    
    Returns:
        subset_dict: dictionary of feature_name -> subset of df
        rule_info: tuple (antecedents, consequents)
    """
    # Select the rule by index (sorted by confidence)
    rule = commercial_rules.sort_values("confidence", ascending=False).iloc[rule_index]
    antecedents = list(rule["antecedents"])
    consequents = list(rule["consequents"])
    
    subset_dict = {}
    
    # Map known bin names to filtering conditions
    for feature in antecedents:
        # Example: price bins
        if "HighPrice" in str(feature):
            subset_dict["HighPrice"] = df[df["price"] > 1500].copy()
        elif "MidPrice" in str(feature):
            subset_dict["MidPrice"] = df[(df["price"] > 800) & (df["price"] <= 1500)].copy()
        elif "LowPrice" in str(feature):
            subset_dict["LowPrice"] = df[df["price"] <= 800].copy()

        # Host listings
        elif "1listing" in str(feature):
            subset_dict["1listing"] = df[df["host_total_listings_count"] <= 1].copy()
        elif "2to5listings" in str(feature):
            subset_dict["2to5listings"] = df[(df["host_total_listings_count"] > 1) & (df["host_total_listings_count"] <= 5)].copy()
        elif "MoreThan5" in str(feature):
            subset_dict["MoreThan5"] = df[df["host_total_listings_count"] > 5].copy()

        # Availability
        elif "LowAvail" in str(feature):
            subset_dict["LowAvail"] = df[df["availability_365"] <= 100].copy()
        elif "MedAvail" in str(feature):
            subset_dict["MedAvail"] = df[(df["availability_365"] > 100) & (df["availability_365"] <= 250)].copy()
        elif "HighAvail" in str(feature):
            subset_dict["HighAvail"] = df[df["availability_365"] > 250].copy()

        # Revenue
        elif "LowRevenue" in str(feature):
            q1 = df["estimated_revenue_l365d"].quantile(1/3)
            subset_dict["LowRevenue"] = df[df["estimated_revenue_l365d"] <= q1].copy()
        elif "MedRevenue" in str(feature):
            q1, q2 = df["estimated_revenue_l365d"].quantile([1/3, 2/3])
            subset_dict["MedRevenue"] = df[(df["estimated_revenue_l365d"] > q1) & (df["estimated_revenue_l365d"] <= q2)].copy()
        elif "HighRevenue" in str(feature):
            q2 = df["estimated_revenue_l365d"].quantile(2/3)
            subset_dict["HighRevenue"] = df[df["estimated_revenue_l365d"] > q2].copy()

        # Occupancy
        elif "LowOcc" in str(feature):
            q1 = df["estimated_occupancy_l365d"].quantile(1/3)
            subset_dict["LowOcc"] = df[df["estimated_occupancy_l365d"] <= q1].copy()
        elif "MedOcc" in str(feature):
            q1, q2 = df["estimated_occupancy_l365d"].quantile([1/3, 2/3])
            subset_dict["MedOcc"] = df[(df["estimated_occupancy_l365d"] > q1) & (df["estimated_occupancy_l365d"] <= q2)].copy()
        elif "HighOcc" in str(feature):
            q2 = df["estimated_occupancy_l365d"].quantile(2/3)
            subset_dict["HighOcc"] = df[df["estimated_occupancy_l365d"] > q2].copy()

        # Minimum nights
        elif "ShortStay" in str(feature):
            subset_dict["ShortStay"] = df[df["minimum_nights"] <= 2].copy()
        elif "MidStay" in str(feature):
            subset_dict["MidStay"] = df[(df["minimum_nights"] > 2) & (df["minimum_nights"] <= 6)].copy()
        elif "LongStay" in str(feature):
            subset_dict["LongStay"] = df[df["minimum_nights"] > 6].copy()

        # Distance to city center
        elif "Central" in str(feature):
            subset_dict["Central"] = df[df["dist_to_raadhus_km"] <= 2].copy()
        elif "MidDistance" in str(feature):
            subset_dict["MidDistance"] = df[(df["dist_to_raadhus_km"] > 2) & (df["dist_to_raadhus_km"] <= 5)].copy()
        elif "Outer" in str(feature):
            subset_dict["Outer"] = df[df["dist_to_raadhus_km"] > 5].copy()
        
        else:
            print(f"Warning: feature '{feature}' not recognized")

    return subset_dict, (antecedents, consequents)

rule_index = 2
subsets, (antecedents, consequents) = get_rule_subset(df, commercial_rules, rule_index)
print("Antecedents:", antecedents)
print("Consequents:", consequents)



Antecedents: ['HighPrice', 'HighOcc']
Consequents: ['Entirehome/apt', 'is_commercial_True', 'HighRevenue']
