In [42]:
import pandas as pd
import numpy as np
import folium
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
import os

In [43]:
cwd = os.getcwd()
if os.path.basename(cwd) == "Notebooks":
    project_root = os.path.dirname(cwd)
else:
    project_root = cwd
path = os.path.join(project_root, "data", "Airbnb_DK_Processed_Data.csv")
df = pd.read_csv(path)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11579 entries, 0 to 11578
Data columns (total 52 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              11579 non-null  int64  
 1   name                            11579 non-null  object 
 2   description                     11353 non-null  object 
 3   host_id                         11579 non-null  int64  
 4   host_name                       11243 non-null  object 
 5   host_since                      11239 non-null  object 
 6   host_location                   9406 non-null   object 
 7   host_response_time              10120 non-null  object 
 8   host_response_rate              10120 non-null  object 
 9   host_acceptance_rate            10852 non-null  object 
 10  host_is_superhost               11490 non-null  object 
 11  host_listings_count             11239 non-null  float64
 12  host_total_listings_count       

**Decide on columns to use in A-Priori algo:**

In [45]:
cols = [
    "neighbourhood_cleansed",
    "property_type",
    "room_type",
    "host_is_superhost",
    "host_total_listings_count",
    "minimum_nights",
    "availability_365",
    "price",
    "estimated_occupancy_l365d",
    "estimated_revenue_l365d",
    "instant_bookable",
    "reviews_per_month",
    "dist_to_raadhus_km"
]
df_sub = df[cols].copy()

In [46]:
display(df_sub.head())

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,host_is_superhost,host_total_listings_count,minimum_nights,availability_365,price,estimated_occupancy_l365d,estimated_revenue_l365d,instant_bookable,reviews_per_month,dist_to_raadhus_km
0,Vesterbro-Kongens Enghave,Entire rental unit,Entire home/apt,f,1.0,7,18,900.0,0,0.0,f,0.15,1.033503
1,Vesterbro-Kongens Enghave,Entire rental unit,Entire home/apt,t,4.0,3,4,2282.0,18,41076.0,f,0.47,1.043656
2,sterbro,Entire rental unit,Entire home/apt,f,1.0,100,110,589.0,0,0.0,f,0.04,3.991337
3,Indre By,Entire condo,Entire home/apt,t,4.0,5,8,2223.0,130,288990.0,f,0.2,0.982367
4,Amager Vest,Entire condo,Entire home/apt,f,1.0,2,226,1760.0,120,211200.0,f,0.65,1.529996


**Handle missing values ["host_total_listings_count", "availability_365"] <- maybe add to preprocessing?**

In [47]:
df_sub = df_sub.dropna(subset=["host_total_listings_count", "availability_365"])
df_sub.fillna({"host_is_superhost": "f", "instant_bookable": "f"}, inplace=True)

**Create commercial label - Erstattes forhåbentlig af Fishers clustering label???**

Chatten har fundet på nedenstående som midlertidig løsning:

In [48]:
df_sub["is_commercial"] = (
    (df_sub["host_total_listings_count"] >= 5) |
    ((df_sub["room_type"] == "Entire home/apt") & (df_sub["minimum_nights"] >= 7)) |
    (df_sub["estimated_revenue_l365d"] > df_sub["estimated_revenue_l365d"].quantile(0.8))
)

Tænkte man også kunne prøve at indele det i tre kategorier: Casual, Side-hustle, Commercial-use 

Spørg de andre om det kan lade sig gøre og hvad de tænker om det

In [49]:
# Price bins (include_lowest=True to catch edge cases)
df_sub["price_bin"] = pd.cut(df_sub["price"], bins=[0, 800, 1500, df_sub["price"].max() + 1],
                             labels=["LowPrice", "MidPrice", "HighPrice"], include_lowest=True)

# Host listings count
df_sub["listings_bin"] = pd.cut(df_sub["host_total_listings_count"],
                                bins=[0, 1, 5, 101],
                                labels=["1listing", "2to5listings", "MoreThan5"], include_lowest=True)

# Availability (days bookable)
df_sub["avail_bin"] = pd.cut(df_sub["availability_365"], bins=[0, 100, 250, 366],
                             labels=["LowAvail", "MedAvail", "HighAvail"], include_lowest=True)

# Revenue (duplicates='drop' handles duplicate edges)
df_sub["revenue_bin"] = pd.qcut(df_sub["estimated_revenue_l365d"], q=3,
                                labels=["LowRevenue", "MedRevenue", "HighRevenue"], duplicates='drop')

# Occupancy
df_sub["occ_bin"] = pd.qcut(df_sub["estimated_occupancy_l365d"], q=3,
                            labels=["LowOcc", "MedOcc", "HighOcc"], duplicates='drop')

# Minimum nights
df_sub["minnights_bin"] = pd.cut(df_sub["minimum_nights"], bins=[0, 2, 6, 366],
                                 labels=["ShortStay", "MidStay", "LongStay"], include_lowest=True)

# Distance to city center
df_sub["dist_bin"] = pd.cut(df_sub["dist_to_raadhus_km"], bins=[0, 2, 5, 51],
                            labels=["Central", "MidDistance", "Outer"], include_lowest=True)

# Drop any remaining NaN values from binning
print(f"Rows before dropping NaN from bins: {len(df_sub)}")
df_sub = df_sub.dropna()
print(f"Rows after dropping NaN from bins: {len(df_sub)}")

Rows before dropping NaN from bins: 11239
Rows after dropping NaN from bins: 10969


In [50]:
# Check for NaN values in df_sub
print("NaN counts in df_sub:")
print(df_sub.isnull().sum())
print(f"\nTotal rows: {len(df_sub)}")
print(f"Rows with any NaN: {df_sub.isnull().any(axis=1).sum()}")

# Show rows with NaN
if df_sub.isnull().any().any():
    print("\nSample rows with NaN:")
    display(df_sub[df_sub.isnull().any(axis=1)].head())

NaN counts in df_sub:
neighbourhood_cleansed       0
property_type                0
room_type                    0
host_is_superhost            0
host_total_listings_count    0
minimum_nights               0
availability_365             0
price                        0
estimated_occupancy_l365d    0
estimated_revenue_l365d      0
instant_bookable             0
reviews_per_month            0
dist_to_raadhus_km           0
is_commercial                0
price_bin                    0
listings_bin                 0
avail_bin                    0
revenue_bin                  0
occ_bin                      0
minnights_bin                0
dist_bin                     0
dtype: int64

Total rows: 10969
Rows with any NaN: 0


In [51]:
df_final = df_sub[[
    "neighbourhood_cleansed",
    "property_type",
    "room_type",
    "host_is_superhost",
    "instant_bookable",
    "price_bin",
    "listings_bin",
    "avail_bin",
    "revenue_bin",
    "occ_bin",
    "minnights_bin",
    "dist_bin",
    "is_commercial"
]]
df_final["host_is_superhost"] = df_final["host_is_superhost"].map({"t": "superhost_True", "f": "superhost_False"})
df_final["instant_bookable"] = df_final["instant_bookable"].map({"t": "instant_bookable_True", "f": "instant_bookable_False"})
df_final["is_commercial"] = df_final["is_commercial"].map({True: "is_commercial_True", False: "is_commercial_False"})

display(df_final.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["host_is_superhost"] = df_final["host_is_superhost"].map({"t": "superhost_True", "f": "superhost_False"})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final["instant_bookable"] = df_final["instant_bookable"].map({"t": "instant_bookable_True", "f": "instant_bookable_False"})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,host_is_superhost,instant_bookable,price_bin,listings_bin,avail_bin,revenue_bin,occ_bin,minnights_bin,dist_bin,is_commercial
0,Vesterbro-Kongens Enghave,Entire rental unit,Entire home/apt,superhost_False,instant_bookable_False,MidPrice,1listing,LowAvail,LowRevenue,LowOcc,LongStay,Central,is_commercial_True
1,Vesterbro-Kongens Enghave,Entire rental unit,Entire home/apt,superhost_True,instant_bookable_False,HighPrice,2to5listings,LowAvail,MedRevenue,MedOcc,MidStay,Central,is_commercial_False
2,sterbro,Entire rental unit,Entire home/apt,superhost_False,instant_bookable_False,LowPrice,1listing,MedAvail,LowRevenue,LowOcc,LongStay,MidDistance,is_commercial_True
3,Indre By,Entire condo,Entire home/apt,superhost_True,instant_bookable_False,HighPrice,2to5listings,LowAvail,HighRevenue,HighOcc,MidStay,Central,is_commercial_True
4,Amager Vest,Entire condo,Entire home/apt,superhost_False,instant_bookable_False,HighPrice,1listing,MedAvail,HighRevenue,HighOcc,ShortStay,Central,is_commercial_True


**Converting into transaction format**

In [52]:
df_final = df_final.astype(str)
df_final = df_final.apply(lambda x: x.str.replace(" ", ""))  # clean spaces

In [53]:


transactions = df_final.values.tolist()
encoder = TransactionEncoder()
te_ary = encoder.fit(transactions).transform(transactions)
df_apriori = pd.DataFrame(te_ary, columns=encoder.columns_)


In [54]:
# Check what columns we have after encoding
print("Columns in df_apriori:")
print(df_apriori.columns.tolist())
print(f"\nDataFrame shape: {df_apriori.shape}")
print(f"\nSample of encoded data:")
display(df_apriori.head())

Columns in df_apriori:
['1listing', '2to5listings', 'AmagerVest', 'Amagerst', 'Bispebjerg', 'Boat', 'Brnshj-Husum', 'Camper/RV', 'Casaparticular', 'Central', 'Entirebungalow', 'Entirecabin', 'Entirecondo', 'Entireguesthouse', 'Entireguestsuite', 'Entirehome', 'Entirehome/apt', 'Entireloft', 'Entireplace', 'Entirerentalunit', 'Entireservicedapartment', 'Entiretownhouse', 'Entirevacationhome', 'Entirevilla', 'Frederiksberg', 'HighAvail', 'HighOcc', 'HighPrice', 'HighRevenue', 'Hotelroom', 'Houseboat', 'Hut', 'IndreBy', 'LongStay', 'LowAvail', 'LowOcc', 'LowPrice', 'LowRevenue', 'MedAvail', 'MedOcc', 'MedRevenue', 'MidDistance', 'MidPrice', 'MidStay', 'MoreThan5', 'Nrrebro', 'Outer', 'Privateroom', 'Privateroominbarn', 'Privateroominbedandbreakfast', 'Privateroominboat', 'Privateroominbungalow', 'Privateroomincabin', 'Privateroomincasaparticular', 'Privateroomincondo', 'Privateroominguesthouse', 'Privateroominguestsuite', 'Privateroominhome', 'Privateroominhostel', 'Privateroominloft', 'P

Unnamed: 0,1listing,2to5listings,AmagerVest,Amagerst,Bispebjerg,Boat,Brnshj-Husum,Camper/RV,Casaparticular,Central,...,Valby,Vanlse,Vesterbro-KongensEnghave,instant_bookable_False,instant_bookable_True,is_commercial_False,is_commercial_True,sterbro,superhost_False,superhost_True
0,True,False,False,False,False,False,False,False,False,True,...,False,False,True,True,False,False,True,False,True,False
1,False,True,False,False,False,False,False,False,False,True,...,False,False,True,True,False,True,False,False,False,True
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,True,True,True,False
3,False,True,False,False,False,False,False,False,False,True,...,False,False,False,True,False,False,True,False,False,True
4,True,False,True,False,False,False,False,False,False,True,...,False,False,False,True,False,False,True,False,True,False


In [57]:
# Run Apriori
frequent_itemsets = apriori(df_apriori, min_support=0.05, use_colnames=True)
print(f"Found {len(frequent_itemsets)} frequent itemsets")

# Generate rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)
print(f"Found {len(rules)} rules")

# Show top rules
print("\nTop 10 rules by lift:")
display(rules.sort_values("lift", ascending=False)[["antecedents", "consequents", "support", "confidence", "lift"]].head(10))

# Show rules predicting commercial
commercial_rules = rules[rules["consequents"].astype(str).str.contains("is_commercial_True")]
print(f"\nRules predicting commercial: {len(commercial_rules)}")
if len(commercial_rules) > 0:
    display(commercial_rules.sort_values("lift", ascending=False)[["antecedents", "consequents", "confidence", "lift"]].head(10))

Found 12043 frequent itemsets
Found 322404 rules

Top 10 rules by lift:
Found 322404 rules

Top 10 rules by lift:


Unnamed: 0,antecedents,consequents,support,confidence,lift
81,(Privateroominrentalunit),(Privateroom),0.051144,1.0,11.261807
80,(Privateroom),(Privateroominrentalunit),0.051144,0.575975,11.261807
88312,"(Central, HighRevenue, is_commercial_True)","(HighOcc, IndreBy, Entirehome/apt)",0.052694,0.447368,7.291507
88303,"(HighOcc, IndreBy, Entirehome/apt)","(Central, HighRevenue, is_commercial_True)",0.052694,0.858841,7.291507
88321,"(HighOcc, IndreBy)","(Central, HighRevenue, is_commercial_True, Ent...",0.052694,0.772727,7.081074
88294,"(Central, HighRevenue, is_commercial_True, Ent...","(HighOcc, IndreBy)",0.052694,0.482874,7.081074
27618,"(Central, HighRevenue, is_commercial_True)","(HighOcc, IndreBy)",0.056614,0.48065,7.048465
27623,"(HighOcc, IndreBy)","(Central, HighRevenue, is_commercial_True)",0.056614,0.830214,7.048465
88297,"(Central, HighOcc, is_commercial_True, Entireh...","(HighRevenue, IndreBy)",0.052694,0.558454,6.394241
88318,"(HighRevenue, IndreBy)","(Central, HighOcc, is_commercial_True, Entireh...",0.052694,0.60334,6.394241



Rules predicting commercial: 2106


Unnamed: 0,antecedents,consequents,confidence,lift
88303,"(HighOcc, IndreBy, Entirehome/apt)","(Central, HighRevenue, is_commercial_True)",0.858841,7.291507
88321,"(HighOcc, IndreBy)","(Central, HighRevenue, is_commercial_True, Ent...",0.772727,7.081074
27623,"(HighOcc, IndreBy)","(Central, HighRevenue, is_commercial_True)",0.830214,7.048465
88318,"(HighRevenue, IndreBy)","(Central, HighOcc, is_commercial_True, Entireh...",0.60334,6.394241
88314,"(Central, HighOcc, Entirehome/apt)","(HighRevenue, is_commercial_True, IndreBy)",0.441221,6.293572
27621,"(HighRevenue, IndreBy)","(Central, HighOcc, is_commercial_True)",0.648225,6.209943
88298,"(HighRevenue, IndreBy, Entirehome/apt)","(Central, HighOcc, is_commercial_True)",0.642222,6.152433
88300,"(HighRevenue, HighOcc, IndreBy)","(Central, is_commercial_True, Entirehome/apt)",0.803894,6.136337
88284,"(HighRevenue, HighOcc, IndreBy, Entirehome/apt)","(Central, is_commercial_True)",0.870482,6.043238
88265,"(HighOcc, HighPrice)","(Central, HighRevenue, is_commercial_True, Ent...",0.655172,6.003831
