# IMPORTING ALL DEPENDENCIES I NEED FOR THIS PROJECT

In [None]:
import numpy as np
import pandas as pd
import missingno as msno 
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import statsmodels.graphics.correlation as sgc
from statsmodels.graphics.gofplots import qqplot
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import OLSInfluence
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# CONNECTION TO MY DATABASE ON POSTGRES

This is the connection to my PostgreSQL database; the actual connection function lives in the file **db_connect.py**.

In [None]:
# Import necessary packages
import pandas as pd
import sys
sys.path.append('..')  # Go up one level to the root folder
from db_connect import connect_to_db

# Step 1: Connect to the database
# NOTE: conn is intentionally left open here in case later cells reuse it.
conn = connect_to_db()

# Step 2: Create a cursor and run a query
query = "SELECT * FROM airbnbs_nairobi.listing_data_yearly;"
cursor = conn.cursor()
try:
    cursor.execute(query)

    # Step 3: Fetch results and capture the column names.
    # cursor.description must be read before the cursor is closed.
    rows = cursor.fetchall()
    column_names = [desc[0] for desc in cursor.description]
finally:
    # Always release the cursor, even if the query or fetch fails
    cursor.close()

df = pd.DataFrame(rows, columns=column_names)

# Step 4: Display the data
print("Connection successful! Previewing data:")
display(df.head())

In [None]:
# Show the loaded DataFrame as this cell's output
df

# Data Exploration: an attempt at understanding my data

In [None]:
# Print a summary of the DataFrame's columns, dtypes and non-null counts
df.info()

In [None]:
# Summary statistics of the DataFrame (pandas describes numeric columns by default)
df.describe()

# Data Cleaning

In [None]:
import re

def to_snake_case(name):
    """Return *name* normalised into a snake_case-style column label.

    The label is lowercased, spaces become underscores, the characters
    ``(``, ``%`` and ``)`` are dropped, and any run of underscores is
    collapsed into a single one.
    """
    underscored = name.lower().replace(' ', '_')
    # Strip parentheses and percent signs
    stripped = re.sub(r'[(%\)]+', '', underscored)
    # Squash repeated underscores left behind by the steps above
    return re.sub(r'_+', '_', stripped)

# Normalise every column label with to_snake_case
df.columns = list(map(to_snake_case, df.columns))

print(df.columns)

In [None]:
# Visualise where values are missing across the DataFrame
msno.matrix(df)

# Pricing & Revenue Analysis

## What's the correlation between listing type and average nightly rate?
For this analysis, room type is used as the primary classification variable, as it provides a more general and consistent categorization of listings. In contrast, listing type contains a large number of highly specific categories, which can introduce unnecessary complexity and reduce comparability across observations.

In [None]:
# Distinct listing_type values; the cell output is how many there are
listing_type = df['listing_type'].unique()
len(listing_type)

In [None]:
# Distinct room_type values; the array itself is the cell output
room_type = df['room_type'].unique()
len(room_type)
room_type

While **listing types** consist of **29 unique categories** across the dataset, **room types** are limited to **three broad and representative categories** that better capture the nature of the accommodation. These include **Private Room**, **Entire Home/Apartment**, and **Hotel Room**, the latter of which appears only once in the dataset.

#### Distribution of Room Type across the entire dataset

In [None]:
# Number of listings per room type, drawn as a pie chart
room_type_count = df['room_type'].value_counts()

plt.figure(figsize=(10, 8))
plt.pie(
    room_type_count,
    labels=room_type_count.index,
    colors=("#0FB9B9", "#E29015", "#F3013E"),
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 11},
)
plt.title(
    "Room Type Distribution Across the Dataset",
    fontsize=12,
    fontweight='bold',
)
plt.tight_layout()
plt.show()

In [None]:
# Mean avg_rate_per_year for each room type, highest first, to 2 decimals
listing_type_vs_avg_rate_per_night = (
    df.groupby('room_type')['avg_rate_per_year']
      .mean()
      .sort_values(ascending=False)
      .round(2)
)

plt.figure(figsize=(10, 6))
listing_type_vs_avg_rate_per_night.plot.bar(
    color=["#1AF64D", "#10EFCD", "#F60F0FF3"],
)
plt.title(
    'Listing Type correlation to Average Rate Per Night',
    fontsize=14,
    fontweight='bold',
)
plt.xlabel('Type of Listing', fontsize=14)
plt.ylabel('Average rate per Night', fontsize=14)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

# Show the underlying numbers below the chart
listing_type_vs_avg_rate_per_night

Based on the analysis, **hotel rooms** command the highest average nightly rate at **KSh 15,401**. In contrast, **entire homes** and **private rooms** are priced at relatively similar levels, with average nightly rates of **KSh 7,488** and **KSh 7,424**, respectively.

## Which room types generate the most revenue despite having lower average rates?

In [None]:
# Mean yearly revenue per room type, highest first
avg_rate_vs_revenue_per_room_type = (
    df.groupby('room_type')['revenue_per_year']
      .mean()
      .sort_values(ascending=False)
      .round(2)
)
avg_rate_vs_revenue_per_room_type

In [None]:
# Mean rate and mean yearly revenue side by side for each room type,
# ordered by revenue (highest first)
avg_rate_vs_revenue_per_room_type = (
    df.groupby('room_type')
      .agg(
          avg_rate_per_night=('avg_rate_per_year', 'mean'),
          revenue_per_year=('revenue_per_year', 'mean'),
      )
      .sort_values(by='revenue_per_year', ascending=False)
      .round(2)
)

avg_rate_vs_revenue_per_room_type

The room type generating the most revenue is **Entire Home** at **KSh 619,762** per year, despite having only the second-highest rate per night. It is followed by **Private Room** at **KSh 229,331** per year and, lastly, **Hotel Room** at **KSh 168,583** per year, despite having the highest rate per night.

# Host Strategy & Management

## Are professionally managed listings priced higher than individually managed ones?

A listing is considered **professionally managed** when it is:

*   Operated by a **property management company** or hospitality firm
    
*   Managed by hosts who handle **multiple properties** as a business rather than a single personal residence
    

This typically includes:

*   Standardized check-in/check-out procedures
    
*   Dedicated cleaning and maintenance teams
    
*   Consistent pricing and availability management
    
*   Formal guest communication and support systems
    

How this differs from individual hosts

*   **Individually managed listings** are usually run by:
    
    *   A single host
        
    *   Often the property owner
        
    *   With more personalized and less standardized operations
        
*   **Professionally managed listings** tend to:
    
    *   Have **stricter policies** (e.g., cancellation, minimum nights)
        
    *   Operate more like hotels or serviced apartments
        
    *   Prioritize occupancy optimization and operational efficiency

In [None]:
# Turn the boolean flag into readable category names
management_labels = df['professional_management'].map(
    {True: 'Professionally Managed', False: 'Individually Managed'}
)

# Listings per management category
professional_management_pie = management_labels.value_counts()

colors = ("#5da718", "#79059c")

plt.figure(figsize=(10, 8))
plt.pie(
    professional_management_pie,
    labels=professional_management_pie.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 11},
)
plt.title("Professionally Managed vs Individually Managed Listings")
plt.tight_layout()
plt.show()

# Print the raw counts behind the chart
print(professional_management_pie)


So 99 listings (33% of all listings) are professionally managed, while 201 listings (67%) are individually managed.

In [None]:
# Readable management category for each listing
management_labels = df['professional_management'].map(
    {True: 'Professionally Managed', False: 'Individually Managed'}
)

# Mean avg_rate_per_year within each management category
pricing_vs_avg_rate_per_year = (
    df.groupby(management_labels)['avg_rate_per_year']
      .mean()
      .round(2)
)
pricing_vs_avg_rate_per_year

In [None]:
# Bar chart of the mean rate per management category
plt.figure(figsize=(10, 6))
pricing_vs_avg_rate_per_year.plot.bar(color=["#05258D", "#067538"])
plt.title(
    'Professional Management Correlation To Average Price',
    fontsize=12,
    fontweight='bold',
)
plt.xlabel('Professional Management', fontsize=12)
plt.ylabel('Average Rate Per Year', fontsize=12)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


Based on the chart above, professionally managed listings tend to be priced higher than individually managed ones: professionally managed listings charge an average rate of **8,086 per night**, while individually managed listings charge an average rate of **7,219 per night**.

## Do professionally managed listings have better reviews/ratings?

In [None]:
# Mean overall rating per management category, lowest first
professionally_managed_vs_reviews = (
    df.groupby(management_labels)['rating_overall']
      .mean()
      .sort_values(ascending=True)
      .round(2)
)
professionally_managed_vs_reviews

**Individually Managed** Listings are slightly better rated at **4.79** compared to **Professionally Managed** listings at **4.77**

## Is there a relationship between professional management and cancellation strictness?

To assess whether host management strategy influences booking flexibility, we examined the distribution of cancellation policies across professionally and individually managed listings.

In [None]:
# Count professionally vs individually managed listings within each
# cancellation policy. value_counts must be *called*; without the
# parentheses the variable holds the bound method rather than the counts.
professional_management_vs_cancellation_strictness = (
    df.groupby('cancellation_policy')['professional_management'].value_counts()
)
professional_management_vs_cancellation_strictness

In [None]:
# Contingency table: rows are cancellation policies, columns are the
# management categories, cells are listing counts
pm_vs_policy = pd.crosstab(
    df['cancellation_policy'],
    df['professional_management'].map(
        {True: 'Professionally Managed', False: 'Individually Managed'}
    ),
)


In [None]:
# Stacked bars: one bar per cancellation policy, split by management category
pm_vs_policy.plot.bar(stacked=True, figsize=(10, 6))

plt.title('Professional Management vs Cancellation Policy')
plt.xlabel('Cancellation Policy')
plt.ylabel('Number of Listings')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


### Key Insights: Professional Management vs Cancellation Policy

1.  **Flexible policies are dominated by individually managed listings.** The majority of listings with _Flexible_ cancellation policies are individually managed, suggesting that solo hosts prioritize flexibility to attract bookings and remain competitive.
    
2.  **Professionally managed listings are more concentrated in stricter policies.** Under _Strict_ and _Moderate_ cancellation policies, professionally managed listings make up a noticeably larger share. This indicates that professional operators are more comfortable enforcing stricter terms, likely due to better demand forecasting, operational buffers, and portfolio diversification.
    
3.  **Moderate policies represent a middle ground for both host types.** Both individual and professional hosts are strongly represented under _Moderate_ policies, reinforcing the idea that this policy balances guest appeal with revenue protection.
    
4.  **Firm and Limited policies are relatively rare.** Very few listings adopt _Firm_ or _Limited_ cancellation policies, suggesting low market preference—possibly due to reduced guest willingness to book under highly restrictive conditions.
    

### Strategic Interpretation

Overall, **professional management is associated with stricter cancellation strategies**, while **individual hosts rely more on flexibility to drive demand**. This highlights differing risk tolerance and pricing strategies between professional operators and independent hosts in Nairobi’s Airbnb market.

A natural next step is to **connect this insight directly to pricing and occupancy outcomes**, which the following sections examine.

## What price range attracts the most bookings (occupancy)?


In [None]:
# Create price bins.
# The last edge is open-ended so that rates above 20,000 still fall into the
# "Luxury (15k+)" bucket instead of becoming NaN and dropping out of the
# grouped statistics below.
df['price_range'] = pd.cut(
    df['avg_rate_per_year'],
    bins=[0, 5000, 10000, 15000, float('inf')],
    labels=['Budget (0-5k)', 'Mid-range (5-10k)', 'Premium (10-15k)', 'Luxury (15k+)'],
)

# Mean occupancy and number of listings per price range, best-performing first.
# observed=False is spelled out so empty buckets stay in the table regardless
# of the pandas version's default for categorical groupers.
best_price_metric = (
    df.groupby('price_range', observed=False)['annual_occupancy']
      .agg(['mean', 'count'])
      .sort_values('mean', ascending=False)
)

plt.figure(figsize=(10, 6))
best_price_metric['mean'].plot(kind='bar', color='steelblue')
plt.title('Average Occupancy by Price Range', fontsize=16, fontweight='bold')
plt.xlabel('Price Range', fontsize=14)
plt.ylabel('Average Occupancy (%)', fontsize=14)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

print(best_price_metric)

### Insights: Average Occupancy by Price Range


1.  **Mid-range listings (KSh 5k–10k) attract the most customers.** With the highest average occupancy, mid-range listings appear to offer the best balance between price and perceived value. This price segment is likely the most attractive to the majority of guests in Nairobi.
    
2.  **Luxury listings (KSh 15k+) maintain strong demand.** Despite higher prices, luxury listings show relatively high occupancy, suggesting a consistent market for premium accommodation—possibly driven by business travelers, expatriates, or high-end tourists.
    
3.  **Budget listings (below KSh 5k) do not achieve the highest occupancy.** While budget listings are cheaper, their lower occupancy compared to mid-range listings suggests that **price alone is not the primary driver of demand**. Factors such as location, amenities, and perceived quality likely play a significant role.
    
4.  **Premium listings (KSh 10k–15k) show moderate occupancy.** Premium listings fall between mid-range and budget listings in terms of occupancy, indicating that demand tapers slightly as prices rise beyond the mid-range threshold.
    

### Key Takeaway


> **Mid-range Airbnb listings (KSh 5k–10k) attract the highest customer demand, highlighting a value-for-money sweet spot in Nairobi’s short-term rental market.**

## How does cancellation policy affect pricing strategy?

In this segment, we check the various types of cancellation policies present within our dataset

In [None]:
# Share of listings under each cancellation policy, drawn as a pie chart
plt.figure(figsize=(10, 8))
cancellation_policy_count = df['cancellation_policy'].value_counts()
colors = ['#FF6B6B', "#6F0DD0", '#45B7D1', "#EDC914", "#4EED14"]
plt.pie(
    cancellation_policy_count,
    labels=cancellation_policy_count.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
    textprops={'fontsize': 11},
)
plt.title(
    "Cancellation Policy Distribution",
    fontsize=14,
    fontweight='bold',
)
plt.tight_layout()
plt.show()

# Print the raw counts behind the chart
print(cancellation_policy_count)

### Cancellation Policy Distribution

The dataset is dominated by **Flexible** and **Moderate** cancellation policies, with **43.7%** and **36.7%** listings respectively. This suggests that most hosts prefer policies that allow guests greater freedom to cancel reservations with minimal penalties.

**Firm** and **Strict** policies are less common, appearing in **11.3%** and **8.0%** listings respectively, indicating a smaller segment of hosts who prioritize booking certainty over flexibility.

Only **one** listing follows a **Limited** cancellation policy, making it negligible in the overall distribution.

### Key Insight

Overall, the distribution reflects a **guest-friendly marketplace**, where flexible cancellation options are more prevalent than restrictive policies, likely aimed at attracting short-term and undecided travelers.

In [None]:
# Restrict to the listed policies, then average the rate per policy
# (highest first, 2 decimals)
valid_policies = ['Flexible', 'Moderate', 'Firm', 'Strict']
cancellation_policy_vs_pricing = (
    df.loc[df['cancellation_policy'].isin(valid_policies)]
      .groupby('cancellation_policy')['avg_rate_per_year']
      .mean()
      .sort_values(ascending=False)
      .round(2)
)
cancellation_policy_vs_pricing

I excluded **Limited** because it has only one entry, so I don't think it is a good representation for the insights I'd like to derive from the dataset.

In [None]:
# Bar chart of the mean rate for each cancellation policy
plt.figure(figsize=(10, 6))
cancellation_policy_vs_pricing.plot.bar()
plt.title(
    'Cancellation Policy and it\'s effect on Pricing of the Listing',
    fontsize=14,
    fontweight='bold',
)
plt.xlabel('Cancellation Policy', fontsize=12)
plt.ylabel('Average Rate per Night', fontsize=12)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


### Effect of Cancellation Policy on Listing Pricing

The chart shows a **clear relationship between cancellation strictness and average nightly price**.

*   **Moderate cancellation policies** have the **highest average nightly rates**, suggesting that hosts with mid-level flexibility are able to charge a premium—possibly balancing guest trust with revenue protection.
    
*   **Strict policies** follow closely, indicating that listings enforcing tighter cancellation rules still command relatively high prices, likely due to stronger demand, desirable locations, or higher-quality listings.
    
*   **Firm policies** sit in the middle range, reflecting a moderate pricing strategy.
    
*   **Flexible cancellation policies** are associated with the **lowest average nightly rates**, suggesting that hosts may lower prices to compensate for the higher risk of last-minute cancellations.
    

###  Interpretation

Overall, **less flexible cancellation policies tend to correlate with higher pricing**, implying that hosts who restrict cancellations may do so confidently when their listings have strong market appeal. Conversely, greater flexibility appears to be used as a competitive pricing lever to attract more bookings.

#  Occupancy & Booking Patterns

## What's the average occupancy rate across the dataset?
The average occupancy rate across all active Airbnb listings in Nairobi provides a baseline measure of market demand. This metric helps contextualize how different pricing strategies, room types, and cancellation policies perform relative to the overall market.

In [None]:
# Mean of annual_occupancy across all listings, rounded to 2 decimals
average_occupancy_rate = df['annual_occupancy'].mean().round(2)
average_occupancy_rate

Average occupancy rate across the dataset is **23.39%**

## Do flexible cancellation policies lead to higher occupancy?

In [None]:
# Restrict to the listed policies, then average annual occupancy per policy
# (highest first, 2 decimals)
valid_policies = ['Flexible', 'Moderate', 'Firm', 'Strict']
cancellation_policy_vs_occupancy = (
    df.loc[df['cancellation_policy'].isin(valid_policies)]
      .groupby('cancellation_policy')['annual_occupancy']
      .mean()
      .sort_values(ascending=False)
      .round(2)
)
cancellation_policy_vs_occupancy

From our findings, listings with a **Strict** cancellation policy have the highest average annual occupancy at **26.35%**, followed by **Moderate** at **23.61%**, **Firm** at **23.29%**, and **Flexible** with the lowest at **22.42%**. So, to answer the question: **No**.
#### Flexible Cancellation policies do not attract higher annual occupancy

## How does minimum night requirement affect booking frequency?

In [None]:
# Mean annual occupancy for each minimum-nights value, highest first
minimum_nights_vs_booking_frequency = (
    df.groupby('minimum_nights')['annual_occupancy']
      .mean()
      .sort_values(ascending=False)
)
minimum_nights_vs_booking_frequency

In [None]:
# Horizontal bars: one bar per minimum-nights value
plt.figure(figsize=(10, 6))
minimum_nights_vs_booking_frequency.plot.barh()
plt.title(
    'Minimum Nights vs Booking Frequency',
    fontsize=14,
    fontweight='bold',
)
plt.xlabel('Annual Occupancy Percentage', fontsize=12)
plt.ylabel('Minimum Nights', fontsize=12)
plt.tight_layout()
plt.show()

**1\. Long-Term Stays Dominate Annual Occupancy**

*   **30-Day Stays are King:** The most significant finding is that bookings with a minimum stay of **30 nights** account for the highest annual occupancy percentage by a wide margin, reaching approximately **82%**.
    
*   **Strong Performance of 28-Day Stays:** The second-highest occupancy is for **28-night** minimum stays, at around **44%**.
    
*   **Conclusion:** This suggests a very strong market for monthly or near-monthly rentals, which could be driven by corporate housing, digital nomads, or temporary relocations. These long stays are the primary drivers of occupancy in this dataset.
    

**2\. Weekly and Long-Weekend Stays are Important Secondary Drivers**

*   **The "Sweet Spot" for Shorter Stays:** Minimum stays of **7 nights** and **3 nights** have very similar and significant occupancy percentages, both hovering around the **30-31%** mark.
    
*   **Conclusion:** There is substantial demand for weekly vacations and extended weekend trips. For hosts not targeting the monthly market, these two durations appear to be the most effective for maintaining occupancy.
    

**3\. Very Short Stays Contribute Less to Overall Occupancy**

*   **1-2 Night Stays:** While extremely common in the short-term rental market, minimum stays of **2 nights** (~26%) and **1 night** (~20%) contribute less to the total annual occupancy than the 3, 7, 28, and 30-night options.
    
*   **Implication:** This could indicate that while these bookings may be frequent, the higher turnover results in more unbooked "gap days," leading to a lower overall occupancy percentage over the course of a year compared to longer, more continuous stays.
    

**4\. Specific Durations are Less Effective**

*   **Low Occupancy for 4, 5, and 14 Nights:** Minimum stays of **4 nights** (~14%), **14 nights** (~4%), and **5 nights** (~3%) show the lowest annual occupancy percentages.
    
*   **Conclusion:** These specific booking windows seem to be less popular with guests or less successfully utilized by hosts compared to the standard 1-night, weekend (2-3 nights), weekly (7 nights), or monthly models.
    

**Strategic Implication for Hosts:** To maximize annual occupancy, the data suggests targeting the **30+ day market** is the most effective strategy. If that is not feasible, focusing on **7-night (weekly)** or **3-night (long weekend)** minimums would be better than setting minimums of 1, 2, 4, 5, or 14 nights.

**Key Insight** The data suggests that decreasing booking frequency (by raising minimum nights to 28+) actually increases your overall business performance (occupancy).

## Which listing types have the highest occupancy rates?

In [None]:
# Mean annual occupancy per room type, highest first, to 2 decimals
listing_types_vs_occupancy_rate = (
    df.groupby('room_type')['annual_occupancy']
      .mean()
      .sort_values(ascending=False)
      .round(2)
)
listing_types_vs_occupancy_rate

**Entire Homes** have the highest average occupancy rate at **25.05%**, followed by **Private Rooms** at **15.08%** and, lastly, **Hotel Rooms** at **3.0%**.

## How many listings per host on average?

In [None]:
# Distinct host_id values; the cell output is the number of unique hosts
no_of_hosts = df['host_id'].unique()
len(no_of_hosts)

Our dataset has 205 unique hosts, each of whom owns one or more listings within Nairobi.

In [None]:
# One row per host: the first host_name seen for that host_id and the
# count of its listing_id values, ordered from most to fewest listings
no_of_listing_per_host = (
    df.groupby('host_id')
      .agg(
          host_name=('host_name', 'first'),
          number_of_listings=('listing_id', 'count'),
      )
      .sort_values(by='number_of_listings', ascending=False)
)

no_of_listing_per_host

In [None]:
# First row of the per-host table; that table is sorted by number_of_listings
# in descending order where it is built, so this is the host with the most listings
top_host = no_of_listing_per_host.iloc[0]
print(f"Top host: {top_host['host_name']} with {top_host['number_of_listings']} listings")

In [None]:
# Summary of the per-host table: row count, then mean / max / min of number_of_listings
print(f"Total hosts: {len(no_of_listing_per_host)}")
print(f"Average listings per host: {no_of_listing_per_host['number_of_listings'].mean():.2f}")
print(f"Max listings by one host: {no_of_listing_per_host['number_of_listings'].max()}")
print(f"Min listings by one host: {no_of_listing_per_host['number_of_listings'].min()}")

The dataset contains **205 unique hosts**, with an **average of 1.46 listings per host**, indicating that the market is largely composed of small-scale hosts. An average this close to one, together with the minimum of one listing per host, implies that most hosts operate **a single listing**. However, there is evidence of **professional or portfolio-style hosting**, with the largest host, **Samra Apartments**, managing **19 listings**. This highlights a market structure dominated by individual hosts alongside a small number of high-volume operators.

# Geographic/Neighborhood Patterns

In [None]:
# Plot every listing at its coordinates, coloured by avg_rate_per_year
plt.figure(figsize=(8, 6))
plt.scatter(
    df['longitude'],
    df['latitude'],
    c=df['avg_rate_per_year'],
    alpha=0.6,
)
plt.colorbar(label='Average Nightly Rate')
plt.title('Pricing Distribution Across Nairobi')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

In [None]:
# Latitude/longitude box used to keep only listings inside the Nairobi area
NAIROBI_BOUNDS = dict(
    lat_min=-1.5,
    lat_max=-1.1,
    lon_min=36.6,
    lon_max=37.1,
)


In [None]:
# Keep only rows whose coordinates fall inside NAIROBI_BOUNDS
# (Series.between is inclusive on both ends by default)
df_nairobi = df[
    df['latitude'].between(NAIROBI_BOUNDS['lat_min'], NAIROBI_BOUNDS['lat_max'])
    & df['longitude'].between(NAIROBI_BOUNDS['lon_min'], NAIROBI_BOUNDS['lon_max'])
].copy()

# Report how many rows survived the filter
print(f"Listings before: {len(df)}")
print(f"Listings in Nairobi: {len(df_nairobi)}")


In [None]:
import folium
from IPython.display import display

# Base map centred on Nairobi. folium.Map has no `bounds` argument: with
# max_bounds=True the panning limits come from min_lat / max_lat / min_lon /
# max_lon, so the bounding box is passed through those parameters.
m = folium.Map(
    location=[-1.286389, 36.817223],  # Nairobi CBD
    zoom_start=12,
    min_zoom=11,
    max_zoom=16,
    max_bounds=True,
    min_lat=NAIROBI_BOUNDS["lat_min"],
    max_lat=NAIROBI_BOUNDS["lat_max"],
    min_lon=NAIROBI_BOUNDS["lon_min"],
    max_lon=NAIROBI_BOUNDS["lon_max"],
    tiles="OpenStreetMap"
)


In [None]:
#m.fit_bounds(NAIROBI_BOUNDS)

In [None]:
from html import escape

# Add one circle marker per listing, coloured by price tier
for row in df_nairobi.itertuples():
    rate = row.avg_rate_per_year
    if rate < 5000:
        color = '#2EC4B6'   # Budget
    elif rate < 10000:
        color = '#1F77B4'   # Mid-range
    elif rate < 15000:
        color = '#FF9F1C'   # Premium
    else:
        color = '#E63946'   # Luxury

    # Free-text fields are HTML-escaped so characters such as "<" or "&" in a
    # listing name cannot break (or inject markup into) the popup.
    popup_text = f"""
    <b style='font-size: 14px'>{escape(str(row.listing_name))}</b><br>
    <b>Type:</b> {escape(str(row.listing_type))}<br>
    <b>Room:</b> {escape(str(row.room_type))}<br>
    <b>Rating:</b> {row.rating_overall}/5.0<br>
    <b>Price:</b> KSh {rate:.0f}/night
    """

    folium.CircleMarker(
        location=[row.latitude, row.longitude],
        radius=4,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        popup=folium.Popup(popup_text, max_width=300)
    ).add_to(m)

display(m)

In [None]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderServiceError
import pandas as pd
from time import sleep

# Address fields extracted from each reverse-geocoding result; also the
# names of the columns added to df below.
GEO_COLUMNS = ['neighbourhood', 'suburb', 'county', 'city', 'postcode']

geolocator = Nominatim(user_agent="airbnb_nairobi_analysis")

def reverse_geocode(lat, lon):
    """Look up the address components for one coordinate pair.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.

    Returns:
        A ``pd.Series`` indexed by ``GEO_COLUMNS``. Every value is ``None``
        when the service reports an error or finds no address; individual
        values are ``None`` when the address lacks that field. The index is
        the same on every path so that ``DataFrame.apply`` always produces
        the same five columns.
    """
    empty = pd.Series([None] * len(GEO_COLUMNS), index=GEO_COLUMNS)
    try:
        location = geolocator.reverse((lat, lon), exactly_one=True)
    except GeocoderServiceError:
        # Best-effort lookup: a failed request yields an empty row rather
        # than aborting the whole run.
        return empty
    finally:
        # Throttle to roughly one request per second so the public
        # Nominatim service is not hammered.
        sleep(1)

    if location is None:
        return empty

    address = location.raw.get('address', {})
    return pd.Series({col: address.get(col) for col in GEO_COLUMNS})

# Apply to dataset (⚠️ slow — sample first)
df[GEO_COLUMNS] = (
    df[['latitude', 'longitude']]
    .apply(lambda row: reverse_geocode(row['latitude'], row['longitude']), axis=1)
)


After executing the geocoding process, five new location-based columns were added to the dataset: **neighbourhood**, **suburb**, **county**, **city**, and **postcode**. While all five fields contain some missing values, **suburb** stands out as the most reliable geographic indicator, with only a single missing entry and **26 distinct areas** represented. As a result, suburb is used as the primary geographic dimension for analyzing spatial patterns within the dataset.

## Are there neighborhood clusters with similar pricing?

In [None]:
# Distinct values of the suburb column
suburbs = df['suburb'].unique()
suburbs

In [None]:
# Mean avg_rate_per_year per suburb, cheapest first, returned as a flat
# two-column table
neighbourhoods_vs_pricing = (
    df.groupby('suburb')['avg_rate_per_year']
      .mean()
      .sort_values(ascending=True)
      .round(2)
      .reset_index()
)
neighbourhoods_vs_pricing

#### Prepare data for clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

X = neighbourhoods_vs_pricing[['avg_rate_per_year']]

scaler = StandardScaler()
