# Airbnb Data Analysis

This notebook explores the cleaned and merged Airbnb dataset generated by the ETL pipeline.
We'll uncover insights about pricing, availability, and neighborhood trends.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pathlib import Path

# Apply the "seaborn-v0_8-whitegrid" style sheet globally so that every
# matplotlib figure drawn after this point shares the same look.
# NOTE(review): the "seaborn-v0_8-" prefixed style names are presumably only
# available on newer matplotlib releases — confirm against the pinned version.
plt.style.use("seaborn-v0_8-whitegrid")

In [None]:
# Read the merged dataset produced by the ETL pipeline. The path is relative,
# so it resolves against the kernel's current working directory.
data_path = Path("../data/processed/airbnb_merged.csv")
df = pd.read_csv(
    filepath_or_buffer=data_path,
    low_memory=False,
)

# Report (rows, columns), then preview the first few records.
print(df.shape)
df.head()

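In the raw Airbnb data, `price` is stored as a string such as `$123.00`. After a quick look at the schema and summary statistics, we strip the currency formatting and convert the column to a float.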

In [None]:
# Print the frame's schema overview (one line per column).
df.info()

# Summary statistics for every column, numeric or not, transposed so that each
# column of the frame becomes a row; only the first ten rows are displayed.
(
    df.describe(include="all")
    .transpose()
    .iloc[:10]
)

In [None]:
# Remove dollar signs and thousands separators from the price strings, then
# cast the result to float so the column can be aggregated and plotted.
price_without_symbols = df["price"].replace(
    to_replace='[\\$,]',
    value='',
    regex=True,
)
df["price"] = price_without_symbols.astype(float)

Price distribution

In [None]:
# Histogram of listing prices (50 bins) with a kernel-density curve overlaid,
# drawn on an explicitly created Axes instead of the implicit current one.
price_fig, price_ax = plt.subplots()
sns.histplot(df["price"], bins=50, kde=True, ax=price_ax)
price_ax.set_title("Distribution of Airbnb Prices")
price_ax.set_xlabel("Price ($)")
plt.show()

Average price by neighborhood

In [None]:
# Mean listing price per neighbourhood, most expensive first.
price_by_neigh = (
    df.groupby("neighbourhood")["price"]
    .mean()
    .sort_values(ascending=False)
)

# Horizontal bar chart of the ten most expensive neighbourhoods.
top_neigh_ax = price_by_neigh.head(10).plot(
    kind="barh", figsize=(8, 5), color="coral"
)
# barh draws the first row at the bottom of the axes; flip the y-axis so the
# highest-priced neighbourhood is at the top and the chart reads as a ranking.
top_neigh_ax.invert_yaxis()
top_neigh_ax.set_title("Top 10 Neighborhoods by Average Price")
top_neigh_ax.set_xlabel("Average Price ($)")
plt.show()

Room type distribution

In [None]:
# Count listings per room type, with bars ordered from the most to the least
# common type (value_counts sorts by descending frequency).
room_type_order = df["room_type"].value_counts().index
sns.countplot(data=df, y="room_type", order=room_type_order)
plt.title("Room Type Distribution")
plt.show()

Availability trends

In [None]:
# Share of rows flagged available ("t") for each date.
# The comparison is done once over the whole column and then averaged per date
# with the built-in groupby mean, instead of running a Python lambda for every
# date group; the resulting Series is the same (named "available", indexed by
# "date", float values between 0 and 1).
is_available = df["available"].eq("t")
availability = is_available.groupby(df["date"]).mean()

# NOTE(review): "date" comes straight from read_csv with no date parsing, so it
# is presumably still a string column and the x-axis is built from those string
# labels rather than a real time axis — confirm whether it should be converted
# to datetime upstream.
availability.plot(figsize=(10, 4), title="Average Availability Over Time")
plt.ylabel("Availability Rate")
plt.show()

Review volume vs price

In [None]:
# Interactive scatter of review count against price, one colour per room type.
# The figure is the cell's last expression so the notebook renders it.
review_price_fig = px.scatter(
    data_frame=df,
    x="review_count",
    y="price",
    color="room_type",
    title="Review Volume vs Price by Room Type",
)
review_price_fig