In [None]:
import pandas as pd

# Load the listings file for each city from ../data.
df_nyc = pd.read_csv("../data/listings_nyc.csv.gz")
df_bj = pd.read_csv("../data/listings_beijing.csv.gz")
df_syd = pd.read_csv("../data/listings_sydney.csv.gz")

# Tag each frame with its city label. Every frame is checked on its own, so a
# file that already carries a "city" column does not prevent the other files
# from being labelled.
for frame, label in ((df_nyc, "New York"), (df_bj, "Beijing"), (df_syd, "Sydney")):
    if "city" not in frame.columns:
        frame["city"] = label

# Stack the three frames. ignore_index=True gives the combined frame a unique
# 0..n-1 index instead of repeating each file's own row numbers.
df_all = pd.concat([df_nyc, df_bj, df_syd], axis=0, ignore_index=True)
print("Total rows:", len(df_all))
print("\nColumn names:")
print(df_all.columns.tolist())

# Null count per column, largest first.
print("\nTop 10 columns with the most missing values:")
print(df_all.isnull().sum().sort_values(ascending=False).head(10))

# Preview the first 5 rows.
df_all.head()


In [None]:
# Keep only rows that carry a price: drop missing values and empty strings.
has_price = df_all["price"].notnull() & (df_all["price"] != "")
df_all = df_all[has_price]

def clean_price(value):
    """Convert a raw price entry to a float.

    String inputs have '$', '¥', ',' and surrounding whitespace removed
    before conversion. Non-string inputs are passed to float() directly.

    Returns:
        The price as a float, or NaN when the value cannot be parsed
        (None, an empty/whitespace-only string, or non-numeric text), so a
        single malformed row does not abort a whole-column ``.apply`` and
        can be dropped afterwards with a null filter.
    """
    if isinstance(value, str):
        value = value.replace('$', '').replace('¥', '').replace(',', '').strip()
    try:
        return float(value)
    except (TypeError, ValueError):
        # Unparseable entry: report it as missing instead of raising.
        return float("nan")

# Parse every raw price entry into a float with clean_price.
price_as_float = df_all["price"].apply(clean_price)
df_all["price"] = price_as_float

# Discard rows whose parsed price is missing.
df_all = df_all.dropna(subset=["price"])

print("Price column cleaned. Sample values:")
print(df_all[["city","price"]].head())




In [None]:
# Mean listing price per city.
# NOTE(review): prices are averaged as stored; confirm that all cities use the
# same currency before comparing these numbers.
avg_price_by_city = df_all.groupby("city")["price"].mean()
print("Average price by city:")
print(avg_price_by_city)

In [None]:
# Price distribution by city

import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of listing price for each city, drawn on an explicit 8x6 inch axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x="city", y="price", data=df_all, ax=ax)
ax.set_title("Price Distribution by City")
ax.set_ylabel("Price")
ax.set_xlabel("City")
plt.show()

In [None]:
# Per-city summary of price; named aggregation sets the output column names.
stats = (
    df_all
    .groupby("city")["price"]
    .agg(
        Count="count",
        Mean="mean",
        Median="median",
        Standard_Deviation="std",
        Min="min",
        Max="max",
    )
)
print(stats)


In [None]:
#Compare different room type avg price
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Mean price per room type, kept as a two-column frame (room_type, price).
room_price = df_all.groupby("room_type", as_index=False)["price"].mean()

# Bar chart of the averages on an explicit 8x5 inch axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=room_price, x="room_type", y="price", palette="viridis", ax=ax)
ax.set_title("Avg price Vs Room Type")
ax.set_xlabel("Room Type")
ax.set_ylabel("Average Price")
plt.xticks(rotation=15)     # tilt the category labels slightly
plt.tight_layout()          # adjust spacing so labels are not clipped
plt.show()



In [None]:
# Geographic Distribution

# Plot a random sample of listings. The seed is fixed so the figure is the same
# on every run, and the sample size is capped at the number of available rows
# so a frame with fewer than 1000 rows does not raise.
n_points = min(1000, len(df_all))
geo_sample = df_all.sample(n=n_points, random_state=42)

plt.figure(figsize=(8, 5))       # 8 inches wide by 5 inches tall
sns.scatterplot(data=geo_sample, x="longitude", y="latitude", hue="city", alpha=0.4, s=10)
plt.title(f"Geographic Distribution (Sample of {n_points})")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.legend()        # show the legend that maps point colours to cities
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Same per-city box plot, restricted to listings priced strictly below 500 so
# the extreme values do not dominate the scale.
# NOTE(review): the labels say USD; confirm the price column is in one currency.
fig, ax = plt.subplots(figsize=(10, 6))
below_500 = df_all[df_all["price"] < 500]
sns.boxplot(x="city", y="price", data=below_500, ax=ax)
ax.set_title("Price Distribution by City (Excluding Listings Over $500)")
ax.set_ylabel("Price (USD)")
ax.set_xlabel("City")
plt.tight_layout()  # adjust spacing so labels are not clipped
plt.show()



In [None]:
# Average price per room type, one bar per city, using only listings priced
# strictly below 500.
fig, ax = plt.subplots(figsize=(12, 6))
listings_below_500 = df_all[df_all["price"] < 500]
sns.barplot(
    data=listings_below_500,
    x="room_type",
    y="price",
    hue="city",
    ci=None,    # no confidence-interval error bars
    ax=ax,
)
ax.set_title("Average Price by City and Room Type")
ax.set_ylabel("Average Price (USD)")
ax.set_xlabel("Room Type")
ax.legend(title="City")
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Bar chart of how many listings each city contributes.
listings_per_city = df_all["city"].value_counts()
ax = listings_per_city.plot(kind='bar')
ax.set_title("Number of Listings by City")
ax.set_xlabel("City")
ax.set_ylabel("Number of Listings")
plt.xticks(rotation=0)      # keep the city names horizontal
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Mean price per city as a two-column frame (city, price) for plotting.
avg_price_by_city = (
    df_all
    .groupby("city")["price"]
    .mean()
    .reset_index()
)

# Bar chart of the per-city averages on an explicit 8x6 inch axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=avg_price_by_city, x="city", y="price", ax=ax)
ax.set_title("Average Price by City")
ax.set_xlabel("City")
ax.set_ylabel("Average Price (USD)")
plt.tight_layout()
plt.show()
