In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw listings.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass; it is the same option clean_airbnb_data() uses further down,
# so both loads of this file produce identically-typed frames.
df = pd.read_csv('data/listings.csv', low_memory=False)

# Column names, dtypes and non-null counts (info() prints directly)
df.info()

# First rows — as the last expression of the cell this is rendered as a table
df.head()


In [15]:
# Remove $ and commas in 'price' and convert to float
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Show the 10 columns with the most missing values.
# Jupyter only renders a cell's *last* expression, and this cell ends with an
# assignment, so the summary has to be printed explicitly to be visible.
print(df.isnull().sum().sort_values(ascending=False).head(10))

# Drop columns with too many missing values
# (errors='ignore' keeps the cell re-runnable once they are already gone)
df = df.drop(columns=['license', 'neighbourhood_group_cleansed'], errors='ignore')

# Drop rows that are missing any of the fields used by the analysis below
df = df.dropna(subset=['price', 'room_type', 'neighbourhood_cleansed'])

# Optional: filter out crazy prices (keeps rows with price strictly below 1000)
df = df[df['price'] < 1000]


In [None]:
# Persist the cleaned listings to disk; the row index is not written out
df.to_csv(
    path_or_buf='data/cleaned_listings.csv',
    index=False,
)


In [None]:
# Function to clean Airbnb data
# This function reads the CSV file, cleans the 'price' column, drops unnecessary columns,
# and filters out rows with missing values or unrealistic prices.
# It returns a cleaned DataFrame ready for analysis.

def clean_airbnb_data(filepath, max_price=1000):
    """Load an Airbnb listings CSV and return a cleaned DataFrame.

    Steps, in order:
      1. Read the CSV with ``low_memory=False``.
      2. Strip ``$`` and ``,`` from the ``price`` column and cast it to float.
      3. Drop the ``license`` and ``neighbourhood_group_cleansed`` columns
         when they are present.
      4. Drop rows missing ``price``, ``room_type`` or
         ``neighbourhood_cleansed``.
      5. Keep only rows whose price is strictly below ``max_price``.

    Parameters
    ----------
    filepath : str, path object or file-like
        Passed straight to ``pandas.read_csv``.
    max_price : float, default 1000
        Exclusive upper bound on ``price``. The default reproduces the
        previously hard-coded cut-off.

    Returns
    -------
    pandas.DataFrame
        The filtered rows; the original row index labels are retained.

    Raises
    ------
    KeyError
        If ``price``, ``room_type`` or ``neighbourhood_cleansed`` is not a
        column of the file.
    ValueError
        If a price cannot be converted to float after stripping ``$``/``,``.
    """
    df = pd.read_csv(filepath, low_memory=False)

    # "$1,234.00" -> 1234.0 ; missing prices stay NaN and are dropped below
    df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

    # errors='ignore': files that lack these columns are accepted as-is
    df = df.drop(columns=['license', 'neighbourhood_group_cleansed'], errors='ignore')
    df = df.dropna(subset=['price', 'room_type', 'neighbourhood_cleansed'])

    return df[df['price'] < max_price]


# Rebuild the cleaned frame from the raw file via the helper, then write it
# out again without the row index.
df = clean_airbnb_data(filepath="data/listings.csv")
df.to_csv(path_or_buf="data/cleaned_listings.csv", index=False)


In [None]:
# Price distribution: 50-bin histogram with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(data=df, x='price', bins=50, kde=True, ax=ax)
ax.set(
    title='Airbnb Price Distribution',
    xlabel='Price ($)',
    ylabel='Count',
)
plt.show()


In [None]:
# Average price per room type, sorted from lowest to highest mean
room_prices = (
    df.groupby('room_type')['price']
    .mean()
    .sort_values()
)

# Draw on an explicit Axes rather than relying on pyplot's "current figure"
fig, ax = plt.subplots(figsize=(8, 5))
room_prices.plot.bar(ax=ax, color='skyblue')
ax.set_title('Average Price by Room Type')
ax.set_ylabel('Average Price ($)')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y')
plt.show()


In [None]:
# Ten neighbourhoods with the highest mean price, highest first
top_hoods = (
    df.groupby('neighbourhood_cleansed')['price']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

# Draw on an explicit Axes rather than relying on pyplot's "current figure"
fig, ax = plt.subplots(figsize=(10, 6))
top_hoods.plot.bar(ax=ax, color='orange')
ax.set_title('Top 10 Most Expensive Neighborhoods')
ax.set_ylabel('Average Price ($)')
ax.tick_params(axis='x', labelrotation=45)
ax.grid(axis='y')
plt.show()


In [None]:
# Price against review count; alpha=0.5 keeps overlapping points readable
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='number_of_reviews', y='price', alpha=0.5, ax=ax)
ax.set(
    title='Price vs. Number of Reviews',
    xlabel='Number of Reviews',
    ylabel='Price ($)',
)
plt.show()


In [23]:
# Linear Regression Model to Predict Price

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Predictor columns; rows missing any predictor or the target are discarded
features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']
df_model = df[[*features, 'price']].dropna()

# Design matrix / target, then an 80/20 split with a fixed seed
X, y = df_model[features], df_model['price']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Ordinary least squares fit (fit() returns the estimator itself)
model = LinearRegression().fit(X_train, y_train)

# Held-out predictions; RMSE is the square root of the mean squared error
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred) ** 0.5

print(f"RMSE: ${rmse:.2f}")


RMSE: $129.00
