In [None]:
# Airbnb Data Analysis
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Confirm that the import cell ran to completion.
for status_line in ("Libraries imported successfully!",
                    "Python Data Analysis Environment Ready"):
    print(status_line)


In [None]:
# Load and examine the data
# Nothing is read from disk here: a seeded, synthetic table stands in for
# real Airbnb listings so the rest of the notebook has something to analyse.
np.random.seed(42)  # fixed seed -> the same table on every run

n_listings = 1000

# Categorical columns described as {level: sampling probability}.
borough_probs = {'Manhattan': 0.4, 'Brooklyn': 0.3, 'Queens': 0.15, 'Bronx': 0.1, 'Staten Island': 0.05}
room_type_probs = {'Entire home/apt': 0.6, 'Private room': 0.35, 'Shared room': 0.05}
min_night_probs = {1: 0.4, 2: 0.25, 3: 0.15, 7: 0.15, 30: 0.05}

# Draw the columns one at a time. The order of these calls fixes how the
# seeded random stream is consumed, so it must stay exactly as written.
boroughs = np.random.choice(list(borough_probs), n_listings, p=list(borough_probs.values()))
room_types = np.random.choice(list(room_type_probs), n_listings, p=list(room_type_probs.values()))
raw_prices = np.random.lognormal(mean=4.5, sigma=0.8, size=n_listings).astype(int)
min_nights = np.random.choice(list(min_night_probs), n_listings, p=list(min_night_probs.values()))
review_counts = np.random.poisson(lam=20, size=n_listings)
days_available = np.random.randint(0, 366, n_listings)

data = pd.DataFrame({
    'id': range(1, n_listings + 1),
    'neighbourhood_group': boroughs,
    'room_type': room_types,
    'price': raw_prices,
    'minimum_nights': min_nights,
    'number_of_reviews': review_counts,
    'availability_365': days_available,
})

# Clean the data: cap prices to the 20-1000 range
data['price'] = data['price'].clip(lower=20, upper=1000)

print("Sample Airbnb Dataset Created!")
print(f"Dataset shape: {data.shape}")
print("\nFirst few rows:")
print(data.head())


In [None]:
# Basic Data Analysis and Statistics
# Structural profile of the listings table: size, summary statistics,
# null counts and column dtypes.
row_count = len(data)
column_count = len(data.columns)
footprint_kb = data.memory_usage(deep=True).sum() / 1024  # bytes -> KB

print("=== DATASET OVERVIEW ===")
print(f"Total number of listings: {row_count:,}")
print(f"Number of columns: {column_count}")
print(f"Memory usage: {footprint_kb:.1f} KB")

print("\n=== BASIC STATISTICS ===")
print(data.describe())

print("\n=== MISSING VALUES ===")
missing_values = data.isnull().sum()
if missing_values.sum() > 0:
    # Only list the columns that actually contain nulls.
    print(missing_values[missing_values > 0])
else:
    print("No missing values found!")

print("\n=== DATA TYPES ===")
print(data.dtypes)


In [None]:
# Comprehensive Data Visualizations
# One 2x2 figure: room-type share (pie), price by neighbourhood group (box),
# availability by room type (violin) and number of reviews vs price (scatter).
plt.style.use('seaborn-v0_8')
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# 1. Room type distribution (pie chart)
room_counts = data['room_type'].value_counts()
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
# pie() requires one explode offset per wedge, so derive the list from the
# data instead of hard-coding three entries; a dataset with a different
# number of room types would otherwise raise ValueError.
axes[0, 0].pie(room_counts.values,
               labels=room_counts.index,
               autopct='%1.1f%%',
               startangle=90,
               colors=colors,
               explode=[0.05] * len(room_counts))
axes[0, 0].set_title('Room Type Distribution', fontsize=14, fontweight='bold')

# 2. Price distribution by neighbourhood group (box plot)
# seaborn 0.13 deprecates passing `palette` without `hue`. Mapping the x
# variable to hue keeps the per-category colours; dodge=False keeps each box
# centred on its tick on seaborn versions that would otherwise offset them.
sns.boxplot(data=data, x='neighbourhood_group', y='price',
            hue='neighbourhood_group', palette='Set2', dodge=False,
            ax=axes[0, 1])
axes[0, 1].set_title('Price Distribution by Neighbourhood Group', fontsize=14, fontweight='bold')
axes[0, 1].set_xlabel('Neighbourhood Group', fontsize=12)
axes[0, 1].set_ylabel('Price ($)', fontsize=12)
axes[0, 1].tick_params(axis='x', rotation=45)

# 3. Availability by room type (violin plot)
sns.violinplot(data=data, x='room_type', y='availability_365',
               hue='room_type', palette='viridis', dodge=False,
               ax=axes[1, 0])
axes[1, 0].set_title('Availability Distribution by Room Type', fontsize=14, fontweight='bold')
axes[1, 0].set_xlabel('Room Type', fontsize=12)
axes[1, 0].set_ylabel('Availability (days/year)', fontsize=12)
axes[1, 0].tick_params(axis='x', rotation=45)

# The hue mapping only duplicates the x-axis labels. Some seaborn versions
# still draw a legend for it, so remove one if it was created.
for categorical_ax in (axes[0, 1], axes[1, 0]):
    redundant_legend = categorical_ax.get_legend()
    if redundant_legend is not None:
        redundant_legend.remove()

# 4. Number of reviews vs price (scatter plot)
scatter = axes[1, 1].scatter(data['number_of_reviews'], data['price'],
                             alpha=0.6, s=30, c=data['price'],
                             cmap='coolwarm', edgecolors='black', linewidth=0.5)
axes[1, 1].set_xlabel('Number of Reviews', fontsize=12)
axes[1, 1].set_ylabel('Price ($)', fontsize=12)
axes[1, 1].set_title('Number of Reviews vs Price', fontsize=14, fontweight='bold')
# Attach the colour bar through the figure object rather than pyplot state.
fig.colorbar(scatter, ax=axes[1, 1], label='Price ($)')

plt.tight_layout()
plt.show()

# Print key insights
# Headline figures for the listings table, collected first and then
# printed one line at a time.
banner = "="*50
listing_prices = data['price']
top_room_type = data['room_type'].mode()[0]
top_neighbourhood = data['neighbourhood_group'].mode()[0]

insight_lines = [
    "\n" + banner,
    "🏠 KEY INSIGHTS FROM THE DATA",
    banner,
    f"📊 Total listings analyzed: {len(data):,}",
    f"💰 Average price: ${listing_prices.mean():.2f}",
    f"🏆 Most expensive listing: ${listing_prices.max():.2f}",
    f"🏠 Most common room type: {top_room_type}",
    f"🏙️ Most popular neighbourhood: {top_neighbourhood}",
    f"📈 Average availability: {data['availability_365'].mean():.0f} days/year",
    f"⭐ Average reviews: {data['number_of_reviews'].mean():.1f}",
]
for insight in insight_lines:
    print(insight)


In [None]:
# Advanced Analysis: Price Insights
# Grouped price summaries, decile cut-offs and a correlation table over the
# numeric listing columns.
print("🔍 ADVANCED PRICE ANALYSIS")
print("="*40)

listing_prices = data['price']

# Price statistics by room type
room_type_stats = ['mean', 'median', 'std', 'min', 'max']
price_by_room = (
    data.groupby('room_type')['price']
    .agg(room_type_stats)
    .round(2)
)
print("\n💰 Price Statistics by Room Type:")
print(price_by_room)

# Price statistics by neighbourhood
area_stats = ['mean', 'median', 'count']
price_by_area = (
    data.groupby('neighbourhood_group')['price']
    .agg(area_stats)
    .round(2)
)
print("\n🏙️ Price Statistics by Neighbourhood Group:")
print(price_by_area)

# Find expensive and cheap listings: 90th and 10th percentile of price
expensive_threshold = listing_prices.quantile(0.9)
cheap_threshold = listing_prices.quantile(0.1)

print(f"\n📈 Premium listings (top 10%): ${expensive_threshold:.0f}+")
print(f"📉 Budget listings (bottom 10%): ${cheap_threshold:.0f} or less")

# Correlation analysis across the numeric columns
numeric_columns = ['price', 'number_of_reviews', 'availability_365', 'minimum_nights']
correlation_matrix = data[numeric_columns].corr()
print("\n🔗 Correlation Matrix:")
print(correlation_matrix.round(3))
