# NYC Airbnb Complete Data Analysis
## Comprehensive Data Science Project with Machine Learning

**Analysis Sections:**
1. Library Imports and Setup
2. Data Loading and Initial Inspection
3. Missing Values Analysis
4. Outlier Detection and Visualization
5. Outlier Removal and Data Cleaning
6. Exploratory Data Analysis - Basic Statistics
7. Exploratory Data Analysis - Visualizations
8. Feature Engineering for Machine Learning
9. K-means Clustering Analysis
10. Cluster Visualization and Analysis
11. Minimum Distance Classifier Implementation
12. Principal Component Analysis (PCA)
13. PCA Visualization and Feature Analysis
14. Machine Learning Models - Price Prediction
15. Model Visualization and Performance Analysis
16. Advanced Analytics and Insights
17. Final Summary and Conclusions

---
## 1. Library Imports and Setup
This cell gathers every import the notebook needs, applies the environment configuration, and opens with a clear topic comment.

In [None]:
# TOPIC: Library Imports and Setup
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Machine learning libraries
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix, silhouette_score

# Statistical analysis
from scipy import stats
from scipy.spatial.distance import cdist

# Utilities
import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Environment configuration
# Both pandas display options are set to None; matplotlib uses its default
# style and seaborn the "husl" palette.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)
plt.style.use('default')
sns.set_palette("husl")

# Confirmation banner followed by the versions of the core libraries
print("\n".join([
    "All libraries imported successfully!",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    f"Matplotlib version: {plt.matplotlib.__version__}",
]))

---
## 2. Data Loading and Initial Inspection
Loads the dataset and gives a first overview: its shape and columns, data types, general info, and basic statistics.

In [None]:
# TOPIC: Data Loading and Initial Inspection
# Load the listings CSV; when the file is not on disk, build a seeded synthetic
# dataset with the columns the later cells use, so the notebook still runs.
try:
    # Parse `last_review` while reading so the column has a datetime dtype on
    # this path too (the synthetic fallback builds it with pd.date_range).
    # Left as raw text, date min/max in the EDA cell can fail on missing entries.
    df = pd.read_csv('AB_NYC_2019.csv', parse_dates=['last_review'])
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Dataset file not found. Creating sample data for demonstration...")
    # Create comprehensive sample data (fixed seed -> reproducible values)
    np.random.seed(42)
    n_samples = 2000
    boroughs = ['Manhattan', 'Brooklyn', 'Queens', 'Bronx', 'Staten Island']
    room_types = ['Entire home/apt', 'Private room', 'Shared room']
    # NOTE: the random draws below run in dict order; reordering the keys
    # changes the generated values.
    df = pd.DataFrame({
        'id': range(1, n_samples + 1),
        'name': [f'Listing_{i}' for i in range(1, n_samples + 1)],
        'host_id': np.random.randint(1, 800, n_samples),
        'host_name': [f'Host_{i}' for i in np.random.randint(1, 800, n_samples)],
        'neighbourhood_group': np.random.choice(
            boroughs, n_samples, p=[0.35, 0.25, 0.2, 0.15, 0.05]),
        'neighbourhood': [f'Neighborhood_{i}' for i in np.random.randint(1, 150, n_samples)],
        'latitude': np.random.uniform(40.5, 40.9, n_samples),
        'longitude': np.random.uniform(-74.3, -73.7, n_samples),
        'room_type': np.random.choice(room_types, n_samples, p=[0.52, 0.45, 0.03]),
        'price': np.random.lognormal(4.5, 0.8, n_samples).astype(int),
        'minimum_nights': np.random.choice(
            [1, 2, 3, 7, 30], n_samples, p=[0.4, 0.3, 0.15, 0.1, 0.05]),
        'number_of_reviews': np.random.poisson(25, n_samples),
        'last_review': pd.date_range('2019-01-01', '2019-12-31', periods=n_samples),
        'reviews_per_month': np.random.uniform(0, 6, n_samples),
        'calculated_host_listings_count': np.random.poisson(3, n_samples),
        'availability_365': np.random.randint(0, 366, n_samples)
    })
    # Add realistic missing values. replace=False picks distinct rows, so
    # exactly 100 and 80 cells are blanked; sampling with replacement could
    # hit the same row twice and blank fewer than requested.
    missing_rate_rows = np.random.choice(df.index, 100, replace=False)
    df.loc[missing_rate_rows, 'reviews_per_month'] = np.nan
    missing_date_rows = np.random.choice(df.index, 80, replace=False)
    df.loc[missing_date_rows, 'last_review'] = pd.NaT
    print("Using comprehensive sample dataset")

# Dataset overview: dimensions, column names and per-column dtypes
n_rows, n_cols = df.shape
print(f"\nDataset Shape: {df.shape}")
print(f"Rows: {n_rows:,}, Columns: {n_cols}")
print(f"\nColumn Names: {df.columns.tolist()}")
print("\nData Types:")
print(df.dtypes)

# Preview of the first rows
print("\nFirst 5 rows:")
display(df.head())

# Column-level info (non-null counts, dtypes, memory usage)
print("\nDataset Info:")
df.info()

# Descriptive statistics
print("\nBasic Statistics:")
display(df.describe())

---
## 3. Missing Values Analysis
Detects missing values in every column, visualizes the missing-data patterns, and informs the strategy for handling them.

In [None]:
# TOPIC: Missing Values Analysis
print("COMPREHENSIVE MISSING VALUES ANALYSIS")
print("=" * 50)

# 1. Missing values count and percentage
# The per-column null counts are computed once and reused by the table,
# the totals and the bar chart below.
null_counts = df.isnull().sum()
total_nulls = null_counts.sum()

missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts,
    'Missing_Percentage': (null_counts / len(df)) * 100,
    'Data_Type': df.dtypes
})
# Keep only the columns that actually have gaps, worst first
missing_data = (
    missing_data.loc[missing_data['Missing_Count'] > 0]
    .sort_values('Missing_Count', ascending=False)
)
has_missing = len(missing_data) > 0

print("\nMissing Values Summary:")
if has_missing:
    display(missing_data)
    print(f"\nTotal columns with missing values: {len(missing_data)}")
    print(f"Total missing values: {total_nulls:,}")
    print(f"Percentage of dataset with missing values: {(total_nulls / (len(df) * len(df.columns))) * 100:.2f}%")
else:
    print("No missing values found in the dataset!")

# 2. Missing value visualization (skipped entirely when nothing is missing)
if has_missing:
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    fig.suptitle('Missing Values Analysis', fontsize=16, fontweight='bold')
    bar_ax, pie_ax = axes

    # Left panel: missing count per affected column
    missing_counts = null_counts[null_counts > 0]
    if len(missing_counts) > 0:
        missing_counts.plot(kind='bar', ax=bar_ax, color='coral')
        bar_ax.set_title('Missing Values Count by Column')
        bar_ax.set_xlabel('Columns')
        bar_ax.set_ylabel('Missing Count')
        bar_ax.tick_params(axis='x', rotation=45)

    # Right panel: share of rows with no gap at all versus rows with at least one
    complete_rows = len(df.dropna())
    incomplete_rows = len(df) - complete_rows

    labels = ['Complete Rows', 'Incomplete Rows']
    sizes = [complete_rows, incomplete_rows]
    colors = ['lightgreen', 'lightcoral']

    pie_ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title('Data Completeness Overview')

    plt.tight_layout()
    plt.show()

print("\n" + "=" * 60)
print("MISSING VALUES ANALYSIS COMPLETED")
print("=" * 60)

---
## 4. Outlier Detection and Visualization
Analyzes outliers with multiple statistical methods (IQR and Z-score), visualizes them, and assesses their impact on each column.

In [None]:
# TOPIC: Outlier Detection and Visualization
print("COMPREHENSIVE OUTLIER DETECTION ANALYSIS")
print("=" * 55)

# 1. Every numeric column is a candidate for the outlier analysis
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"\nNumerical columns identified for outlier analysis: {len(numerical_cols)}")
print(f"Columns: {numerical_cols}")

# 2. Two detection rules are compared per column:
#    IQR fences (1.5 * IQR beyond the quartiles) and |z-score| > 3
outlier_results = {}

print("\nOUTLIER DETECTION METHODS COMPARISON:")
print("=" * 45)

for col in numerical_cols:
    # A column without a single observed value has nothing to analyse
    if df[col].notna().sum() == 0:
        continue

    print(f"\nAnalyzing column: {col}")
    print("-" * 30)

    # Work on the observed values only
    col_data = df[col].dropna()
    n_values = len(col_data)

    # Method 1: IQR fences
    Q1, Q3 = col_data.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outside_fences = (col_data < lower_bound) | (col_data > upper_bound)
    iqr_outliers = col_data[outside_fences]
    n_iqr = len(iqr_outliers)

    # Method 2: absolute z-score above 3
    z_scores = np.abs(stats.zscore(col_data))
    zscore_outliers = col_data[z_scores > 3]
    n_zscore = len(zscore_outliers)

    # Keep the counts and fences for the summary table
    outlier_results[col] = {
        'total_values': n_values,
        'iqr_outliers': n_iqr,
        'zscore_outliers': n_zscore,
        'iqr_bounds': (lower_bound, upper_bound)
    }

    # Per-column report
    print(f"   Total values: {n_values:,}")
    print(f"   IQR Method: {n_iqr:,} outliers ({n_iqr/n_values*100:.2f}%)")
    print(f"      Bounds: [{lower_bound:.2f}, {upper_bound:.2f}]")
    print(f"   Z-Score Method: {n_zscore:,} outliers ({n_zscore/n_values*100:.2f}%)")
    print(f"   Statistics: Mean={col_data.mean():.2f}, Median={col_data.median():.2f}, Std={col_data.std():.2f}")

# 3. Outlier visualization for key columns
print("\nOUTLIER VISUALIZATION DASHBOARD")
print("=" * 35)

# One three-panel figure per column, limited to the first three numeric columns
for col in numerical_cols[:3]:
    # Nothing to draw when the column has no observed value
    if df[col].notna().sum() == 0:
        continue

    print(f"\nCreating visualizations for: {col}")

    col_data = df[col].dropna()
    mean_value = col_data.mean()
    median_value = col_data.median()

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    fig.suptitle(f'Outlier Analysis: {col}', fontsize=14, fontweight='bold')
    box_ax, hist_ax, zscore_ax = axes

    # Panel 1: box plot
    box_ax.boxplot(col_data, vert=True)
    box_ax.set_title('Box Plot')
    box_ax.set_ylabel('Values')
    box_ax.grid(True, alpha=0.3)

    # Panel 2: histogram with mean and median markers
    hist_ax.hist(col_data, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    hist_ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
    hist_ax.axvline(median_value, color='green', linestyle='--', label=f'Median: {median_value:.2f}')
    hist_ax.set_title('Distribution')
    hist_ax.set_xlabel('Values')
    hist_ax.set_ylabel('Frequency')
    hist_ax.legend()
    hist_ax.grid(True, alpha=0.3)

    # Panel 3: absolute z-scores by position, with reference lines at 2 and 3
    z_scores = np.abs(stats.zscore(col_data))
    zscore_ax.scatter(range(len(z_scores)), z_scores, alpha=0.6, s=1)
    zscore_ax.axhline(y=3, color='red', linestyle='--', label='Z-Score = 3')
    zscore_ax.axhline(y=2, color='orange', linestyle='--', label='Z-Score = 2')
    zscore_ax.set_title('Z-Score Analysis')
    zscore_ax.set_xlabel('Data Point Index')
    zscore_ax.set_ylabel('|Z-Score|')
    zscore_ax.legend()
    zscore_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# 4. Outlier summary
print("\nOUTLIER SUMMARY:")

# One row per analysed column, built from the counts stored in outlier_results
summary_columns = ['Column', 'Total_Values', 'IQR_Outliers', 'IQR_Percentage', 'Z_Score_Outliers']
summary_rows = [
    {
        'Column': column_name,
        'Total_Values': result['total_values'],
        'IQR_Outliers': result['iqr_outliers'],
        'IQR_Percentage': result['iqr_outliers'] / result['total_values'] * 100,
        'Z_Score_Outliers': result['zscore_outliers'],
    }
    for column_name, result in outlier_results.items()
]
outlier_summary = pd.DataFrame(summary_rows, columns=summary_columns)

display(outlier_summary.round(2))

print("\n" + "=" * 60)
print("OUTLIER DETECTION ANALYSIS COMPLETED")
print("=" * 60)

---
## 5. Outlier Removal and Data Cleaning
Removes or caps outliers systematically, compares the dataset before and after, validates data quality, and prepares the cleaned dataset.

In [None]:
# TOPIC: Outlier Removal and Data Cleaning
print("COMPREHENSIVE OUTLIER REMOVAL AND DATA CLEANING")
print("=" * 60)

# Create a copy of the original dataset for comparison
df_original = df.copy()
print(f"Original dataset shape: {df_original.shape}")

# 1. Define outlier removal strategy based on previous analysis
#    < 5% outliers  -> drop the outlying rows
#    5-10% outliers -> cap (winsorize) values at the IQR fences
#    >= 10%         -> leave the column untouched
print("\nOUTLIER REMOVAL STRATEGY:")
print("=" * 30)

# Per-column record of what was detected and which action was taken
removal_stats = {}

# Identifiers and coordinates are numeric but are not measurements: an IQR
# "outlier" there is only a large ID or a location far from the median, and
# dropping or capping it would discard valid listings or move them on the map.
# They are still reported below, but never modified.
non_measurement_cols = {'id', 'host_id', 'latitude', 'longitude'}

for col in numerical_cols:
    col_data = df[col].dropna()
    if col_data.empty:  # nothing observed in this column
        continue

    print(f"\nProcessing column: {col}")

    # Calculate IQR bounds from the observed values
    Q1 = col_data.quantile(0.25)
    Q3 = col_data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # NaN compares False on both sides, so missing values are never flagged
    is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
    outliers_before = int(is_outlier.sum())
    outlier_percentage = (outliers_before / len(df)) * 100

    print(f"   Outliers detected: {outliers_before} ({outlier_percentage:.2f}%)")
    print(f"   IQR bounds: [{lower_bound:.2f}, {upper_bound:.2f}]")

    # Decision logic for outlier removal
    if col in non_measurement_cols:
        removal_stats[col] = {
            'outliers_detected': outliers_before,
            'rows_removed': 0,
            'action': 'Kept - identifier/coordinate column',
            'bounds': (lower_bound, upper_bound)
        }

        print("   Action: KEPT - identifier/coordinate column")

    elif outlier_percentage < 5:  # Remove outliers if less than 5%
        rows_before = len(df)
        # Drop only the flagged rows. Filtering on "value inside the bounds"
        # would also drop every row whose value is missing, because NaN fails
        # both comparisons. .copy() yields an independent frame so the column
        # assignment in the capping branch is unambiguous.
        df = df.loc[~is_outlier].copy()
        rows_after = len(df)
        rows_removed = rows_before - rows_after

        removal_stats[col] = {
            'outliers_detected': outliers_before,
            'rows_removed': rows_removed,
            'removal_percentage': (rows_removed / rows_before) * 100,
            'action': 'Removed',
            'bounds': (lower_bound, upper_bound)
        }

        print(f"   Action: REMOVED {rows_removed} rows ({(rows_removed/rows_before)*100:.2f}%)")

    elif outlier_percentage < 10:  # Cap outliers if 5-10%
        # Winsorization - cap extreme values at the fences. clip() replaces the
        # whole column, so a fractional bound on an integer column is handled
        # by an ordinary upcast instead of an element-wise assignment of an
        # incompatible dtype. Missing values stay missing.
        outliers_capped = outliers_before
        df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

        removal_stats[col] = {
            'outliers_detected': outliers_before,
            'rows_removed': 0,
            'outliers_capped': outliers_capped,
            'action': 'Capped',
            'bounds': (lower_bound, upper_bound)
        }

        print(f"   Action: CAPPED {outliers_capped} values to bounds")

    else:  # Keep outliers if 10% or more
        removal_stats[col] = {
            'outliers_detected': outliers_before,
            'rows_removed': 0,
            'action': 'Kept - High percentage',
            'bounds': (lower_bound, upper_bound)
        }

        print(f"   Action: KEPT - Too many outliers ({outlier_percentage:.2f}%)")

print(f"\nDataset shape after outlier removal: {df.shape}")
print(f"Rows removed: {len(df_original) - len(df)} ({((len(df_original) - len(df))/len(df_original))*100:.2f}%)")

# 2. Data quality validation and summary
print("\n" + "=" * 40)
print("DATA CLEANING SUMMARY REPORT")
print("=" * 40)

# Row counts before and after cleaning, reused by the lines below
rows_initial = len(df_original)
rows_final = len(df)
rows_dropped = rows_initial - rows_final

print(f"\nOriginal dataset: {rows_initial:,} rows, {df_original.shape[1]} columns")
print(f"Cleaned dataset: {rows_final:,} rows, {df.shape[1]} columns")
print(f"Rows removed: {rows_dropped:,} ({(rows_dropped / rows_initial) * 100:.2f}%)")
print(f"Data retention rate: {(rows_final / rows_initial) * 100:.2f}%")

# Final data quality check: remaining gaps, duplicates, and column-type mix
numeric_column_count = len(df.select_dtypes(include=[np.number]).columns)
object_column_count = len(df.select_dtypes(include=['object']).columns)

print("\nFinal data quality check:")
print(f"   Missing values: {df.isnull().sum().sum()}")
print(f"   Duplicate rows: {df.duplicated().sum()}")
print(f"   Data types: {numeric_column_count} numerical, {object_column_count} categorical")

print("\nCleaned dataset ready for analysis and modeling!")
print("Key improvements:")
for improvement in (
    "Outliers handled systematically",
    "Statistical distributions improved",
    "Data quality validated",
):
    print(f"   - {improvement}")

print("\n" + "=" * 60)
print("OUTLIER REMOVAL AND DATA CLEANING COMPLETED")
print("=" * 60)

---
## 6. Exploratory Data Analysis - Basic Statistics
Statistical analysis of the listings by category, covering price patterns and market insights.

In [None]:
# TOPIC: Exploratory Data Analysis - Basic Statistics
print("COMPREHENSIVE EXPLORATORY DATA ANALYSIS - BASIC STATISTICS")
print("=" * 70)

# 1. Overall dataset statistics
print("\nOVERALL DATASET STATISTICS:")
print("=" * 35)

# `last_review` may arrive as text with missing entries. min()/max() on such
# an object column can raise TypeError because strings get compared with the
# float NaN placeholder. Converting to datetime first avoids that: values that
# cannot be parsed become NaT, which min()/max() skip. The conversion is a
# no-op when the column is already datetime.
review_dates = pd.to_datetime(df['last_review'], errors='coerce')

print(f"Total listings: {len(df):,}")
print(f"Total hosts: {df['host_id'].nunique():,}")
print(f"Date range: {review_dates.min()} to {review_dates.max()}")
print(f"Price range: ${df['price'].min()} - ${df['price'].max()}")
print(f"Average price: ${df['price'].mean():.2f}")
print(f"Median price: ${df['price'].median():.2f}")

# 2. Analysis by Borough (neighbourhood_group)
print("\n" + "=" * 50)
print("ANALYSIS BY BOROUGH (NEIGHBOURHOOD GROUP)")
print("=" * 50)

# Statistics to compute for each borough, keyed by source column
borough_aggregations = {
    'price': ['count', 'mean', 'median', 'std', 'min', 'max'],
    'number_of_reviews': ['mean', 'median'],
    'availability_365': ['mean', 'median'],
    'calculated_host_listings_count': ['mean', 'median']
}
borough_stats = df.groupby('neighbourhood_group').agg(borough_aggregations).round(2)

# Collapse the two-level (column, statistic) header into "column_statistic"
borough_stats.columns = [
    '_'.join(level_names).strip()
    for level_names in borough_stats.columns.to_flat_index()
]

print("\nBorough Statistics Summary:")
display(borough_stats)

# Market share by borough: absolute listing counts and their share in percent
borough_column = df['neighbourhood_group']
borough_market_share = borough_column.value_counts()
borough_market_share_pct = borough_column.value_counts(normalize=True) * 100

print("\nMarket Share by Borough:")
market_share_df = pd.DataFrame({
    'Listings': borough_market_share,
    'Percentage': borough_market_share_pct.round(2)
})
display(market_share_df)

# 3. Analysis by Room Type
print("\n" + "=" * 40)
print("ANALYSIS BY ROOM TYPE")
print("=" * 40)

# Statistics to compute for each room type, keyed by source column
room_type_aggregations = {
    'price': ['count', 'mean', 'median', 'std', 'min', 'max'],
    'number_of_reviews': ['mean', 'median'],
    'availability_365': ['mean', 'median'],
    'minimum_nights': ['mean', 'median']
}
room_type_stats = df.groupby('room_type').agg(room_type_aggregations).round(2)

# Collapse the two-level (column, statistic) header into "column_statistic"
room_type_stats.columns = [
    '_'.join(level_names).strip()
    for level_names in room_type_stats.columns.to_flat_index()
]

print("\nRoom Type Statistics Summary:")
display(room_type_stats)

# Market share by room type: absolute listing counts and their share in percent
room_type_column = df['room_type']
room_type_market_share = room_type_column.value_counts()
room_type_market_share_pct = room_type_column.value_counts(normalize=True) * 100

print("\nMarket Share by Room Type:")
room_market_share_df = pd.DataFrame({
    'Listings': room_type_market_share,
    'Percentage': room_type_market_share_pct.round(2)
})
display(room_market_share_df)

# 4. Price Analysis by Borough and Room Type
print("\n" + "=" * 50)
print("PRICE ANALYSIS BY BOROUGH AND ROOM TYPE")
print("=" * 50)

# Both tables share the same row (borough) and column (room type) keys
borough_key = df['neighbourhood_group']
room_type_key = df['room_type']

# Mean price for every borough / room-type combination
price_crosstab = pd.crosstab(
    borough_key,
    room_type_key,
    values=df['price'],
    aggfunc='mean',
).round(2)

print("\nAverage Price by Borough and Room Type:")
display(price_crosstab)

# Number of listings for every borough / room-type combination
count_crosstab = pd.crosstab(borough_key, room_type_key)

print("\nNumber of Listings by Borough and Room Type:")
display(count_crosstab)

# 5. Host Analysis
print("\n" + "=" * 30)
print("HOST ANALYSIS")
print("=" * 30)

# Per-host aggregates: listing count, mean/median price and total reviews
host_stats = (
    df.groupby('host_id')
    .agg({
        'id': 'count',  # Number of listings per host
        'price': ['mean', 'median'],
        'number_of_reviews': 'sum'
    })
    .round(2)
)
host_stats.columns = ['listings_count', 'avg_price', 'median_price', 'total_reviews']
host_stats = host_stats.reset_index()

total_hosts = len(host_stats)
listings_per_host = host_stats['listings_count']

print(f"\nTotal unique hosts: {total_hosts:,}")
print(f"Average listings per host: {listings_per_host.mean():.2f}")
print(f"Median listings per host: {listings_per_host.median():.2f}")
print(f"Max listings by single host: {listings_per_host.max()}")

# The ten hosts with the most listings
top_hosts = host_stats.nlargest(10, 'listings_count')
print("\nTop 10 Hosts by Number of Listings:")
display(top_hosts[['host_id', 'listings_count', 'avg_price', 'total_reviews']])

# How many hosts own 1, 2, 3, ... listings (first ten distinct counts only)
host_distribution = listings_per_host.value_counts().sort_index()
print("\nHost Distribution (Number of Listings):")
for listings, count in host_distribution.head(10).items():
    percentage = (count / total_hosts) * 100
    print(f"   {listings} listing(s): {count:,} hosts ({percentage:.2f}%)")

# 6. Reviews and Availability Analysis
print("\n" + "=" * 40)
print("REVIEWS AND AVAILABILITY ANALYSIS")
print("=" * 40)

total_listings = len(df)
review_counts = df['number_of_reviews']
available_days = df['availability_365']

# Reviews statistics
unreviewed_listings = (review_counts == 0).sum()
print("\nReviews Statistics:")
print(f"   Total reviews across all listings: {review_counts.sum():,}")
print(f"   Average reviews per listing: {review_counts.mean():.2f}")
print(f"   Median reviews per listing: {review_counts.median():.2f}")
print(f"   Listings with no reviews: {unreviewed_listings:,} ({(unreviewed_listings/total_listings*100):.2f}%)")

# Availability statistics, including the two extremes (365 days and 0 days)
fully_available = (available_days == 365).sum()
never_available = (available_days == 0).sum()
print("\nAvailability Statistics:")
print(f"   Average availability per year: {available_days.mean():.2f} days")
print(f"   Median availability per year: {available_days.median():.2f} days")
print(f"   Fully available listings (365 days): {fully_available:,} ({(fully_available/total_listings*100):.2f}%)")
print(f"   Unavailable listings (0 days): {never_available:,} ({(never_available/total_listings*100):.2f}%)")

# 7. Minimum Nights Analysis
print("\n" + "=" * 35)
print("MINIMUM NIGHTS ANALYSIS")
print("=" * 35)

minimum_nights_column = df['minimum_nights']
listing_total = len(df)

# Frequency of each minimum-stay value, ordered by number of nights;
# only the ten smallest values are printed
min_nights_stats = minimum_nights_column.value_counts().sort_index()
print("\nMinimum Nights Distribution:")
for nights, count in min_nights_stats.head(10).items():
    percentage = (count / listing_total) * 100
    print(f"   {nights} night(s): {count:,} listings ({percentage:.2f}%)")

print(f"\nAverage minimum nights: {minimum_nights_column.mean():.2f}")
print(f"Median minimum nights: {minimum_nights_column.median():.2f}")
print(f"Maximum minimum nights: {minimum_nights_column.max()}")

# 8. Geographic Distribution Analysis
print("\n" + "=" * 40)
print("GEOGRAPHIC DISTRIBUTION ANALYSIS")
print("=" * 40)

# Per-neighbourhood listing count, mean/median price and mean review count
neighborhood_stats = (
    df.groupby('neighbourhood')
    .agg({
        'price': ['count', 'mean', 'median'],
        'number_of_reviews': 'mean'
    })
    .round(2)
)
neighborhood_stats.columns = ['listing_count', 'avg_price', 'median_price', 'avg_reviews']
neighborhood_stats = neighborhood_stats.reset_index()

# The fifteen neighbourhoods with the most listings
top_neighborhoods = neighborhood_stats.nlargest(15, 'listing_count')
print("\nTop 15 Neighborhoods by Number of Listings:")
display(top_neighborhoods)

# Extent and centre (mean latitude / longitude) of the listings
latitudes = df['latitude']
longitudes = df['longitude']
print("\nGeographic Coordinates Statistics:")
print(f"   Latitude range: {latitudes.min():.4f} to {latitudes.max():.4f}")
print(f"   Longitude range: {longitudes.min():.4f} to {longitudes.max():.4f}")
print(f"   Geographic center: ({latitudes.mean():.4f}, {longitudes.mean():.4f})")

# 9. Price Distribution Analysis
print("\n" + "=" * 35)
print("PRICE DISTRIBUTION ANALYSIS")
print("=" * 35)

# Price percentiles
price_percentiles = df['price'].quantile([0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
print("\nPrice Percentiles:")
for percentile, price in price_percentiles.items():
    print(f"   {percentile*100:2.0f}th percentile: ${price:.2f}")

# Price categories
# pd.cut builds right-closed intervals, so without include_lowest the first
# bin is (0, 50] and a listing priced exactly 0 falls outside every bin. It
# becomes NaN and silently disappears from the distribution below.
# include_lowest=True closes the first interval on the left so such listings
# are counted as 'Budget'. Interval edges are otherwise unchanged: a price
# equal to an upper edge (50, 100, 200, 500) belongs to the lower category.
price_categories = pd.cut(df['price'],
                         bins=[0, 50, 100, 200, 500, float('inf')],
                         labels=['Budget (<$50)', 'Economy ($50-100)', 'Mid-range ($100-200)',
                                'Premium ($200-500)', 'Luxury (>$500)'],
                         include_lowest=True)

price_category_stats = price_categories.value_counts()
price_category_pct = price_categories.value_counts(normalize=True) * 100

print("\nPrice Category Distribution:")
price_cat_df = pd.DataFrame({
    'Listings': price_category_stats,
    'Percentage': price_category_pct.round(2)
})
display(price_cat_df)

# 10. Summary Statistics Table
print("\n" + "=" * 40)
print("COMPREHENSIVE SUMMARY STATISTICS")
print("=" * 40)

# Descriptive statistics for the main numeric variables
summary_variables = ['price', 'number_of_reviews', 'availability_365',
                     'minimum_nights', 'calculated_host_listings_count']
summary_stats = df[summary_variables].describe()

print("\nNumerical Variables Summary:")
display(summary_stats.round(2))

# Categorical variables summary: cardinality, most frequent value and its count
print("\nCategorical Variables Summary:")
categorical_variables = ['neighbourhood_group', 'room_type', 'neighbourhood']
categorical_summary = pd.DataFrame({
    'Variable': categorical_variables,
    'Unique_Values': [df[name].nunique() for name in categorical_variables],
    'Most_Common': [df[name].mode().iloc[0] for name in categorical_variables],
    'Most_Common_Count': [df[name].value_counts().iloc[0] for name in categorical_variables]
})

display(categorical_summary)

print("\n" + "=" * 60)
print("EXPLORATORY DATA ANALYSIS - BASIC STATISTICS COMPLETED")
print("=" * 60)