In [6]:
# ==================== CELL 0: CREATE FOLDER STRUCTURE ====================
"""
SETUP: Create all necessary folders for the project
JUSTIFICATION: Ensures all output directories exist before saving files
NOTE: Creates folders in PROJECT ROOT, not in notebooks folder
"""

import os
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

def resolve_project_root(current_dir):
    """Return the project root for a notebook started in ``current_dir``.

    If the final path component is exactly ``notebooks``, the parent directory
    is returned; otherwise ``current_dir`` is returned unchanged.  Comparing
    the last component (instead of ``str.endswith``) keeps directories such as
    ``old_notebooks`` from being mistaken for the notebooks folder.
    """
    trimmed = os.path.normpath(current_dir)  # drops any trailing separator
    if os.path.basename(trimmed) == 'notebooks':
        return os.path.dirname(trimmed)
    return current_dir


# Get project root directory (go up one level from notebooks folder)
notebook_dir = os.getcwd()
print(f"Current notebook location: {notebook_dir}")

project_root = resolve_project_root(notebook_dir)
print(f"Project root: {project_root}")

# Change working directory to project root so every relative path used by the
# later cells ('data/...', 'models/...', 'visualizations/...') resolves there.
os.chdir(project_root)
print(f"Working directory changed to: {os.getcwd()}\n")

# Create folder structure (now relative to project root)
folders = [
    'data/raw',
    'data/processed',
    'visualizations/eda',
    'visualizations/modeling',
    'visualizations/clustering',
    'models',
    'reports',
    'config',
    'src'
]

for folder in folders:
    # exist_ok=True makes this cell safe to re-run
    os.makedirs(folder, exist_ok=True)
    print(f"✓ Created/Verified folder: {folder}")

print("\n" + "="*80)
print("PROJECT SETUP COMPLETE")
print("="*80)
print(f"\nAll folders created in: {project_root}")
print("\nFolder structure verified:")
print("""
Airbnb_Analysis_Project/
├── data/
│   ├── raw/
│   └── processed/
├── visualizations/
│   ├── eda/
│   ├── modeling/
│   └── clustering/
├── models/
├── reports/
├── config/
└── src/
""")

Current notebook location: /Users/macbookpro/Documents/more_projects/Airbnb_Analysis_Project
Project root: /Users/macbookpro/Documents/more_projects/Airbnb_Analysis_Project
Working directory changed to: /Users/macbookpro/Documents/more_projects/Airbnb_Analysis_Project

✓ Created/Verified folder: data/raw
✓ Created/Verified folder: data/processed
✓ Created/Verified folder: visualizations/eda
✓ Created/Verified folder: visualizations/modeling
✓ Created/Verified folder: visualizations/clustering
✓ Created/Verified folder: models
✓ Created/Verified folder: reports
✓ Created/Verified folder: config
✓ Created/Verified folder: src

PROJECT SETUP COMPLETE

All folders created in: /Users/macbookpro/Documents/more_projects/Airbnb_Analysis_Project

Folder structure verified:

Airbnb_Analysis_Project/
├── data/
│   ├── raw/
│   └── processed/
├── visualizations/
│   ├── eda/
│   ├── modeling/
│   └── clustering/
├── models/
├── reports/
├── config/
└── src/



In [7]:
# Load the raw listings file; the path is relative to the current working directory.
raw_csv_path = 'data/raw/MinoAI_dataset.csv'
df = pd.read_csv(raw_csv_path)

n_rows, n_cols = df.shape
print("\n✓ Dataset loaded successfully!")
print(f"Shape: {n_rows:,} rows × {n_cols} columns")


✓ Dataset loaded successfully!
Shape: 48,895 rows × 16 columns


In [8]:
# ==================== CELL 2: MISSING VALUES ANALYSIS ====================
# Per-column null counts and the share of all rows they represent.
missing = df.isnull().sum()
missing_pct = missing.div(len(df)).mul(100)

missing_table = pd.DataFrame({
    'Column': missing.index,
    'Missing_Count': missing.values,
    'Missing_Percentage': missing_pct.values
})
# Keep only the columns that actually have gaps, largest count first.
has_gaps = missing_table['Missing_Count'] > 0
missing_df = missing_table[has_gaps].sort_values('Missing_Count', ascending=False)

print("\nMissing Values Summary:")
print(missing_df.to_string(index=False))

# SAVE: Missing values visualization
# Ascending order here, so the last (largest) row is drawn at the top of the chart.
missing_df_sorted = missing_df.sort_values('Missing_Percentage', ascending=True)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(
    missing_df_sorted['Column'],
    missing_df_sorted['Missing_Percentage'],
    color='coral',
    edgecolor='black',
)
ax.set_xlabel('Percentage of Missing Data (%)', fontsize=11, fontweight='bold')
ax.set_title('Missing Values Analysis', fontsize=13, fontweight='bold')
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()

missing_plot_path = 'visualizations/eda/01_missing_values.png'
plt.savefig(missing_plot_path, dpi=300, bbox_inches='tight')
print("\n✓ Saved: visualizations/eda/01_missing_values.png")
plt.close()


Missing Values Summary:
           Column  Missing_Count  Missing_Percentage
reviews_per_month          10052           20.558339
      last_review          10052           20.558339
        host_name             21            0.042949
             name             16            0.032723

✓ Saved: visualizations/eda/01_missing_values.png


In [9]:
# ==================== CELL 3: DESCRIPTIVE STATISTICS ====================
# describe() output for the raw frame, rounded to two decimals.
numeric_summary = df.describe().round(2)

print("\nDescriptive Statistics Summary:")
print(numeric_summary)



Descriptive Statistics Summary:
                id       host_id  latitude  longitude     price  \
count     48895.00  4.889500e+04  48895.00   48895.00  48895.00   
mean   19017143.24  6.762001e+07     40.73     -73.95    152.72   
std    10983108.39  7.861097e+07      0.05       0.05    240.15   
min        2539.00  2.438000e+03     40.50     -74.24      0.00   
25%     9471945.00  7.822033e+06     40.69     -73.98     69.00   
50%    19677284.00  3.079382e+07     40.72     -73.96    106.00   
75%    29152178.50  1.074344e+08     40.76     -73.94    175.00   
max    36487245.00  2.743213e+08     40.91     -73.71  10000.00   

       minimum_nights  number_of_reviews  reviews_per_month  \
count        48895.00           48895.00           38843.00   
mean             7.03              23.27               1.37   
std             20.51              44.55               1.68   
min              1.00               0.00               0.01   
25%              1.00               1.00        

In [10]:
# ==================== CELL 4: DATA CLEANING ====================
# Start from a copy so the raw frame `df` is never modified and this cell
# produces the same result every time it is re-run.
df_clean = df.copy()

# Remove invalid prices
initial_rows = len(df_clean)
df_clean = df_clean[df_clean['price'] > 0]
print(f"\nData Cleaning:")
print(f"  Removed {initial_rows - len(df_clean)} rows with invalid prices")

# Remove negative minimum nights
initial_rows = len(df_clean)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the previous frame.
df_clean = df_clean[df_clean['minimum_nights'] >= 0].copy()
print(f"  Removed {initial_rows - len(df_clean)} rows with negative minimum nights")

# Fill missing reviews_per_month with the column median.
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# selected column: the in-place form operates on an intermediate object and is
# not guaranteed to update `df_clean` (it does not under pandas Copy-on-Write).
median_rpm = df_clean['reviews_per_month'].median()
df_clean['reviews_per_month'] = df_clean['reviews_per_month'].fillna(median_rpm)

# Fill missing last_review with a sentinel string
df_clean['last_review'] = df_clean['last_review'].fillna('No Review')

print(f"  Cleaned dataset: {df_clean.shape[0]:,} rows")

# Save cleaned data
df_clean.to_csv('data/processed/data_cleaned.csv', index=False)
print(f"\n✓ Saved: data/processed/data_cleaned.csv")




Data Cleaning:
  Removed 11 rows with invalid prices
  Removed 0 rows with negative minimum nights
  Cleaned dataset: 48,884 rows

✓ Saved: data/processed/data_cleaned.csv


In [11]:
# ==================== CELL 5: UNIVARIATE ANALYSIS ====================
print("\nGenerating Univariate Analysis Visualizations...")


def _below_95th_percentile(frame, column):
    """Return the rows of ``frame`` whose ``column`` value is at or below its 95th percentile."""
    cutoff = frame[column].quantile(0.95)
    return frame[frame[column] <= cutoff]


def _histogram_panel(position, values, bins, color, title, xlabel):
    """Draw one histogram in slot ``position`` of the 3x3 grid and return its Axes."""
    panel = plt.subplot(3, 3, position)
    panel.hist(values, bins=bins, color=color, edgecolor='black', alpha=0.7)
    panel.set_title(title, fontweight='bold')
    panel.set_xlabel(xlabel)
    panel.set_ylabel('Frequency')
    return panel


fig = plt.figure(figsize=(16, 12))

# Panel 1: full price distribution with mean / median reference lines
price_mean = df_clean['price'].mean()
price_median = df_clean['price'].median()
price_panel = _histogram_panel(1, df_clean['price'], 100, 'steelblue',
                               'Price Distribution (All Data)', 'Price ($)')
price_panel.axvline(price_mean, color='red', linestyle='--', linewidth=2,
                    label=f'Mean: ${price_mean:.0f}')
price_panel.axvline(price_median, color='green', linestyle='--', linewidth=2,
                    label=f'Median: ${price_median:.0f}')
price_panel.legend()

# Panel 2: price with the top 5% removed
price_filtered = _below_95th_percentile(df_clean, 'price')
_histogram_panel(2, price_filtered['price'], 50, 'coral',
                 'Price Distribution (Excluding Top 5%)', 'Price ($)')

# Panel 3: price box plot
box_panel = plt.subplot(3, 3, 3)
box_panel.boxplot(df_clean['price'], vert=True)
box_panel.set_title('Price Box Plot', fontweight='bold')
box_panel.set_ylabel('Price ($)')
box_panel.grid(alpha=0.3)

# Panels 4-8: remaining numeric columns, all drawn with 50 bins
min_nights_filtered = _below_95th_percentile(df_clean, 'minimum_nights')
rpm_data = df_clean['reviews_per_month'].dropna()
host_count_filtered = _below_95th_percentile(df_clean, 'calculated_host_listings_count')

remaining_histograms = [
    # (grid slot, values, colour, title, x label)
    (4, min_nights_filtered['minimum_nights'], 'lightgreen',
     'Minimum Nights (Excluding Top 5%)', 'Days'),
    (5, df_clean['number_of_reviews'], 'purple',
     'Number of Reviews Distribution', 'Number of Reviews'),
    (6, rpm_data, 'teal',
     'Reviews Per Month Distribution', 'Reviews/Month'),
    (7, df_clean['availability_365'], 'darkgreen',
     'Availability (365 Days)', 'Days Available'),
    (8, host_count_filtered['calculated_host_listings_count'], 'darkred',
     'Host Listings Count (Excluding Top 5%)', 'Number of Listings'),
]
for slot, values, colour, title, xlabel in remaining_histograms:
    _histogram_panel(slot, values, 50, colour, title, xlabel)

# Panel 9: listing counts per room type
room_counts = df_clean['room_type'].value_counts()
bar_positions = range(len(room_counts))
room_panel = plt.subplot(3, 3, 9)
room_panel.bar(bar_positions, room_counts.values,
               color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A'], edgecolor='black')
room_panel.set_xticks(bar_positions)
room_panel.set_xticklabels(room_counts.index, rotation=45, ha='right')
room_panel.set_title('Room Type Distribution', fontweight='bold')
room_panel.set_ylabel('Count')

plt.tight_layout()
plt.savefig('visualizations/eda/02_univariate_analysis.png', dpi=300, bbox_inches='tight')
print("✓ Saved: visualizations/eda/02_univariate_analysis.png")
plt.close()



Generating Univariate Analysis Visualizations...
✓ Saved: visualizations/eda/02_univariate_analysis.png


In [12]:
# ==================== CELL 6: BIVARIATE ANALYSIS ====================
print("\nGenerating Bivariate Analysis Visualizations...")

SCATTER_SAMPLE_SIZE = 5000  # upper bound on points drawn per sampled scatter panel
SCATTER_SAMPLE_SEED = 42    # fixed seed so the saved figure is reproducible


def _sample_for_scatter(frame, max_rows=SCATTER_SAMPLE_SIZE, seed=SCATTER_SAMPLE_SEED):
    """Return at most ``max_rows`` randomly chosen rows of ``frame``.

    The sample size is capped by the length of ``frame`` itself, so sampling a
    filtered subset never requests more rows than the subset contains
    (``DataFrame.sample`` raises ``ValueError`` in that case when sampling
    without replacement).
    """
    return frame.sample(n=min(max_rows, len(frame)), random_state=seed)


fig = plt.figure(figsize=(16, 12))

# Price by room type
ax1 = plt.subplot(2, 3, 1)
df_clean.boxplot(column='price', by='room_type', ax=ax1)
ax1.set_title('Price Distribution by Room Type', fontweight='bold')
ax1.set_xlabel('Room Type')
ax1.set_ylabel('Price ($)')
# DataFrame.boxplot(by=...) adds a figure-level suptitle; blank it out
ax1.get_figure().suptitle('')

# Price by neighbourhood
ax2 = plt.subplot(2, 3, 2)
df_clean.boxplot(column='price', by='neighbourhood_group', ax=ax2)
ax2.set_title('Price Distribution by Neighbourhood Group', fontweight='bold')
ax2.set_xlabel('Neighbourhood Group')
ax2.set_ylabel('Price ($)')
ax2.get_figure().suptitle('')

# Price vs minimum nights (points coloured by availability)
ax3 = plt.subplot(2, 3, 3)
sample = _sample_for_scatter(df_clean)
scatter = ax3.scatter(sample['minimum_nights'], sample['price'], alpha=0.4, s=20, c=sample['availability_365'], cmap='viridis')
ax3.set_title('Price vs Minimum Nights', fontweight='bold')
ax3.set_xlabel('Minimum Nights')
ax3.set_ylabel('Price ($)')
# Axis limits clip the view to 0-500 on both axes; points outside are not shown
ax3.set_xlim(0, 500)
ax3.set_ylim(0, 500)
plt.colorbar(scatter, ax=ax3, label='Availability')

# Price vs reviews per month
ax4 = plt.subplot(2, 3, 4)
sample_rpm = _sample_for_scatter(df_clean[df_clean['reviews_per_month'].notna()])
ax4.scatter(sample_rpm['reviews_per_month'], sample_rpm['price'], alpha=0.4, s=20, color='orange')
ax4.set_title('Price vs Reviews Per Month', fontweight='bold')
ax4.set_xlabel('Reviews/Month')
ax4.set_ylabel('Price ($)')
ax4.set_ylim(0, 500)

# Price vs host listings (hosts with at most 50 listings)
ax5 = plt.subplot(2, 3, 5)
sample_host = _sample_for_scatter(df_clean[df_clean['calculated_host_listings_count'] <= 50])
ax5.scatter(sample_host['calculated_host_listings_count'], sample_host['price'], alpha=0.4, s=20, color='green')
ax5.set_title('Price vs Host Listings Count', fontweight='bold')
ax5.set_xlabel('Host Listings Count')
ax5.set_ylabel('Price ($)')
ax5.set_ylim(0, 500)

# Geographic distribution (all rows, not sampled)
ax6 = plt.subplot(2, 3, 6)
scatter_geo = ax6.scatter(df_clean['longitude'], df_clean['latitude'], alpha=0.3, s=5, c=df_clean['price'], cmap='plasma')
ax6.set_title('Geographic Distribution (Colored by Price)', fontweight='bold')
ax6.set_xlabel('Longitude')
ax6.set_ylabel('Latitude')
plt.colorbar(scatter_geo, ax=ax6, label='Price ($)')

plt.tight_layout()
plt.savefig('visualizations/eda/03_bivariate_analysis.png', dpi=300, bbox_inches='tight')
print("✓ Saved: visualizations/eda/03_bivariate_analysis.png")
plt.close()




Generating Bivariate Analysis Visualizations...
✓ Saved: visualizations/eda/03_bivariate_analysis.png


In [13]:
# ==================== CELL 7: CORRELATION ANALYSIS ====================
print("\nGenerating Correlation Analysis Visualization...")

# Pairwise correlations across the numeric columns of the cleaned frame.
numeric_df = df_clean.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

heatmap_options = dict(
    annot=True,
    cmap='coolwarm',
    center=0,
    fmt='.2f',
    square=True,
    linewidths=0.5,
    cbar_kws={'label': 'Correlation'},
)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, ax=ax, **heatmap_options)
ax.set_title('Correlation Matrix - All Numeric Variables', fontweight='bold', fontsize=13)
plt.tight_layout()

heatmap_path = 'visualizations/eda/04_correlation_heatmap.png'
plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
print("✓ Saved: visualizations/eda/04_correlation_heatmap.png")
plt.close()



Generating Correlation Analysis Visualization...
✓ Saved: visualizations/eda/04_correlation_heatmap.png


In [14]:
# ==================== CELL 8: FEATURE ENGINEERING ====================
print("\nPerforming Feature Engineering...")

# Independent copy: every column added below lands on df_engineered only.
df_engineered = df_clean.copy()

# Feature 1: Has reviews
df_engineered['has_reviews'] = (df_engineered['number_of_reviews'] > 0).astype(int)

# Feature 2: Days since review
# errors='coerce' turns unparseable values into NaT, which then yields a
# missing day count that is replaced with the sentinel -1.
df_engineered['last_review_date'] = pd.to_datetime(df_engineered['last_review'], errors='coerce')
reference_date = pd.Timestamp('2024-12-31')
days_since_review = (reference_date - df_engineered['last_review_date']).dt.days
# Assign the filled Series back rather than calling fillna(inplace=True) on a
# selected column, which is not guaranteed to update the parent frame.
df_engineered['days_since_review'] = days_since_review.fillna(-1)

# Feature 3: Price category
price_bins = [0, 50, 100, 200, 500, float('inf')]
price_labels = ['Budget', 'Economy', 'Mid-Range', 'Premium', 'Luxury']
df_engineered['price_category'] = pd.cut(df_engineered['price'], bins=price_bins, labels=price_labels, include_lowest=True)

# Feature 4: Availability level
df_engineered['availability_level'] = pd.cut(df_engineered['availability_365'],
                                              bins=[0, 30, 100, 300, 365],
                                              labels=['Low', 'Medium-Low', 'Medium-High', 'High'],
                                              include_lowest=True)

# Feature 5: Host experience
df_engineered['host_experience'] = pd.cut(df_engineered['calculated_host_listings_count'],
                                           bins=[0, 1, 3, 10, float('inf')],
                                           labels=['New', 'Growing', 'Established', 'Power'],
                                           include_lowest=True)

# Feature 6: Popularity score (+1 in the denominator avoids division by zero)
df_engineered['popularity_score'] = df_engineered['number_of_reviews'] / (df_engineered['calculated_host_listings_count'] + 1)

# Feature 7: Room type encoded (codes follow the sorted order of the labels)
room_type_map = {room: idx for idx, room in enumerate(sorted(df_engineered['room_type'].unique()))}
df_engineered['room_type_encoded'] = df_engineered['room_type'].map(room_type_map)

# Feature 8: Is entire home
df_engineered['is_entire_home'] = (df_engineered['room_type'] == 'Entire home/apt').astype(int)

# Feature 9: Host multi-listing
df_engineered['host_multi_listing'] = (df_engineered['calculated_host_listings_count'] > 1).astype(int)

# Feature 10: Neighbourhood encoded
# NOTE(review): unlike room_type_map, these codes follow order of first
# appearance in the data, so they depend on row order.
neighbourhood_map = {ng: idx for idx, ng in enumerate(df_engineered['neighbourhood_group'].unique())}
df_engineered['neighbourhood_encoded'] = df_engineered['neighbourhood_group'].map(neighbourhood_map)

print(f"✓ Created 10 engineered features")
print(f"  Original columns: {df_clean.shape[1]}")
# The total also counts the helper column 'last_review_date'.
print(f"  New columns: {df_engineered.shape[1]}")

# Save engineered data
df_engineered.to_csv('data/processed/data_engineered.csv', index=False)
print(f"✓ Saved: data/processed/data_engineered.csv")



Performing Feature Engineering...
✓ Created 10 engineered features
  Original columns: 16
  New columns: 27
✓ Saved: data/processed/data_engineered.csv


In [15]:
# ==================== CELL 9: MODELING PREPARATION ====================
print("\nPreparing Data for Modeling...")

modeling_features = [
    'minimum_nights',
    'number_of_reviews',
    'availability_365',
    'reviews_per_month',
    'calculated_host_listings_count',
    'latitude',
    'longitude',
    'room_type_encoded',
    'neighbourhood_encoded',
    'has_reviews',
    'days_since_review',
    'popularity_score',
    'is_entire_home',
    'host_multi_listing',
]
target_column = 'price'

# Drop any row with a missing value in a feature or in the target.
df_model = df_engineered[modeling_features + [target_column]].dropna()
X = df_model[modeling_features]
y = df_model[target_column]

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"✓ Data prepared: {n_train:,} training, {n_test:,} test samples")



Preparing Data for Modeling...
✓ Data prepared: 39,107 training, 9,777 test samples


In [16]:
# ==================== CELL 10: BUILD MODELS ====================
print("\nBuilding Predictive Models...")


def _regression_scores(y_true, y_pred):
    """Return ``(R², RMSE, MAE)`` for predictions ``y_pred`` against ``y_true``."""
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    return r2, rmse, mae


def _save_pickle(obj, path):
    """Pickle ``obj`` to ``path``, overwriting any existing file."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


model_results = {}

# Linear Regression (fitted on the standardized features)
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)
y_pred_test_lr = lr_model.predict(X_test_scaled)
r2_test_lr, rmse_test_lr, mae_test_lr = _regression_scores(y_test, y_pred_test_lr)

model_results['Linear Regression'] = {
    'Test_R2': r2_test_lr,
    'Test_RMSE': rmse_test_lr,
    'Test_MAE': mae_test_lr,
    'Predictions': y_pred_test_lr,
    # Stored like the other models so the best-model lookup below cannot
    # raise KeyError if Linear Regression scores highest.
    'Model': lr_model
}

# Save Linear Regression model
_save_pickle(lr_model, 'models/linear_regression_model.pkl')

print(f"✓ Linear Regression - R²: {r2_test_lr:.4f}")
print(f"  Saved: models/linear_regression_model.pkl")

# Random Forest (fitted on the unscaled features)
rf_model = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_test_rf = rf_model.predict(X_test)
r2_test_rf, rmse_test_rf, mae_test_rf = _regression_scores(y_test, y_pred_test_rf)

model_results['Random Forest'] = {
    'Test_R2': r2_test_rf,
    'Test_RMSE': rmse_test_rf,
    'Test_MAE': mae_test_rf,
    'Predictions': y_pred_test_rf,
    'Model': rf_model
}

# Save Random Forest model
_save_pickle(rf_model, 'models/random_forest_model.pkl')

print(f"✓ Random Forest - R²: {r2_test_rf:.4f}")
print(f"  Saved: models/random_forest_model.pkl")

# Gradient Boosting (fitted on the unscaled features)
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_test_gb = gb_model.predict(X_test)
r2_test_gb, rmse_test_gb, mae_test_gb = _regression_scores(y_test, y_pred_test_gb)

model_results['Gradient Boosting'] = {
    'Test_R2': r2_test_gb,
    'Test_RMSE': rmse_test_gb,
    'Test_MAE': mae_test_gb,
    'Predictions': y_pred_test_gb,
    'Model': gb_model
}

# Save Gradient Boosting model
_save_pickle(gb_model, 'models/gradient_boosting_model.pkl')

print(f"✓ Gradient Boosting - R²: {r2_test_gb:.4f}")
print(f"  Saved: models/gradient_boosting_model.pkl")

# Select best model (highest test R²)
best_model_name = max(model_results, key=lambda x: model_results[x]['Test_R2'])
print(f"\n✓ Best Model: {best_model_name}")

# Save best model
best_model_obj = model_results[best_model_name]['Model']
best_model_path = f'models/best_model_{best_model_name.lower().replace(" ", "_")}.pkl'
_save_pickle(best_model_obj, best_model_path)
print(f"  Saved: {best_model_path}")

# Save all models metadata.
# The performance summary is built from model_results alone, so the pickled
# content does not depend on whether a later cell has already been executed.
# It holds only the scalar metrics, not the fitted models or prediction arrays.
performance_summary = {
    name: {metric: result[metric] for metric in ('Test_R2', 'Test_RMSE', 'Test_MAE')}
    for name, result in model_results.items()
}

model_metadata = {
    'best_model': best_model_name,
    'all_models_performance': performance_summary,
    'feature_names': list(X.columns),
    'scaler': scaler,
    # Only Linear Regression was fitted on scaler-transformed inputs; the
    # tree-based models expect the raw feature values.
    'best_model_uses_scaled_features': best_model_name == 'Linear Regression',
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
}

_save_pickle(model_metadata, 'models/model_metadata.pkl')
print(f"  Saved: models/model_metadata.pkl")




Building Predictive Models...
✓ Linear Regression - R²: 0.1251
  Saved: models/linear_regression_model.pkl
✓ Random Forest - R²: 0.1673
✓ Gradient Boosting - R²: 0.1967

✓ Best Model: Gradient Boosting
  Saved: models/best_model_gradient_boosting.pkl
  Saved: models/model_metadata.pkl


In [17]:
# ==================== CELL 11: MODEL COMPARISON VISUALIZATION ====================
print("\nGenerating Model Comparison Visualizations...")

# One row per model with its three test metrics.
comparison_df = pd.DataFrame({
    'Model': model_results.keys(),
    'Test_R²': [scores['Test_R2'] for scores in model_results.values()],
    'Test_RMSE': [scores['Test_RMSE'] for scores in model_results.values()],
    'Test_MAE': [scores['Test_MAE'] for scores in model_results.values()]
})

bar_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
panel_specs = [
    # (metric column, panel title, y-axis label, vertical label offset, label format)
    ('Test_R²', 'Model Comparison - R² Score', 'R² Score', 0.02, '{:.3f}'),
    ('Test_RMSE', 'Model Comparison - RMSE', 'RMSE ($)', 5, '${:.0f}'),
    ('Test_MAE', 'Model Comparison - MAE', 'MAE ($)', 2, '${:.0f}'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel, (metric, title, ylabel, offset, label_format) in zip(axes, panel_specs):
    panel.bar(comparison_df['Model'], comparison_df[metric], color=bar_colors, edgecolor='black')
    panel.set_title(title, fontweight='bold')
    panel.set_ylabel(ylabel)
    if metric == 'Test_R²':
        # Only the R² panel uses a fixed 0-1 axis
        panel.set_ylim(0, 1)
    # Write each bar's value just above it
    for position, value in enumerate(comparison_df[metric]):
        panel.text(position, value + offset, label_format.format(value), ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('visualizations/modeling/01_model_comparison.png', dpi=300, bbox_inches='tight')
print("✓ Saved: visualizations/modeling/01_model_comparison.png")
plt.close()



Generating Model Comparison Visualizations...
✓ Saved: visualizations/modeling/01_model_comparison.png


In [18]:
# ==================== CELL 12: BEST MODEL EVALUATION ====================
print("\nGenerating Best Model Evaluation Visualizations...")

# Tree-based candidates expose feature importances; any other best model
# falls back to the linear-regression predictions and leaves feature_imp unset.
tree_candidates = {
    'Gradient Boosting': (y_pred_test_gb, gb_model),
    'Random Forest': (y_pred_test_rf, rf_model),
}
if best_model_name in tree_candidates:
    y_pred_best, best_tree_model = tree_candidates[best_model_name]
    feature_imp = best_tree_model.feature_importances_
else:
    y_pred_best = y_pred_test_lr

best_r2 = model_results[best_model_name]["Test_R2"]
actual_low, actual_high = y_test.min(), y_test.max()
residuals = y_test - y_pred_best

# Prediction vs Actual
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fit_panel, residual_panel = axes[0], axes[1]

# Scatter plot with the y = x reference line
fit_panel.scatter(y_test, y_pred_best, alpha=0.5, s=20, color='steelblue')
fit_panel.plot([actual_low, actual_high], [actual_low, actual_high], 'r--', lw=2)
fit_panel.set_title(f'{best_model_name} - Predicted vs Actual\nR² = {best_r2:.4f}', fontweight='bold')
fit_panel.set_xlabel('Actual Price ($)')
fit_panel.set_ylabel('Predicted Price ($)')
fit_panel.set_xlim(0, 500)
fit_panel.set_ylim(0, 500)
fit_panel.grid(alpha=0.3)

# Residuals (actual minus predicted) against the predicted value
residual_panel.scatter(y_pred_best, residuals, alpha=0.5, s=20, color='orange')
residual_panel.axhline(y=0, color='r', linestyle='--', lw=2)
residual_panel.set_title('Residuals Analysis', fontweight='bold')
residual_panel.set_xlabel('Predicted Price ($)')
residual_panel.set_ylabel('Residuals ($)')
residual_panel.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('visualizations/modeling/02_predictions_vs_actual.png', dpi=300, bbox_inches='tight')
print("✓ Saved: visualizations/modeling/02_predictions_vs_actual.png")
plt.close()



Generating Best Model Evaluation Visualizations...
✓ Saved: visualizations/modeling/02_predictions_vs_actual.png


In [19]:
# ==================== CELL 13: FEATURE IMPORTANCE ====================
# Runs only when the best model is one of the two tree ensembles; otherwise the
# cell does nothing.
if best_model_name in ('Random Forest', 'Gradient Boosting'):
    print("\nGenerating Feature Importance Visualization...")

    importance_table = pd.DataFrame({
        'Feature': X.columns,
        'Importance': feature_imp
    })
    feature_importance_df = importance_table.sort_values('Importance', ascending=False)

    # head(15) keeps at most the 15 highest-ranked rows
    top_features = feature_importance_df.head(15)

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(
        top_features['Feature'],
        top_features['Importance'],
        color='steelblue',
        edgecolor='black',
    )
    ax.set_xlabel('Importance Score', fontsize=11, fontweight='bold')
    ax.set_title(f'Top 15 Feature Importance - {best_model_name}', fontsize=13, fontweight='bold')
    # Flip the axis so the most important feature is drawn at the top
    ax.invert_yaxis()
    plt.tight_layout()
    plt.savefig('visualizations/modeling/03_feature_importance.png', dpi=300, bbox_inches='tight')
    print("✓ Saved: visualizations/modeling/03_feature_importance.png")
    plt.close()



Generating Feature Importance Visualization...
✓ Saved: visualizations/modeling/03_feature_importance.png


In [20]:
# ==================== CELL 14: CLUSTERING ANALYSIS ====================
print("\nPerforming Clustering Analysis...")


def _print_file_status(path, note=''):
    """Print one summary line for ``path``, marked by whether it exists on disk."""
    marker = '✓' if os.path.exists(path) else '✗ (not found)'
    print(f"  {marker} {path}{note}")


clustering_features = ['price', 'minimum_nights', 'number_of_reviews', 'availability_365', 'reviews_per_month']
# .copy() so the 'Cluster' column added further down is written to an
# independent frame rather than to a selection of df_engineered.
cluster_data = df_engineered[clustering_features].dropna().copy()

# Standardize so every feature contributes on the same scale to the distances
scaler_cluster = StandardScaler()
cluster_scaled = scaler_cluster.fit_transform(cluster_data)

# Elbow method: record inertia and silhouette score for K = 2..10
inertias = []
silhouette_scores = []
K_range = range(2, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(cluster_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(cluster_scaled, labels))

# K with the highest silhouette score among those tried (reported for reference)
best_silhouette_k = K_range[int(np.argmax(silhouette_scores))]

# Plot clustering metrics
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
axes[0].set_title('Elbow Method', fontweight='bold')
axes[0].set_xlabel('Number of Clusters (K)')
axes[0].set_ylabel('Inertia')
axes[0].grid(True, alpha=0.3)

axes[1].plot(K_range, silhouette_scores, 'go-', linewidth=2, markersize=8)
axes[1].set_title('Silhouette Score (Higher Better)', fontweight='bold')
axes[1].set_xlabel('Number of Clusters (K)')
axes[1].set_ylabel('Silhouette Score')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('visualizations/clustering/01_elbow_and_silhouette.png', dpi=300, bbox_inches='tight')
print("✓ Saved: visualizations/clustering/01_elbow_and_silhouette.png")
plt.close()

# Apply K-means with the chosen K.
# NOTE: this value is set by hand; it is not derived from the inertia or
# silhouette values computed above.
optimal_k = 3
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans_final.fit_predict(cluster_scaled)

cluster_data['Cluster'] = cluster_labels

# Cluster characteristics visualization: one panel per feature, with the
# clusters' histograms overlaid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

for idx, feature in enumerate(clustering_features):
    ax = axes[idx]
    for cluster_id in range(optimal_k):
        cluster_mask = cluster_data['Cluster'] == cluster_id
        data = cluster_data[cluster_mask][feature]
        ax.hist(data, alpha=0.6, bins=30, label=f'Cluster {cluster_id}')
    ax.set_title(f'{feature} by Cluster', fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

# Remove extra subplot (five features in a 2x3 grid leave the last slot empty)
axes[-1].remove()

plt.tight_layout()
plt.savefig('visualizations/clustering/02_cluster_distributions.png', dpi=300, bbox_inches='tight')
print("✓ Saved: visualizations/clustering/02_cluster_distributions.png")
plt.close()

print("\n" + "="*80)
print("ANALYSIS COMPLETE!")
print("="*80)

# Each listed file is checked on disk, so the summary reports what actually
# exists instead of assuming every earlier step wrote its output.
print("\nGENERATED FILES:")
print("\nData Files:")
_print_file_status('data/raw/MinoAI_dataset.csv', ' (original)')
_print_file_status('data/processed/data_cleaned.csv')
_print_file_status('data/processed/data_engineered.csv')

print("\nVisualizations (EDA):")
_print_file_status('visualizations/eda/01_missing_values.png')
_print_file_status('visualizations/eda/02_univariate_analysis.png')
_print_file_status('visualizations/eda/03_bivariate_analysis.png')
_print_file_status('visualizations/eda/04_correlation_heatmap.png')

print("\nVisualizations (Modeling):")
_print_file_status('visualizations/modeling/01_model_comparison.png')
_print_file_status('visualizations/modeling/02_predictions_vs_actual.png')
_print_file_status('visualizations/modeling/03_feature_importance.png')

print("\nVisualizations (Clustering):")
_print_file_status('visualizations/clustering/01_elbow_and_silhouette.png')
_print_file_status('visualizations/clustering/02_cluster_distributions.png')

print("\nTrained Models Saved:")
_print_file_status('models/linear_regression_model.pkl')
_print_file_status('models/random_forest_model.pkl')
_print_file_status('models/gradient_boosting_model.pkl')
_print_file_status(f"models/best_model_{best_model_name.lower().replace(' ', '_')}.pkl")
_print_file_status('models/model_metadata.pkl')

print("\nSUMMARY STATISTICS:")
print(f"  Best Model: {best_model_name}")
print(f"  Test R² Score: {model_results[best_model_name]['Test_R2']:.4f}")
print(f"  Test RMSE: ${model_results[best_model_name]['Test_RMSE']:.2f}")
print(f"  Test MAE: ${model_results[best_model_name]['Test_MAE']:.2f}")
print(f"  Optimal Clusters: {optimal_k}")
print(f"  Highest-silhouette K (2-10): {best_silhouette_k}")
print(f"\n✓ ALL OBJECTIVES COMPLETED SUCCESSFULLY!")


Performing Clustering Analysis...
✓ Saved: visualizations/clustering/01_elbow_and_silhouette.png
✓ Saved: visualizations/clustering/02_cluster_distributions.png

ANALYSIS COMPLETE!

GENERATED FILES:

Data Files:
  ✓ data/raw/MinoAI dataset.csv (original)
  ✓ data/processed/data_cleaned.csv
  ✓ data/processed/data_engineered.csv

Visualizations (EDA):
  ✓ visualizations/eda/01_missing_values.png
  ✓ visualizations/eda/02_univariate_analysis.png
  ✓ visualizations/eda/03_bivariate_analysis.png
  ✓ visualizations/eda/04_correlation_heatmap.png

Visualizations (Modeling):
  ✓ visualizations/modeling/01_model_comparison.png
  ✓ visualizations/modeling/02_predictions_vs_actual.png
  ✓ visualizations/modeling/03_feature_importance.png

Visualizations (Clustering):
  ✓ visualizations/clustering/01_elbow_and_silhouette.png
  ✓ visualizations/clustering/02_cluster_distributions.png

Trained Models Saved:
  ✓ models/linear_regression_model.pkl
  ✓ models/random_forest_model.pkl
  ✓ models/gradie