In [1]:
# 1. Install the Kaggle Hub library
!pip install kagglehub --quiet

import kagglehub
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# --- STEP 1: DOWNLOAD DATA DIRECTLY FROM KAGGLE ---
# The value returned by dataset_download is used below as the folder that
# contains the dataset files.
dataset_handle = "dgomonov/new-york-city-airbnb-open-data"

print("Downloading dataset from Kaggle...")
path = kagglehub.dataset_download(dataset_handle)
print(f"Dataset downloaded to: {path}")

# 2. Load Data
# The listings file name is fixed, so its full path is built directly
# inside the download folder.
listings_filename = "AB_NYC_2019.csv"
csv_file = os.path.join(path, listings_filename)
df = pd.read_csv(csv_file)

# 3. Data Cleaning

# Remove the columns that the analysis and model below do not use.
unused_columns = ['id', 'name', 'host_id', 'host_name', 'last_review']
df = df.drop(columns=unused_columns)

# Missing reviews_per_month values are replaced with 0.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

# Keep only listings whose price is strictly between 0 and 500.
price_in_range = df['price'].gt(0) & df['price'].lt(500)
df = df[price_in_range]

print(f"Data Loaded Successfully: {df.shape[0]} listings ready.")

# 4. Simple Pricing Analysis
print("\n--- Average Price by Neighbourhood Group ---")

# Mean listing price per neighbourhood group, shown highest first.
mean_price_by_group = df.groupby('neighbourhood_group')['price'].mean()
display(mean_price_by_group.sort_values(ascending=False))

# 5. Build the Recommendation Engine (Linear Regression)

# One-hot encode the two categorical columns. get_dummies names each new
# column "<original column>_<level>", and drop_first=True keeps k-1 dummy
# columns for a column with k levels.
model_data = pd.get_dummies(df, columns=['neighbourhood_group', 'room_type'], drop_first=True)

# Predictors: four numeric columns plus every dummy column created above.
# Matching on the name prefix (not a substring anywhere in the name) selects
# exactly the columns get_dummies generated.
dummy_prefixes = ('neighbourhood_group_', 'room_type_')
features = ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count']
features += [col for col in model_data.columns if col.startswith(dummy_prefixes)]

X = model_data[features]
y = model_data['price']

# Split & Train
# 20% of the rows are held out for evaluation; random_state fixes the split
# so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out rows only.
y_pred = model.predict(X_test)

print("\n--- Model Trained ---")
# NOTE: the printed value is R^2 (coefficient of determination), not a
# classification accuracy; the label text is kept as-is so existing output
# stays comparable.
print(f"Accuracy (R2 Score): {r2_score(y_test, y_pred):.2f}")
print("The model is ready to predict prices!")

# 6. Export for Tableau
# index=False leaves the DataFrame index out of the written file.
output_csv = 'Airbnb_Cleaned_For_Dashboard.csv'
df.to_csv(output_csv, index=False)
print("SUCCESS: 'Airbnb_Cleaned_For_Dashboard.csv' is ready to download for Tableau.")

Downloading dataset from Kaggle...
Downloading from https://www.kaggle.com/api/v1/datasets/download/dgomonov/new-york-city-airbnb-open-data?dataset_version_number=3...


100%|██████████| 2.44M/2.44M [00:00<00:00, 117MB/s]

Extracting files...
Dataset downloaded to: /root/.cache/kagglehub/datasets/dgomonov/new-york-city-airbnb-open-data/versions/3





Data Loaded Successfully: 47649 listings ready.

--- Average Price by Neighbourhood Group ---


Unnamed: 0_level_0,price
neighbourhood_group,Unnamed: 1_level_1
Manhattan,161.114141
Brooklyn,111.701958
Queens,92.344583
Staten Island,91.144414
Bronx,81.044403



--- Model Trained ---
Accuracy (R2 Score): 0.39
The model is ready to predict prices!
SUCCESS: 'Airbnb_Cleaned_For_Dashboard.csv' is ready to download for Tableau.
