## Preprocessing

In [1]:
#Importing Libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
import os
import sys
from dotenv import load_dotenv

# 1. PROJECT ROOT
# Defaults to the author's local checkout. Set the STAYWISE_PROJECT_ROOT
# environment variable to point at your own clone instead of editing this cell;
# an unset or empty variable falls back to the default below.
project_root_path = os.environ.get("STAYWISE_PROJECT_ROOT") or (
    r"C:\Users\boluy\Downloads\TERM 03\AML 3303\projects\Staywise-Airbnb-Pricing"
)
dotenv_path = os.path.join(project_root_path, '.env')

# Put the project root first on sys.path so the `src` package imported below resolves.
if project_root_path not in sys.path:
    sys.path.insert(0, project_root_path)
    print(f" Path Fix: Project Root added to sys.path: {project_root_path}")
else:
    print(" Path Fix: Project Root already set.")


# 2. LOAD ENVIRONMENT VARIABLES & FORCE S3 PATH
# S3_PATH is assigned *after* load_dotenv() so the literal below wins regardless
# of what the .env file contains.
# NOTE: when the .env file is missing only a message is printed; S3_PATH is left
# untouched and the cell carries on to the imports.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path)
    os.environ["S3_PATH"] = "s3://my-airbnb-pricing/AB_NYC_2019.csv"
    print(" Environment: S3_PATH forced to correct value.")
else:
    print(f" Environment ERROR: Cannot find .env file at {dotenv_path}.")


# 3. IMPORT MODULES
# The project modules can only be imported once the sys.path fix above has run.
import src.data as data
import src.process as process # Using 'process' as agreed upon

import pandas as pd
from sklearn.model_selection import train_test_split
import joblib

print("\n All imports successful! Ready for data loading.")

 Path Fix: Project Root added to sys.path: C:\Users\boluy\Downloads\TERM 03\AML 3303\projects\Staywise-Airbnb-Pricing
 Environment: S3_PATH forced to correct value.

 All imports successful! Ready for data loading.


In [3]:
# Import and Test Data Loading
#
# Loads the raw listings into `df`, which the feature-engineering cell below
# reads. A failure is reported and then re-raised: swallowing it would leave
# `df` undefined (or stale from an earlier run), and the next cell would fail
# with an unrelated error or silently work on old data.

try:
    from src.data import load_from_s3

    # Load data from S3
    df = load_from_s3()

    print("\n Data Loading SUCCESS!")
    print(f"Loaded {len(df)} rows from S3.")

except Exception as e:
    print(f"\n ERROR during data loading: {e}")
    # Stop "Run All" here instead of continuing without a freshly loaded frame.
    raise


 Data Loading SUCCESS!
Loaded 48895 rows from S3.


###  Feature Engineering


In [4]:
# Feature engineering on the raw listings frame `df` (columns are added in place).

# Distance from NYC centre: Geographic features that significantly influence prices.
# NOTE(review): this is a Euclidean distance in raw latitude/longitude degrees,
# not kilometres; the reference point is the Times Square coordinate below.
times_sq_lat, times_sq_lon = 40.7580, -73.9855
df["distance_to_center"] = np.sqrt(
    (df["latitude"] - times_sq_lat)**2 +
    (df["longitude"] - times_sq_lon)**2
)

# Room type flags
df["is_private_room"] = (df["room_type"] == "Private room").astype(int)
df["is_shared_room"] = (df["room_type"] == "Shared room").astype(int)
df["is_entire_home"] = (df["room_type"] == "Entire home/apt").astype(int)

# Date features
# NOTE(review): pd.Timestamp("today") makes days_since_last_review depend on the
# day the notebook is run, so this column is not reproducible between runs.
df["last_review"] = pd.to_datetime(df["last_review"])
df["days_since_last_review"] = (pd.Timestamp("today") - df["last_review"]).dt.days
df["last_review_month"] = df["last_review"].dt.month
df["last_review_year"] = df["last_review"].dt.year

# Listings without a review get the largest observed gap and a review rate of 0.
# last_review_month / last_review_year are left as NaN for those rows.
df["days_since_last_review"] = df["days_since_last_review"].fillna(df["days_since_last_review"].max())
df["reviews_per_month"] = df["reviews_per_month"].fillna(0)

# Host features
df["host_more_than_one_listing"] = (df["calculated_host_listings_count"] > 1).astype(int)
df["host_many_listings"] = (df["calculated_host_listings_count"] >= 5).astype(int)

# Price transform (log1p keeps a price of 0 finite)
df["log_price"] = np.log1p(df["price"])

# Demand features
df["is_high_availability"] = (df["availability_365"] > 180).astype(int)
df["is_popular"] = (df["number_of_reviews"] > df["number_of_reviews"].median()).astype(int)

# Interaction features
df["reviews_x_minimum_nights"] = df["number_of_reviews"] * df["minimum_nights"]
df["availability_x_reviews"] = df["availability_365"] * df["reviews_per_month"]
# The category codes are integers (the displayed output shows Entire home/apt -> 0
# and Private room -> 1), so this product is 0 for every entire-home listing.
df["room_latitude"] = df["latitude"] * (df["room_type"].astype("category").cat.codes)

# Save engineered dataset
# The path is built from the project root defined in the setup cell, and the
# target folder is created first so the write does not fail on a fresh checkout.
# NOTE(review): the file is named "sample_features" but the full frame is written.
features_path = os.path.join(project_root_path, "data", "sample_features.parquet")
os.makedirs(os.path.dirname(features_path), exist_ok=True)
df.to_parquet(features_path, index=False)

df.head()


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review_month,last_review_year,host_more_than_one_listing,host_many_listings,log_price,is_high_availability,is_popular,reviews_x_minimum_nights,availability_x_reviews,room_latitude
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,...,10.0,2018.0,1,1,5.010635,1,1,9,76.65,40.64749
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,...,5.0,2019.0,1,0,5.420535,1,1,45,134.9,0.0
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,...,,,0,0,5.01728,1,0,0,0.0,40.80902
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,...,7.0,2019.0,0,0,4.49981,1,1,270,900.16,0.0
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,...,11.0,2018.0,0,0,4.394449,0,1,90,0.0,0.0


## Preprocessing pipeline

In [5]:
import os
import sys
from dotenv import load_dotenv
import joblib
from sklearn.model_selection import train_test_split

# Project root: defaults to the author's local checkout. Set the
# STAYWISE_PROJECT_ROOT environment variable to point at your own clone;
# an unset or empty variable falls back to the default below.
project_root_path = os.environ.get("STAYWISE_PROJECT_ROOT") or (
    r"C:\Users\boluy\Downloads\TERM 03\AML 3303\projects\Staywise-Airbnb-Pricing"
)
dotenv_path = os.path.join(project_root_path, '.env')

# Put the project root first on sys.path so the `src` package resolves below.
if project_root_path not in sys.path:
    sys.path.insert(0, project_root_path)
    print(f" Path Fix: Project Root added to sys.path: {project_root_path}")

# Load Environment and Force S3 Path
# S3_PATH is assigned after load_dotenv() so the literal wins over the .env value.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path)
    os.environ["S3_PATH"] = "s3://my-airbnb-pricing/AB_NYC_2019.csv"
    print(" Environment: S3_PATH forced to correct value.")
else:
    # Report the missing file instead of skipping it silently.
    print(f" Environment ERROR: Cannot find .env file at {dotenv_path}.")

#  Imports
import src.data as data
import src.process as process
print("\n All core modules (src.data, src.process) imported.")


# Data Load, Clean, and Split ---
print("\n--- Starting Data Processing ---")
raw_df = data.load_from_s3()
processed_df = process.clean_and_feature_engineer(raw_df)

# Define features (X) and target (y)
# Both price columns are dropped from X so the target cannot leak into the features.
X = processed_df.drop(columns=['price', 'log_price'])
y = processed_df['log_price']

# Split the data (80/20, fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Data split successfully. X_train shape: {X_train.shape}")


# Build, Save, and Test Preprocessor

# a) Build and Fit Preprocessor
# Fitted on the training split only; the test split is only transformed below.
# NOTE(review): build_preprocessor is presumably a scikit-learn style
# transformer (it is used via .fit/.transform) - confirm in src/process.
print("\nFitting preprocessor on X_train...")
preprocessor = process.build_preprocessor(X_train)
preprocessor.fit(X_train) # Explicitly fit the transformer

# b) Save Preprocessor
preprocessor_path = os.path.join(project_root_path, 'artifacts', 'airbnb_preprocessor.joblib')
os.makedirs(os.path.dirname(preprocessor_path), exist_ok=True)
process.save_preprocessor(preprocessor, preprocessor_path)
print(f" Preprocessor saved to: {preprocessor_path}")

# c) Transformation Tests
X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print(f"\nFinal X_train transformed shape: {X_train_transformed.shape}")
print(f"Final X_test transformed shape: {X_test_transformed.shape}")

# Sanity check: both splits must come out with the same number of feature columns.
if X_train_transformed.shape[1] == X_test_transformed.shape[1]:
    print("\n FINAL SUCCESS: Preprocessing pipeline built and tested. Features match!")
else:
    print(" ERROR: Feature shapes do not match after transformation. Check your OHE categories.")

 Environment: S3_PATH forced to correct value.

 All core modules (src.data, src.process) imported.

--- Starting Data Processing ---
Data after cleaning has 48569 rows and 14 columns.
Data split successfully. X_train shape: (38855, 12)

Fitting preprocessor on X_train...
 Preprocessor saved to: C:\Users\boluy\Downloads\TERM 03\AML 3303\projects\Staywise-Airbnb-Pricing\artifacts\airbnb_preprocessor.joblib

Final X_train transformed shape: (38855, 238)
Final X_test transformed shape: (9714, 238)

 FINAL SUCCESS: Preprocessing pipeline built and tested. Features match!


## Save Sample Features

In [6]:


from src.data import load_from_s3
from src.process import clean_and_feature_engineer
import os

#  Load and Engineer the Data
raw_df = load_from_s3()
engineered_df = clean_and_feature_engineer(raw_df)

#  Select Sample and Save
# Create the target directory and path
# NOTE(review): the output folder is derived from the kernel's working directory
# (its parent + 'data'), not from the project root that `src` was imported from.
# In the recorded run this resolved to a different project folder than the one
# used by the setup cell - confirm that is intended.
output_dir = os.path.join(os.path.abspath(os.path.join(os.getcwd(), '..')), 'data')
output_file = os.path.join(output_dir, 'sample_features.parquet')

os.makedirs(output_dir, exist_ok=True)

# Select a small sample (up to 100 rows) and save it.
# The size is clamped because DataFrame.sample raises ValueError when asked for
# more rows than the frame holds (sampling is without replacement).
sample_size = min(100, len(engineered_df))
sample_df = engineered_df.sample(n=sample_size, random_state=42)

# Save the sample to the 'data' directory
sample_df.to_parquet(output_file, index=False)

print(f" Engineered dataset sample ({len(sample_df)} rows) saved to: {output_file}")

Data after cleaning has 48569 rows and 14 columns.
 Engineered dataset sample (100 rows) saved to: C:\Users\boluy\Downloads\TERM 03\AML 3303\Assignment02-airbnb-pricing\data\sample_features.parquet
