<a href="https://colab.research.google.com/github/MangeshVR1546/Satellite-Imagery-Based-Property-Valuation-Project/blob/main/Data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from xgboost import XGBRegressor
import category_encoders as ce

# Read the raw tabular dataset and derive the date/target columns in one
# chained step. `assign` evaluates its keyword arguments in order, so later
# entries can read columns produced by earlier ones; 'date' is replaced in
# place and the new columns are appended in the order listed.
df = pd.read_csv('/content/tabular_Dataset.csv').assign(
    # Parse the sale date so the .dt accessor is available below.
    date=lambda frame: pd.to_datetime(frame['date']),
    # Calendar year of the sale.
    sale_year=lambda frame: frame['date'].dt.year,
    # Modelling target: log(1 + price).
    log_price=lambda frame: np.log1p(frame['price']),
    # Years between construction and sale.
    property_age=lambda frame: frame['sale_year'] - frame['yr_built'],
    # 1 when yr_renovated is positive, otherwise 0.
    is_renovated=lambda frame: (frame['yr_renovated'] > 0).astype(int),
)

# Haversine distance
def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Return the great-circle distance between two points (haversine formula).

    Args:
        lat1, lon1: Latitude and longitude of the first point(s), in degrees.
            Scalars, NumPy arrays, or pandas Series are accepted.
        lat2, lon2: Latitude and longitude of the second point(s), in degrees.
            Combined with the first point(s) using normal NumPy broadcasting.
        radius_km: Radius of the sphere. The default, 6371, is approximately
            the Earth's mean radius in kilometres, so the result is in km.

    Returns:
        The distance in the same unit as ``radius_km``; a scalar for scalar
        inputs, otherwise an array/Series shaped like the broadcast inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding error can push `a` marginally outside [0, 1] (e.g. for nearly
    # antipodal points), which would make arcsin(sqrt(a)) return NaN.
    a = np.clip(a, 0.0, 1.0)
    return 2 * radius_km * np.arcsin(np.sqrt(a))

# Spatial anchors: (latitude, longitude) reference points in decimal degrees.
# NOTE(review): judging by the names these are a city centre, a waterfront
# point and a tech hub -- confirm the exact landmarks with the author.
CITY_LAT, CITY_LON = 47.6062, -122.3321
WATER_LAT, WATER_LON = 47.60, -122.35
TECH_LAT, TECH_LON = 47.6740, -122.1215

# Distance features: log1p of the haversine distance from each property to
# every anchor. One loop replaces three copy-pasted lines; the resulting
# columns are log_dist_city, log_dist_water and log_dist_tech, in that order.
for _anchor, (_anchor_lat, _anchor_lon) in {
    'city': (CITY_LAT, CITY_LON),
    'water': (WATER_LAT, WATER_LON),
    'tech': (TECH_LAT, TECH_LON),
}.items():
    df[f'log_dist_{_anchor}'] = np.log1p(
        haversine(df['lat'], df['long'], _anchor_lat, _anchor_lon)
    )

# Location clustering on raw coordinates.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit makes the cluster labels depend on the
# installed version (and triggers a FutureWarning on some releases).
location_kmeans = KMeans(n_clusters=15, n_init=10, random_state=42)
df['location_cluster'] = location_kmeans.fit_predict(df[['lat', 'long']])
# Persist the fitted model so the same cluster assignment can be applied to
# unseen data with location_kmeans.predict(); refitting on new data would
# produce unrelated labels.
joblib.dump(location_kmeans, 'location_kmeans.pkl')

# Interaction feature: living area counted only for waterfront properties
# (the product is 0 wherever waterfront is 0).
df['waterfront_size_effect'] = df['waterfront'] * df['sqft_living']

# Feature selection: the columns handed to the model.
feature_cols = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront',
    'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'property_age',
    'lat', 'long', 'sqft_living15', 'sqft_lot15', 'sale_year',
    'location_cluster', 'log_dist_city', 'log_dist_water', 'log_dist_tech', 'zipcode',
]

X = df[feature_cols]
y = df['log_price']

# Train-validation split (80/20, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Target encoding for zipcode. The encoder is fitted on the training split
# only, so the validation rows never contribute their own target values.
zip_encoder = ce.TargetEncoder(cols=['zipcode'])
X_train_enc = zip_encoder.fit_transform(X_train, y_train)
X_val_enc = zip_encoder.transform(X_val)

# Encode every row with the train-fitted encoder.
X_full_enc = zip_encoder.transform(X)
X_full_enc = X_full_enc.rename(columns={'zipcode': 'zipcode_encoded'})

# Write the encoded values back by name. The previous
# `df[X.columns] = X_full_enc` matched columns purely by position, which
# silently ignored the rename above and relied on the encoder returning its
# columns in input order. Only 'zipcode' actually changes, so assign just it.
# NOTE(review): the saved CSV keeps the column name 'zipcode' (now holding
# the target-encoded values), exactly as before; switch to a
# 'zipcode_encoded' column only after updating downstream consumers.
df['zipcode'] = X_full_enc['zipcode_encoded']

# Drop leakage and unused columns ('price' is the un-logged target).
df = df.drop(columns=['price', 'yr_renovated'])

# Persist the fitted encoder and the preprocessed table.
joblib.dump(zip_encoder, 'zipcode_target_encoder.pkl')
df.to_csv('preprocessed_data.csv', index=False)
