In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

### Doing basic Feature Engineering and Feacture Extraction using PCA for model implementation and analysis

In [3]:
df = pd.read_csv("preprocessed_lahore_house_prices.csv")

## Price per Marla

In [4]:
df["Price_per_Marla"] = df["Price"] / df["Area_Marlas"]

## Location Tiering

In [5]:
# Calculate average price per location
location_avg = df.groupby("Location")["Price"].mean()

# Define tiers based on quantiles
high_thresh = location_avg.quantile(0.75)
low_thresh = location_avg.quantile(0.25)

def assign_location_tier(loc):
    avg_price = location_avg.get(loc, 0)
    if avg_price >= high_thresh:
        return "High-end"
    elif avg_price <= low_thresh:
        return "Low-end"
    else:
        return "Mid-range"

df["Location_Tier"] = df["Location"].apply(assign_location_tier)

## Size Category

In [6]:
def size_category(marla):
    if marla < 5:
        return "Small"
    elif marla <= 10:
        return "Medium"
    else:
        return "Large"

df["Size_Category"] = df["Area_Marlas"].apply(size_category)

## Luxury Indicator

In [7]:
df["Is_Luxury"] = ((df["Bedroom(s)"] > 5) & (df["Bath(s)"] > 5)).astype(int)

## Using one-hot encoding to convert categorical into numeric form.

In [11]:
df_model = pd.get_dummies(df, columns=["Type", "Location_Tier", "Size_Category"], drop_first=True)
df_model.head()

Unnamed: 0,house_id,Location,Area,Bath(s),Bedroom(s),Price,Area_Marlas,Price_per_Marla,Is_Luxury,Type_house,Location_Tier_Low-end,Location_Tier_Mid-range,Size_Category_Medium,Size_Category_Small
0,46326643,"dha defence, lahore, punjab",1 Kanal,6,5,75500000,20.0,3775000.0,0,True,False,False,False,False
1,46952582,"bahria orchard, lahore, punjab",8 Marla,5,5,25000000,8.0,3125000.0,0,True,True,False,True,False
2,47357581,"paragon city, lahore, punjab",10 Marla,6,5,47000000,10.0,4700000.0,0,True,False,True,True,False
3,47397157,"askari, lahore, punjab",10 Marla,3,3,28000000,10.0,2800000.0,0,False,False,False,True,False
4,43109993,"dha defence, lahore, punjab",8 Marla,5,4,43500000,8.0,5437500.0,0,True,False,False,True,False


## Dividing into X and y for model implementation

In [9]:
X = df_model.drop(columns=["house_id", "Price", "Location", "Area"])  # Drop raw features
y = df_model["Price"]

X.to_csv("features_X.csv", index=False)
y.to_csv("target_y.csv", index=False)

# PCA Feature Extraction
### Feature Extraction (Bonus Section)

In addition to manual feature engineering, we perform feature extraction using **Principal Component Analysis (PCA)**. This technique helps reduce high-dimensional data into principal components that summarize the key patterns in the dataset.

We extract the top 3 components and include them in our modeling to capture hidden structure that may not be apparent in raw or engineered features.


In [10]:
# Standardize the features before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=3)  # Just extract top 3 components
X_pca = pca.fit_transform(X_scaled)


pca_df.to_csv("pca_df.csv", index=False)

NameError: name 'pca_df' is not defined