In [1]:
import numpy as np
import pandas as pd
from typing import Optional, List
from sklearn.model_selection import train_test_split

import sklearn.base
# import StandardScaler
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
# One seed for the whole notebook: it seeds NumPy's global RNG here and is
# reused as `random_state` by later cells.
seed = 24
np.random.seed(seed)

# Name of the column the models are trained to predict.
target_column = "Sale_Price"

# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this exact file exists; consider a configurable data directory.
data = pd.read_csv('/Users/melodiz/projects/ML_SHAD/Linear_Models/data.csv')

In [2]:
# Hold out 20% of the rows for testing; `seed` makes the split reproducible.
test_size = 0.2
data_train, data_test, Y_train, Y_test = train_test_split(
    data.drop(columns=[target_column]),  # every column except the target
    np.array(data[target_column]),       # target as a plain NumPy array
    test_size=test_size,
    random_state=seed)

print(f"Train : {data_train.shape} {Y_train.shape}")
print(f"Test : {data_test.shape} {Y_test.shape}")

# Split the feature names by dtype. The target is excluded up front (instead of
# being removed afterwards), so this never raises if the target is not numeric.
continuous_columns = [
    key for key in data.columns
    if key != target_column and data[key].dtype in ("int64", "float64")]
categorical_columns = [
    key for key in data.columns if data[key].dtype == "object"]

print(f"Continuous : {len(continuous_columns)}, Categorical : {len(categorical_columns)}")


Train : (226, 80) (226,)
Test : (57, 80) (57,)
Continuous : 34, Categorical : 46


In [5]:
# Preview the first five rows of the numeric feature columns.
data[continuous_columns].head()

Unnamed: 0,Lot_Frontage,Lot_Area,Year_Built,Year_Remod_Add,Mas_Vnr_Area,BsmtFin_SF_1,BsmtFin_SF_2,Bsmt_Unf_SF,Total_Bsmt_SF,First_Flr_SF,...,Open_Porch_SF,Enclosed_Porch,Three_season_porch,Screen_Porch,Pool_Area,Misc_Val,Mo_Sold,Year_Sold,Longitude,Latitude
0,141,31770,1960,1960,112,2,0,441,1080,1656,...,62,0,0,0,0,0,5,2010,-93.619754,42.054035
1,80,11622,1961,1961,0,6,144,270,882,896,...,0,0,0,120,0,0,6,2010,-93.619756,42.053014
2,81,14267,1958,1958,108,1,0,406,1329,1329,...,36,0,0,0,0,12500,6,2010,-93.619387,42.052659
3,93,11160,1968,1968,0,1,0,1045,2110,2110,...,0,0,0,0,0,0,4,2010,-93.61732,42.051245
4,74,13830,1997,1998,0,3,0,137,928,928,...,34,0,0,0,0,0,3,2010,-93.638933,42.060899


In [7]:
# Derived features on the full frame. The cell is written so it can be re-run:
# every step either overwrites its own output column or is guarded.

# Differences between the sale year and the build / last-remodel year, plus the
# summed floor and basement area.
data = data.assign(
    House_Age=data['Year_Sold'] - data['Year_Built'],
    Remodel_Age=data['Year_Sold'] - data['Year_Remod_Add'],
    Total_Square_Footage=(
        data['First_Flr_SF'] + data['Second_Flr_SF'] + data['Total_Bsmt_SF']),
)

# get_dummies replaces 'Neighborhood' with indicator columns, so the column is
# gone after the first run; without this guard a second run raises KeyError.
if 'Neighborhood' in data.columns:
    data = pd.get_dummies(data, columns=['Neighborhood'], drop_first=True)

# Boolean presence flags.
data = data.assign(
    Has_FirePlace=data['Fireplaces'] > 0,
    Has_Pool=data['Pool_Area'] > 0,
)

In [8]:
# Assumed (latitude, longitude) of the city center of Ames, Iowa, in degrees.
city_center_coords = (42.034534, -93.620369)


def haversine(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance between two points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.

    Returns:
        The distance in kilometers, using an Earth radius of 6371 km.
        NaN inputs propagate to a NaN result.
    """
    import math

    earth_radius_km = 6371
    half_dlat = math.radians(lat2 - lat1) / 2
    half_dlon = math.radians(lon2 - lon1) / 2
    cos_lat1 = math.cos(math.radians(lat1))
    cos_lat2 = math.cos(math.radians(lat2))
    # Haversine term: squared half-chord length between the two points.
    chord_sq = math.sin(half_dlat) ** 2 + cos_lat1 * cos_lat2 * math.sin(half_dlon) ** 2
    central_angle = 2 * math.atan2(math.sqrt(chord_sq), math.sqrt(1 - chord_sq))
    return earth_radius_km * central_angle

# Distance in km from each house to the city center, computed row by row from
# the 'Latitude' and 'Longitude' columns.
data['Distance_From_City_Center'] = data.apply(
    lambda row: haversine(row['Latitude'], row['Longitude'], *city_center_coords),
    axis=1)

# Fill missing values with the column median. The result is assigned back
# rather than using `data[col].fillna(..., inplace=True)`: that form operates
# on a temporary column selection, emits a FutureWarning on pandas 2.x and
# silently does nothing once Copy-on-Write is enabled.
# NOTE(review): these medians are computed over all rows of `data`, including
# rows that end up in the test split.
data['Distance_From_City_Center'] = data['Distance_From_City_Center'].fillna(
    data['Distance_From_City_Center'].median())
data['Lot_Frontage'] = data['Lot_Frontage'].fillna(data['Lot_Frontage'].median())

In [10]:
class SmartDataPreprocessor(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer for the housing DataFrame.

    ``fit`` learns, from the data it is given, the median of ``Lot_Frontage``,
    the median distance to the city center and the set of ``Neighborhood``
    categories. ``transform`` then adds the derived columns, one-hot encodes
    ``Neighborhood`` against the learned categories and fills missing values
    with the learned medians.

    Attributes:
        city_center_coords: (latitude, longitude) in degrees used as the
            reference point for ``Distance_From_City_Center``.
        median_lot_frontage: Median of ``Lot_Frontage`` seen in ``fit``.
        median_distance_from_city_center: Median distance (km) seen in ``fit``.
        neighborhood_categories: ``Neighborhood`` categories seen in ``fit``.
    """

    def __init__(self):
        self.city_center_coords = (42.034534, -93.620369)
        self.median_lot_frontage = None
        self.median_distance_from_city_center = None
        self.neighborhood_categories = None

    def haversine(self, lat1, lon1, lat2, lon2):
        """Return the haversine (great-circle) distance in kilometers.

        Accepts scalars as well as NumPy arrays / pandas Series (broadcast
        element-wise). All coordinates are in degrees. NaN inputs give NaN.
        """
        R = 6371  # Radius of the Earth in kilometers
        half_dlat = np.radians(lat2 - lat1) / 2
        half_dlon = np.radians(lon2 - lon1) / 2
        a = (np.sin(half_dlat) ** 2
             + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(half_dlon) ** 2)
        # Rounding can push `a` marginally outside [0, 1], which would make
        # sqrt(1 - a) invalid; clip keeps NaN as NaN.
        a = np.clip(a, 0.0, 1.0)
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        return R * c

    def _distance_from_center(self, X):
        """Distance (km) from every row of ``X`` to the city center, as a Series indexed like ``X``."""
        center_lat, center_lon = self.city_center_coords
        return self.haversine(
            X['Latitude'].astype(float), X['Longitude'].astype(float),
            center_lat, center_lon)

    def fit(self, X, y=None):
        """Learn imputation medians and the neighborhood categories from ``X``.

        Args:
            X: DataFrame with at least ``Lot_Frontage``, ``Latitude``,
                ``Longitude`` and ``Neighborhood`` columns.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            self
        """
        self.median_lot_frontage = X['Lot_Frontage'].median()
        self.median_distance_from_city_center = self._distance_from_center(X).median()
        # Same category set and order that get_dummies would derive from X.
        self.neighborhood_categories = pd.Categorical(X['Neighborhood']).categories
        return self

    def transform(self, X):
        """Return a copy of ``X`` with engineered features and imputed values.

        The output always has one indicator column per neighborhood category
        seen in ``fit`` (minus the first), regardless of which neighborhoods
        occur in ``X``; neighborhoods unseen in ``fit`` get all-zero indicators.

        Raises:
            ValueError: If called before ``fit``.
        """
        categories = getattr(self, 'neighborhood_categories', None)
        if categories is None:
            raise ValueError(
                "SmartDataPreprocessor is not fitted yet; call fit() before transform().")

        X = X.copy()
        X['House_Age'] = X['Year_Sold'] - X['Year_Built']
        X['Remodel_Age'] = X['Year_Sold'] - X['Year_Remod_Add']
        X['Total_Square_Footage'] = X['First_Flr_SF'] + X['Second_Flr_SF'] + X['Total_Bsmt_SF']
        # Pin the categories learned in fit so train and test get identical
        # dummy columns even when a neighborhood is absent from one of them.
        X['Neighborhood'] = pd.Categorical(X['Neighborhood'], categories=categories)
        X = pd.get_dummies(X, columns=['Neighborhood'], drop_first=True)
        X['Has_FirePlace'] = X['Fireplaces'] > 0
        X['Has_Pool'] = X['Pool_Area'] > 0
        # Assign the filled Series back: `X[col].fillna(..., inplace=True)` acts
        # on a temporary selection and is a no-op under pandas Copy-on-Write.
        X['Distance_From_City_Center'] = self._distance_from_center(X).fillna(
            self.median_distance_from_city_center)
        X['Lot_Frontage'] = X['Lot_Frontage'].fillna(self.median_lot_frontage)
        return X