In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def view_housing_data(file_path):
    # Read the CSV file into a DataFrame
    try:
        housing_data = pd.read_csv(file_path)
        # Display the first few rows of the DataFrame
        display(housing_data.head())
        return housing_data
    except FileNotFoundError:
        print("File not found. Please check the file path and try again.")
        return None

# Specify the path to your housing.csv file
file_path = "/Users/sm/Developer/Projects/Calefornia Housing Prices/data/housing.csv"

# Call the function to view the housing data
housing = view_housing_data(file_path) 45454545

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [68]:
def shuffle_and_split_data(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    
    return data.iloc[train_indices], data.iloc[test_indices]

train_set, test_set = shuffle_and_split_data(housing, 0.2)

In [69]:
len(train_set)

16512

In [70]:
len(test_set)

4128

In [71]:
np.random.seed(42)

In [72]:
train_set.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,103000.0,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,382100.0,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,172600.0,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,93400.0,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,96500.0,INLAND


In [73]:
train_set.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

## Building a pipeline

Before we can train some models we would like to create a single pipeline, that can perform all transformations necessary preperations:

1. Handling missing values in numerical features, will be imputed by replacing them with the median
- - In caterigorical value missing values will be replaced by most frequent categories

2. Cateogorical features will be one-hot encoded, since most ML algorithms accepts onnly numerical inputs

3. A few ratios will be computed and added they are:
- - Bedrooms ratio
- - Rooms-per-house ratio
- - People per house ratio
Theese will potentially correlate better with the machine learning models

4. Features with long tail will be replaced by their logarithms to help for gaussian distribtuions

5. All numerical features will be standardized as most ML algorthims prefer features with same scale

6. A few cluster similiarty features will also be added, most usefull for models containing latitude and longtitude

In [74]:
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans 
from sklearn.base import BaseEstimator, TransformerMixin

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_ = KMeans(self.n_clusters, n_init=10,
                              random_state=self.random_state)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self  # always return self!

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)
    
    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]
  

def column_ratio(X):
    return X[:, [0]] / X[:, [1]]

def ratio_name(function_transformer, feature_names_in):
    return ["ratio"]  # feature names out

def ratio_pipeline():
    return make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler())
    
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler())

cat_pipeline = make_pipeline(  
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)

default_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                     StandardScaler())

#now let's run the preprocessing pipeline
preprocessing = ColumnTransformer([
        ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population", "households", "median_income"]),
        ("cat", cat_pipeline, ['ocean_proximity']),
        ("geo", cluster_simil, ["latitude", "longitude"]),
    ],
    remainder=default_num_pipeline)  # one column remaining: housing_median_age

housing_prepared = preprocessing.fit_transform(train_set)


In [75]:
housing_prepared.shape

(16512, 25)

In [76]:
preprocessing.get_feature_names_out()

array(['bedrooms__ratio', 'rooms_per_house__ratio',
       'people_per_house__ratio', 'log__total_bedrooms',
       'log__total_rooms', 'log__population', 'log__households',
       'log__median_income', 'cat__ocean_proximity_<1H OCEAN',
       'cat__ocean_proximity_INLAND', 'cat__ocean_proximity_ISLAND',
       'cat__ocean_proximity_NEAR BAY', 'cat__ocean_proximity_NEAR OCEAN',
       'geo__Cluster 0 similarity', 'geo__Cluster 1 similarity',
       'geo__Cluster 2 similarity', 'geo__Cluster 3 similarity',
       'geo__Cluster 4 similarity', 'geo__Cluster 5 similarity',
       'geo__Cluster 6 similarity', 'geo__Cluster 7 similarity',
       'geo__Cluster 8 similarity', 'geo__Cluster 9 similarity',
       'remainder__housing_median_age', 'remainder__median_house_value'],
      dtype=object)

## Linear Regression Method

In [77]:
# Convert the numpy array back to a DataFrame
df = pd.DataFrame(housing_prepared, columns=preprocessing.get_feature_names_out())

df.head()

Unnamed: 0,bedrooms__ratio,rooms_per_house__ratio,people_per_house__ratio,log__total_bedrooms,log__total_rooms,log__population,log__households,log__median_income,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,...,geo__Cluster 2 similarity,geo__Cluster 3 similarity,geo__Cluster 4 similarity,geo__Cluster 5 similarity,geo__Cluster 6 similarity,geo__Cluster 7 similarity,geo__Cluster 8 similarity,geo__Cluster 9 similarity,remainder__housing_median_age,remainder__median_house_value
0,-0.211785,-0.174916,0.051376,0.53019,0.553949,0.970437,0.617532,-0.138118,0.0,0.0,...,0.033969,4.808071e-20,1.012848e-18,3.911411e-10,8.72546e-43,8e-06,0.157687,9.330202e-25,0.34849,-0.901189
1,0.342185,-0.402835,-0.117362,0.841039,0.658854,0.21082,0.881846,0.194031,0.0,0.0,...,0.908636,4.117604e-12,3.323517e-11,4.369537e-05,9.481086000000001e-31,0.028986,0.765456,8.596216e-16,1.618118,1.512771
2,-0.661658,0.088216,-0.03228,-0.343523,-0.111731,-0.280239,-0.225887,0.377064,0.0,0.0,...,0.005645,0.0001597003,4.405316e-05,0.01053985,2.292419e-18,0.492835,0.000205,7.220443e-07,-1.95271,-0.299213
3,0.783032,-0.600015,0.077507,-0.202319,-0.496778,0.314174,-0.150748,-1.235444,0.0,0.0,...,0.039101,8.720210999999999e-20,1.546906e-18,5.007039e-10,1.665847e-42,1.1e-05,0.165174,1.752503e-24,0.586545,-0.98422
4,-0.550364,0.349007,-0.068832,0.017531,0.191691,-0.342443,-0.057787,0.045308,0.0,1.0,...,6e-05,0.008830894,0.08856031,0.9249874,2.608742e-11,0.027763,3e-06,0.0004555046,1.142008,-0.957408
