In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def view_housing_data(file_path):
    """Load a CSV file into a DataFrame and preview its first rows.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame. ``None`` is returned (and a message printed)
        when the file cannot be found, instead of raising.
    """
    try:
        frame = pd.read_csv(file_path)
        # Rich preview of the top rows through the notebook's display hook.
        display(frame.head())
    except FileNotFoundError:
        print("File not found. Please check the file path and try again.")
        frame = None
    return frame

# Path to the housing.csv file.
# NOTE(review): this is an absolute, machine-specific path; anyone else running the
# notebook must edit it to point at their own copy of the data.
file_path = "/Users/sm/Developer/Projects/Calefornia Housing Prices/data/housing.csv"

# Load the data and preview its first rows; housing_df is None if the file was not found.
housing_df = view_housing_data(file_path)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [33]:
# Summarize each column's dtype and non-null count to spot missing data.
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [34]:
# Count the missing values in each column.
housing_df.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

## Transformation pipeline
Scikit-learn provides the `Pipeline` class to help with sequences of transformations. Let's upgrade our first EDA by running the data through a pipeline:
- Let's start with numerical attributes


Remember: the standard scaler (`StandardScaler`) standardizes each feature so that the mean of the scaled values is 0 and their standard deviation is 1.
- The normalized values represent the number of standard deviations that the original value is from the mean.
- In this code, StandardScaler() initializes a new scaler. scaler.fit_transform(X) computes the mean and standard deviation of X for scaling later on (fit), and then scales X (transform). The transformed data - X_scaled - will have a mean of 0 and a variance of 1.


In [35]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Columns routed to each sub-pipeline.
num_attribs = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
cat_attribs = ["ocean_proximity"]

# Numerical columns: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("standardize", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category, then one-hot encode.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder()),
])

# Combined preprocessing step. Each entry is a 3-tuple:
# (name, transformer to apply, columns it is applied to).
preprocessing = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

# Bonus: listing every column name by hand is inconvenient, so select columns by
# dtype with make_column_selector instead.
# Each entry must still be a (name, transformer, columns) 3-tuple: the selector only
# replaces the explicit column list, it does not replace the transformer.
# Caveat: np.number matches every numeric column of the frame it is fitted on, so
# columns that must not be transformed (e.g. the median_house_value target, which is
# not in num_attribs) should be dropped from the frame beforehand.
preprocessing_optimized = ColumnTransformer([
    ("num", num_pipeline, make_column_selector(dtype_include=np.number)),
    ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
])



Let's apply it to our housing_data

It is most convenient to have a single transformer capable of handling all columns.
- Here we can use the `ColumnTransformer`, which applies `num_pipeline` to the numerical attributes and `cat_pipeline` to the categorical attributes.


In [36]:
# Fit the preprocessing transformer on the housing data and transform it in one step.
housing_prepared = preprocessing.fit_transform(housing_df)

# Wrap the transformed array in a DataFrame, using the transformer's generated
# feature names as columns and keeping the original row index.
housing_prepared_fr = pd.DataFrame(
    data=housing_prepared,
    index=housing_df.index,
    columns=preprocessing.get_feature_names_out(),
)

housing_prepared_fr.head()

Unnamed: 0,num__longitude,num__latitude,num__housing_median_age,num__total_rooms,num__total_bedrooms,num__population,num__households,num__median_income,cat__ocean_proximity_<1H OCEAN,cat__ocean_proximity_INLAND,cat__ocean_proximity_ISLAND,cat__ocean_proximity_NEAR BAY,cat__ocean_proximity_NEAR OCEAN
0,-1.327835,1.052548,0.982143,-0.804819,-0.972476,-0.974429,-0.977033,2.344766,0.0,0.0,0.0,1.0,0.0
1,-1.322844,1.043185,-0.607019,2.04589,1.357143,0.861439,1.669961,2.332238,0.0,0.0,0.0,1.0,0.0
2,-1.332827,1.038503,1.856182,-0.535746,-0.827024,-0.820777,-0.843637,1.782699,0.0,0.0,0.0,1.0,0.0
3,-1.337818,1.038503,1.856182,-0.624215,-0.719723,-0.766028,-0.733781,0.932968,0.0,0.0,0.0,1.0,0.0
4,-1.337818,1.038503,1.856182,-0.462404,-0.612423,-0.759847,-0.629157,-0.012881,0.0,0.0,0.0,1.0,0.0
