# 🏡 California Housing Price Prediction using Machine Learning

**Author:** Kishlaya Sinha  
**GitHub:** [Kishlaya20sinha](https://github.com/Kishlaya20sinha)

This project uses scikit-learn regression algorithms to predict median house values from California census data.
It includes data preprocessing, exploratory data analysis (EDA), several ML models (linear regression, decision tree, random forest), evaluation metrics, and visualizations.


In [3]:
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the California housing CSV into a DataFrame.
# NOTE(review): this is an absolute path on one specific Windows machine;
# point it at your own copy of housing.csv before running.
housing=pd.read_csv(r"C:\Users\BIT\Downloads\housing.csv (2)\housing.csv")

In [None]:
# Preview the first rows.
housing.head()

In [None]:
# Column dtypes and non-null counts.
housing.info()

In [None]:
# Frequency of each value of ocean_proximity, the column that is later
# one-hot encoded as the only categorical attribute.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics of the numeric columns.
housing.describe()

In [None]:
 # Render matplotlib figures inline in the notebook.
 %matplotlib inline   
 import matplotlib.pyplot as plt
 # One 50-bin histogram per numeric column.
 housing.hist(bins=50, figsize=(20,15))
 plt.show()

In [None]:
# Replace the index with a fresh 0..n-1 range (the old index is discarded), so
# the positional indices produced by the stratified split below are also valid
# row labels.
housing = housing.reset_index(drop=True)


In [None]:
 # Bucket median income for stratified sampling: divide by 1.5, round up,
 # then merge every bucket of 5 or more into a single category 5.0.
 housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
 # Assign the result back instead of calling .where(..., inplace=True) on the
 # column selection: that chained in-place form is deprecated and leaves
 # `housing` unchanged once pandas copy-on-write is active.
 housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single 80/20 split stratified on the income category; the fixed seed makes
# the split repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields integer row positions, not index labels, so select with
    # .iloc; .loc is only equivalent when the index happens to be 0..n-1.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

In [None]:
# Share of each income category in the test set.
strat_test_set["income_cat"].value_counts() / len(strat_test_set)

In [None]:
# Remove the helper column from both split sets, in place.
# (The full `housing` frame is not part of this loop and keeps the column.)
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

In [None]:
# Geographic scatter plot: one point per row at (longitude, latitude).
housing.plot(kind="scatter", x="longitude", y="latitude",edgecolor="black",
    linewidth=0.5,       # Optional: thickness of the marker outline
    alpha=0.2,         # Optional: transparency, so overlapping points show density
    figsize=(10, 7))

In [None]:
# Same map, with marker size taken from population / 100 and marker colour
# taken from median_house_value ("jet" colormap, colour bar shown).
housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
    s=housing["population"]/100, label="population", figsize=(10,7),
    c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
 )
plt.legend()

In [None]:
# Keep only the float64/int64 columns; every other dtype is left out.
numeric_housing = housing.select_dtypes(include=["float64", "int64"])


In [None]:
# Pairwise correlation between the numeric columns.
corr_matrix = numeric_housing.corr()

In [None]:
 # Correlation of each numeric column with median_house_value, sorted in
 # descending order.
 corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix
# Columns plotted against each other: median_house_value plus three features.
attributes = ["median_house_value", "median_income", "total_rooms",
 "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))

In [None]:
# median_house_value against median_income; the low alpha keeps dense regions
# distinguishable from sparse ones.
housing.plot(kind="scatter", x="median_income", y="median_house_value",edgecolor="darkblue",
 alpha=0.1)

In [None]:
# Derived ratio features built from the raw count columns.
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"]=housing["population"]/housing["households"]

In [None]:
# Re-select the numeric columns so the three new ratios are included.
numeric_housing = housing.select_dtypes(include=["float64", "int64"])


In [None]:
 # Recompute the correlation matrix with the new columns.
 corr_matrix = numeric_housing.corr()

In [None]:
# Correlation with median_house_value again, now including the ratio features.
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from sklearn.impute import SimpleImputer

# Imputer that fills missing values with the median of each column.
imputer = SimpleImputer(strategy="median")

In [None]:
 # Learn the per-column medians from the numeric columns.
 imputer.fit(numeric_housing)


In [None]:
 # The learned fill values, one per column of numeric_housing.
 imputer.statistics_

In [None]:
# Apply the imputer; the result is wrapped back into a DataFrame in the next cell.
X = imputer.transform(numeric_housing)

In [None]:
# Wrap the imputed array back into a DataFrame, restoring both the column
# names and the original row index; without `index=` the rows would get a
# fresh 0..n-1 index that need not line up with `numeric_housing`.
housing_tr = pd.DataFrame(X, columns=numeric_housing.columns,
                          index=numeric_housing.index)

In [None]:
# The ocean_proximity column, and a seeded random sample of 10 of its values.
housing_cat = housing["ocean_proximity"]
housing_cat.sample(10,random_state=42)

In [None]:
# factorize() returns an integer code per row plus the distinct categories.
housing_cat_encoded, housing_categories = housing_cat.factorize()
housing_cat_encoded[:10]

In [None]:
 # The distinct categories returned by factorize().
 housing_categories

In [None]:
# From here on `housing` is a copy of the stratified training set.
# NOTE(review): housing_cat and housing_cat_encoded were taken from the previous
# `housing` frame before this reassignment, so the encoders below are fitted on
# those rows, not on the training set only.
housing = strat_train_set.copy()

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the integer codes; reshape(-1,1) turns them into a single
# 2-D column.
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))
housing_cat_1hot

In [None]:
# Convert the encoder output to a regular array for display.
housing_cat_1hot.toarray()

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Same encoding, this time fitted directly on the category values.
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat.values.reshape(-1, 1))

In [None]:
 # Categories learned by the encoder.
 cat_encoder.categories_

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np  # Make sure numpy is imported if not already

# Column positions read by CombinedAttributesAdder.transform from its input
# array. They match the order of the explicit num_attribs list defined later:
# total_rooms (3), total_bedrooms (4), population (5), households (6).
rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends ratio columns to a 2-D array.

    Input columns are addressed through the module-level positions
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``household_ix``.
    Appended columns, in order: rooms per household, population per
    household and, when ``add_bedrooms_per_room`` is true, bedrooms per room.
    """

    def __init__(self, add_bedrooms_per_room=True):
        # Explicit keyword parameter only: no *args or **kwargs.
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns concatenated on the right."""
        households = X[:, household_ix]
        rooms = X[:, rooms_ix]
        appended = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            appended.append(X[:, bedrooms_ix] / rooms)
        # Same as np.c_[X, col1, col2, ...], with the columns unpacked.
        return np.c_[(X, *appended)]

# Usage: append the two per-household ratios (bedrooms_per_room switched off)
# to the raw values of the current `housing` frame.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer  # Use SimpleImputer, not Imputer

# Numeric preprocessing chain: median imputation, then the ratio features,
# then standardization.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

# Apply the pipeline to the numeric data.
# NOTE(review): numeric_housing was selected before `housing` was replaced by
# the training set and still includes median_house_value; the model input used
# later comes from full_pipeline, not from this result.
housing_num_tr = num_pipeline.fit_transform(numeric_housing)


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks fixed columns out of a DataFrame.

    ``attribute_names`` holds the column labels to keep; ``transform``
    returns the selection as its ``.values`` array.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """No fitting required; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the configured columns of ``X`` as an array."""
        selected = X[self.attribute_names]
        return selected.values


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer  # Use SimpleImputer instead of deprecated Imputer
from sklearn.preprocessing import StandardScaler

# Define attribute groups
# NOTE(review): list(numeric_housing) yields every column label of
# numeric_housing; a later cell replaces num_attribs with an explicit list.
num_attribs = list(numeric_housing)  # column labels of the numeric_housing DataFrame
cat_attribs = ["ocean_proximity"]

# Numerical pipeline: select the numeric columns, impute medians, add the
# ratio features, standardize.
# NOTE(review): full_pipeline below builds its own numeric steps inline and does
# not reference this object.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', SimpleImputer(strategy="median")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# Categorical pipeline: select the categorical columns, fill missing values
# with the most frequent one, then one-hot encode. handle_unknown='ignore'
# keeps transform() from failing on a category not seen during fit.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [None]:
# Explicit list of the raw numeric feature columns, overriding the earlier
# num_attribs. The order matters: CombinedAttributesAdder reads positions
# 3-6 (total_rooms, total_bedrooms, population, households).
num_attribs = [
    'longitude', 'latitude', 'housing_median_age',
    'total_rooms', 'total_bedrooms', 'population',
    'households', 'median_income'
]


In [None]:
# Final full pipeline: the 'num' steps are applied to the num_attribs columns
# and the 'cat' steps to the cat_attribs columns of the input DataFrame.
full_pipeline = ColumnTransformer([
    # Numeric branch: median imputation, ratio features, standardization.
    ('num', Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ]), num_attribs),

    # Categorical branch: most-frequent imputation, then one-hot encoding.
    ('cat', Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]), cat_attribs),
])



In [None]:
# Fit the preprocessing on `housing` and transform it in one step.
# NOTE(review): `housing` still contains median_house_value here; it is not
# listed in num_attribs/cat_attribs, so it should be dropped by the
# ColumnTransformer's default remainder handling - confirm.
housing_prepared = full_pipeline.fit_transform(housing)

In [None]:
housing_prepared

In [None]:
 housing_prepared.shape

In [None]:
# Training labels: a copy of the target column.
housing_labels = housing["median_house_value"].copy()


In [None]:
 from sklearn.linear_model import LinearRegression
 # Fit a linear regression on the prepared training data.
 lin_reg = LinearRegression()
 lin_reg.fit(housing_prepared, housing_labels)

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE measured on the same data the model was fitted on (training error).
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

In [None]:
 from sklearn.tree import DecisionTreeRegressor
 # Fit a decision tree with default settings.
 # NOTE(review): no random_state is passed, so results may vary between runs.
 tree_reg = DecisionTreeRegressor()
 tree_reg.fit(housing_prepared, housing_labels)

In [None]:
# RMSE measured on the same data the tree was fitted on (training error).
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
from sklearn.model_selection import cross_val_score
# 10-fold cross-validation of the tree. The scorer returns negative MSE, so the
# sign is flipped before taking the square root to get one RMSE per fold.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [None]:
def display_scores(scores):
    """Print the scores, then their mean and standard deviation.

    ``scores`` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array of per-fold RMSE values).
    """
    # Each value is produced lazily so a line is printed before the next
    # statistic is computed, in the order Scores, Mean, Standard deviation.
    report = (
        ("Scores:", lambda: scores),
        ("Mean:", lambda: scores.mean()),
        ("Standard deviation:", lambda: scores.std()),
    )
    for label, compute in report:
        print(label, compute())


In [None]:
# Per-fold RMSE of the decision tree, with mean and standard deviation.
display_scores(tree_rmse_scores)

In [None]:
# 10-fold cross-validation of the linear model; the scorer yields negative MSE,
# which is negated and square-rooted into one RMSE per fold.
lin_scores = cross_val_score(
    estimator=lin_reg,
    X=housing_prepared,
    y=housing_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
lin_rmse_scores = np.sqrt(np.negative(lin_scores))
display_scores(lin_rmse_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
import numpy as np

# Create and train the model. The fixed seed makes the forest, and therefore
# the scores below, reproducible, like the seeded train/test split.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

# 10-fold cross-validation (returns negative MSE, hence the minus sign below)
forest_scores = cross_val_score(
    forest_reg, housing_prepared, housing_labels,
    scoring="neg_mean_squared_error", cv=10
)

# Calculate RMSE from the negative MSE
forest_rmse_scores = np.sqrt(-forest_scores)

# Report through the shared helper so every model is summarized the same way
display_scores(forest_rmse_scores)


In [None]:
!pip install joblib

In [None]:
from sklearn.model_selection import GridSearchCV

# Two grids: 3 x 4 combinations with the default bootstrap setting, then
# 2 x 3 combinations with bootstrapping turned off (18 candidates in total).
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
# Seed the forest so the search is reproducible; an unseeded forest can rank
# the candidates differently on every run.
forest_reg = RandomForestRegressor(random_state=42)
# 5-fold cross-validation per candidate, scored by negative MSE.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error')
grid_search.fit(housing_prepared, housing_labels)

In [None]:
# Hyperparameter combination with the best mean cross-validation score.
grid_search.best_params_

In [None]:
# The estimator built with those hyperparameters.
grid_search.best_estimator_


In [None]:
 # Reference estimator with its hyperparameters spelled out explicitly.
 # 'squared_error' is the current name of the MSE split criterion; the old
 # alias 'mse' was removed from scikit-learn and is rejected at fit time.
 RandomForestRegressor(bootstrap=True, criterion='squared_error', max_depth=None,
            max_features=8, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=30, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [None]:
# Print the cross-validated RMSE next to each hyperparameter combination.
cvres = grid_search.cv_results_
# mean_test_score holds negative MSE values; convert them all to RMSE at once.
candidate_rmse = np.sqrt(-cvres["mean_test_score"])
for rmse, candidate_params in zip(candidate_rmse, cvres["params"]):
    print(rmse, candidate_params)

In [None]:
# Importance of each input column according to the best forest.
feature_importances = grid_search.best_estimator_.feature_importances_

In [None]:
feature_importances

In [None]:
# Fit the standalone categorical pipeline on the categorical columns.
# NOTE(review): the fitted result is not used by the cells below.
cat_pipeline.fit(housing[cat_attribs])


In [None]:
# Output column names of the one-hot encoder, prefixed with the input
# feature name passed here.
cat_encoder.get_feature_names_out(['ocean_proximity'])


In [None]:
# Rebuild the column names of housing_prepared in pipeline order:
# raw numeric features, the added ratio features, then the one-hot columns.
categorical_branch = full_pipeline.named_transformers_["cat"]
cat_encoder = categorical_branch.named_steps["onehot"]
one_hot_names = cat_encoder.get_feature_names_out(['ocean_proximity'])
cat_one_hot_attribs = list(one_hot_names)

extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
attributes = [*num_attribs, *extra_attribs, *cat_one_hot_attribs]

print(attributes)


In [None]:
# Pair each importance with its column name and list them from most to
# least important.
sorted(zip(feature_importances, attributes), reverse=True)

In [None]:
# Evaluate the tuned model on the held-out test set.
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()
# transform() only, no fit: reuse the preprocessing fitted on the training set.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
# End the cell with the bare name so the test RMSE is actually displayed,
# as the lin_rmse and tree_rmse cells do.
final_rmse