In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score

In [None]:
# Load the raw housing table; the path is resolved relative to the current
# working directory of the kernel.
house_data = pd.read_csv(filepath_or_buffer="housing.csv")

In [None]:
# Show the first five rows as a quick sanity check of the load.
house_data.head(n=5)

In [None]:
# Separate the regression target from the predictors: the target is the
# "median_house_value" column, the features are every other column.
y_target = house_data["median_house_value"]
x_features = house_data.drop(columns=["median_house_value"])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
(
    x_features_train,
    x_features_test,
    y_target_train,
    y_target_test,
) = train_test_split(
    x_features,
    y_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (rows, columns) of the training and test feature partitions, in that order.
tuple(partition.shape for partition in (x_features_train, x_features_test))

In [None]:
# Partition the training columns into the two groups consumed by the
# ColumnTransformer.
#
# The categorical group is defined as "every column that is not numeric"
# rather than "every column of dtype object". That makes the two groups an
# exact partition of the frame: a text column is picked up whether pandas
# types it as object, string/str or category, instead of silently falling
# through to remainder='passthrough' un-encoded and breaking the regressor.
# Note that non-numeric also covers bool and datetime columns, if any exist.
numerical_cols = x_features_train.select_dtypes(include=np.number).columns
categorical_cols = x_features_train.select_dtypes(exclude=np.number).columns

In [None]:
# Numeric branch: replace missing values with the column median, then
# standardise each column with StandardScaler.
num_pipeline = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array.
#
# drop='first' is intentionally NOT combined with handle_unknown='ignore':
# an unseen category is encoded as an all-zero row, which is exactly how the
# dropped baseline category would be encoded, so the two would be
# indistinguishable (scikit-learn warns about this at transform time).
# Keeping every indicator column reserves the all-zero row for unknown
# categories; the extra collinear column is harmless for the tree-based
# regressor this notebook fits.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

In [None]:
# Send each column group through its own sub-pipeline; any column that is in
# neither group is forwarded unchanged (remainder="passthrough").
preprocessor = ColumnTransformer(
    [
        ("num", num_pipeline, numerical_cols),
        ("cat", cat_pipeline, categorical_cols),
    ],
    remainder="passthrough",
)

In [None]:
# End-to-end estimator: preprocessing followed by a random forest regressor.
# The forest is seeded with the same value as the train/test split so that a
# Restart & Run All reproduces the same fitted trees and the same R^2 score;
# without a seed the bootstrap samples and feature subsets differ every run.
model = Pipeline(steps=[
    ("preprocessing", preprocessor),
    ("model", RandomForestRegressor(random_state=42)),
])

# Alternative
# from sklearn.pipeline import make_pipeline
# model = make_pipeline(preprocessor,RandomForestRegressor())       # Here we don't need to pass names

In [None]:
# Fit the preprocessing steps and the regressor on the training partition only.
model.fit(X=x_features_train, y=y_target_train)

In [None]:
# Predict on the held-out features; the pipeline applies the already-fitted
# preprocessing before the regressor.
y_pred = model.predict(X=x_features_test)

In [None]:
# Coefficient of determination (R^2) of the predictions on the held-out set.
r2_score(y_true=y_target_test, y_pred=y_pred)

In [None]:
# model.named_steps
# model.named_steps['preprocessing'].transformers_[1][1].named_steps['encoder'].get_feature_names_out()

In [None]:
# import pickle
# import numpy as np

In [None]:
# with open("model.pkl", "wb") as model_file:
#     pickle.dump(model, model_file)

Predicting after pickle dump

In [None]:
# import numpy as np
# import pickle
# with open('model.pkl', 'rb') as model_file:   # only unpickle files you trust
#     model = pickle.load(model_file)

In [None]:
# test_input = [-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,'NEAR BAY']

In [None]:
# import pandas as pd

# input_data = pd.DataFrame(
#     [[-122.23, 37.88, 41.0, 880.0, 129.0, 322.0, 126.0, 8.3252, "NEAR BAY"]],
#     columns=[
#         "longitude",
#         "latitude",
#         "housing_median_age",
#         "total_rooms",
#         "total_bedrooms",
#         "population",
#         "households",
#         "median_income",
#         "ocean_proximity"
#     ]
# )

In [None]:
# prediction = model.predict(input_data)
# print("Predicted median house value:", prediction[0])
