In [66]:
%pip install numpy pandas scikit-learn imbalanced-learn matplotlib seaborn

Note: you may need to restart the kernel to use updated packages.


In [67]:
import numpy as np
import pandas as pd

df = pd.read_csv('./Housing.csv')
print(df.head())

      price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0  13300000  7420         4          2        3      yes        no       no   
1  12250000  8960         4          4        4      yes        no       no   
2  12250000  9960         3          2        2      yes        no      yes   
3  12215000  7500         4          2        2      yes        no      yes   
4  11410000  7420         4          1        2      yes       yes      yes   

  hotwaterheating airconditioning  parking prefarea furnishingstatus  
0              no             yes        2      yes        furnished  
1              no             yes        3       no        furnished  
2              no              no        2      yes   semi-furnished  
3              no             yes        3      yes        furnished  
4              no             yes        2       no        furnished  


In [68]:
# Encode specific binary columns
binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
df[binary_cols] = df[binary_cols].apply(lambda col: col.str.lower())
df[binary_cols] = df[binary_cols].replace({'yes': 1, 'no': 0}).astype(int)
print(df.head())

      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

  furnishingstatus  
0        furnished  
1        furnished  
2   semi-furnished  
3        furnished  
4        furnished  


  df[binary_cols] = df[binary_cols].replace({'yes': 1, 'no': 0}).astype(int)


In [69]:
# Get unique values
print(df["furnishingstatus"].unique())

['furnished' 'semi-furnished' 'unfurnished']


In [70]:
# One-hot encode furnishingstatus
df = pd.get_dummies(df, columns=['furnishingstatus'])
print(df.head())

      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus_furnished  furnishingstatus_semi-furnished  \
0                        True                            False   
1                        T

In [71]:
df = df.astype({col: int for col in df.select_dtypes('bool').columns})
print(df.head())

      price  area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0  13300000  7420         4          2        3         1          0   
1  12250000  8960         4          4        4         1          0   
2  12250000  9960         3          2        2         1          0   
3  12215000  7500         4          2        2         1          0   
4  11410000  7420         4          1        2         1          1   

   basement  hotwaterheating  airconditioning  parking  prefarea  \
0         0                0                1        2         1   
1         0                0                1        3         0   
2         1                0                0        2         1   
3         1                0                1        3         1   
4         1                0                1        2         0   

   furnishingstatus_furnished  furnishingstatus_semi-furnished  \
0                           1                                0   
1                         

In [72]:
from sklearn.preprocessing import StandardScaler
def scale_dataset(df):
    X = df.drop("price", axis=1)
    y = df["price"]
    scaler = StandardScaler()
    features_to_be_scaled = ["area"]
    X[features_to_be_scaled] = scaler.fit_transform(X[features_to_be_scaled])
    data = np.hstack([X, y.values.reshape(-1, 1)])
    return data,X,y

In [73]:
from sklearn.model_selection import train_test_split

train, temp = train_test_split(
    df, test_size=0.3, random_state=42
)

val, test = train_test_split(
    temp, test_size=0.5, random_state=42
)


In [74]:
data_train, X_train, y_train = scale_dataset(train)
data_val, X_val, y_val = scale_dataset(val)
data_test, X_test, y_test = scale_dataset(test)

print(X_train.head())

         area  bedrooms  bathrooms  stories  mainroad  guestroom  basement  \
126  0.934301         3          1        1         1          0         1   
363 -0.710246         2          1        1         1          0         0   
370 -0.390167         2          1        1         1          0         0   
31   0.860719         3          1        4         1          0         0   
113  2.065617         3          1        1         1          0         1   

     hotwaterheating  airconditioning  parking  prefarea  \
126                0                0        2         1   
363                1                0        0         0   
370                0                1        2         0   
31                 0                1        2         0   
113                0                0        2         1   

     furnishingstatus_furnished  furnishingstatus_semi-furnished  \
126                           0                                0   
363                           0   

In [75]:
from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()
regression_model.fit(X_train, y_train)



In [76]:
y_val_pred = regression_model.predict(X_val)
print(y_val_pred)

[3943503.90654066 4539836.03238287 3240965.28152913 6458139.21045602
 3699635.21610088 5579076.10150898 4064309.8321191  5930083.60652542
 3498607.43654938 7927846.36915466 4168210.82329273 3901069.88052467
 3668550.67609153 7953470.16134262 5221991.03523587 5363625.34889638
 3940670.95690913 5185999.93288959 4406088.9430602  3507343.93817026
 3603954.03081767 3273186.90417321 4495174.64649202 3596285.41329449
 5774125.58655258 7641033.91585495 4420408.28883014 3525010.51156083
 6651406.48070819 6351572.82292645 3996371.44890499 4667037.14277784
 2301593.04005798 7087989.80293661 5974363.93758567 5321322.67442955
 5275288.35155357 6735982.15557714 8307588.24431388 2737497.27255449
 8026547.71543022 3190900.65991016 3374392.62718587 2705091.9731835
 2563972.46978712 4455079.44961124 4735692.80121554 4535781.6383497
 4023971.25281667 3104931.83084992 5407520.82254697 4160102.04460727
 3727222.84682753 6395693.08987901 7352475.39784854 7022707.58658871
 4208342.25020993 3755014.9186602  7

In [89]:
from sklearn.metrics import mean_squared_error, r2_score
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
val_r2 = r2_score(y_val, y_val_pred)

print("\nValidation Results:")
print("RMSE:", val_rmse)
print("R² Score:", val_r2)

print(regression_model.coef_)


Validation Results:
RMSE: 1328132.020620229
R² Score: 0.5916473486222811
[ 550762.64317168   80893.141271   1114751.17782697  417267.58010832
  408073.6819066   275710.53305175  482603.52188223  616375.42271861
  685839.34559965  303111.23633571  509192.08881309  170947.96792849
   49295.30492813 -220243.27285662]


In [78]:
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.1, random_state=42)  # alpha = regularization strength
lasso.fit(X_train, y_train)

In [81]:
y_val_pred_lasso = lasso.predict(X_val)
print(y_val_pred_lasso)

[3943504.34213717 4539836.02162813 3240965.99674748 6458138.92088942
 3699635.63447255 5579075.66125586 4064310.12277794 5930081.13671616
 3498608.28269328 7927845.81875231 4168211.22210108 3901069.83080883
 3668551.35710421 7953470.31764813 5221991.01878395 5363625.92164336
 3940671.23065701 5186000.20148568 4406089.39020373 3507344.53761629
 3603954.35083768 3273188.2801851  4495174.69623886 3596285.78273959
 5774125.13843919 7641033.3584501  4420407.74683492 3525011.02724963
 6651406.17600533 6351572.00238965 3996371.87048134 4667037.54232138
 2301593.91218688 7087989.37429333 5974363.48879095 5321322.88336871
 5275288.36828052 6735982.31550534 8307585.27217675 2737498.04671604
 8026546.56131687 3190901.15308692 3374393.46582669 2705092.45174886
 2563973.22560738 4455079.30041495 4735692.9720853  4535782.26073935
 4023971.36965911 3104932.0460126  5407521.05762815 4160102.00149012
 3727223.18587204 6395692.27401586 7352475.46293593 7022707.16620771
 4208342.60975793 3755015.61784122

In [82]:
val_rmse_lasso = np.sqrt(mean_squared_error(y_val, y_val_pred_lasso))
val_r2_lasso = r2_score(y_val, y_val_pred_lasso)

print("\nValidation Results:")
print("RMSE:", val_rmse_lasso)
print("R² Score:", val_r2_lasso)


Validation Results:
RMSE: 1328132.0641596988
R² Score: 0.5916473218486503


In [83]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0, random_state=42)  # alpha = regularization strength
ridge.fit(X_train, y_train)


In [85]:
y_val_pred_ridge = ridge.predict(X_val)
print(y_val_pred_lasso)


[3943504.34213717 4539836.02162813 3240965.99674748 6458138.92088942
 3699635.63447255 5579075.66125586 4064310.12277794 5930081.13671616
 3498608.28269328 7927845.81875231 4168211.22210108 3901069.83080883
 3668551.35710421 7953470.31764813 5221991.01878395 5363625.92164336
 3940671.23065701 5186000.20148568 4406089.39020373 3507344.53761629
 3603954.35083768 3273188.2801851  4495174.69623886 3596285.78273959
 5774125.13843919 7641033.3584501  4420407.74683492 3525011.02724963
 6651406.17600533 6351572.00238965 3996371.87048134 4667037.54232138
 2301593.91218688 7087989.37429333 5974363.48879095 5321322.88336871
 5275288.36828052 6735982.31550534 8307585.27217675 2737498.04671604
 8026546.56131687 3190901.15308692 3374393.46582669 2705092.45174886
 2563973.22560738 4455079.30041495 4735692.9720853  4535782.26073935
 4023971.36965911 3104932.0460126  5407521.05762815 4160102.00149012
 3727223.18587204 6395692.27401586 7352475.46293593 7022707.16620771
 4208342.60975793 3755015.61784122

In [88]:
rmse_ridge = np.sqrt(mean_squared_error(y_val, y_val_pred_ridge))
r2_ridge = r2_score(y_val, y_val_pred_ridge)

print("Validation Results with L2 (Ridge):")
print("RMSE:", rmse_ridge)
print("R² Score:", r2_ridge)

print("\nCoefficients:")
print(ridge.coef_)

Validation Results with L2 (Ridge):
RMSE: 1327576.488579299
R² Score: 0.5919888892736643

Coefficients:
[ 552098.87589371   84751.80657678 1100442.89190256  418770.48667806
  400504.05599796  274562.21110328  480471.14437009  582111.96798344
  676250.02755813  304684.17030016  502669.07952773  171539.78073761
   49325.11316746 -220864.89390503]


In [98]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import GridSearchCV, train_test_split

binary_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']
numeric_scale = ['area']           # only scale this
numeric_keep = ['bedrooms', 'bathrooms', 'stories', 'parking']  # keep as-is
categorical_cols = ['furnishingstatus']

def yes_no_to_binary(X):
    return X.applymap(lambda val: 1 if str(val).lower() == "yes" else 0)

preprocessor = ColumnTransformer(transformers=[
    ("binary", FunctionTransformer(lambda X: yes_no_to_binary(X)), binary_cols),
    ("scaled_num", StandardScaler(), numeric_scale),
    ("num_keep", "passthrough", numeric_keep),
    ("cat", OneHotEncoder(), categorical_cols)
])

pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LinearRegression())  # placeholder, GridSearch will replace
])
df = pd.read_csv('./Housing.csv')
X = df.drop("price", axis=1)  # target = price
y = df["price"]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

param_grid = [
    {"model": [LinearRegression()]},  # plain linear regression
    {"model": [Ridge()], "model__alpha": [0.1, 1, 10, 50, 100]},
    {"model": [Lasso(max_iter=10000)], "model__alpha": [0.001, 0.01, 0.1, 1, 10]}
]

grid = GridSearchCV(pipe, param_grid, cv=5, scoring="r2", n_jobs=-1)

grid.fit(X_train, y_train)

# ------------------------------
# Results
# ------------------------------
print("Best Model:", grid.best_estimator_)
print("Best Params:", grid.best_params_)
print("Best CV Score (R²):", grid.best_score_)

# Validation set score
val_score = grid.score(X_val, y_val)
print("Validation R² Score:", val_score)

# Predictions
y_val_pred = grid.predict(X_val)

# RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Validation RMSE:", rmse)




Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('binary',
                                                  FunctionTransformer(func=<function <lambda> at 0x169a3ea20>),
                                                  ['mainroad', 'guestroom',
                                                   'basement',
                                                   'hotwaterheating',
                                                   'airconditioning',
                                                   'prefarea']),
                                                 ('scaled_num',
                                                  StandardScaler(), ['area']),
                                                 ('num_keep', 'passthrough',
                                                  ['bedrooms', 'bathrooms',
                                                   'stories', 'parking']),
                                                 ('cat', OneHotEncod

  return X.applymap(lambda val: 1 if str(val).lower() == "yes" else 0)
  return X.applymap(lambda val: 1 if str(val).lower() == "yes" else 0)
  return X.applymap(lambda val: 1 if str(val).lower() == "yes" else 0)
