In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from statsmodels.api import OLS

In [None]:
train_data = pd.read_csv("/content/sample_data/california_housing_train.csv")
test_data = pd.read_csv("/content/sample_data/california_housing_test.csv")
print(train_data.head())
print(test_data.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -114.31     34.19                15.0       5612.0          1283.0   
1    -114.47     34.40                19.0       7650.0          1901.0   
2    -114.56     33.69                17.0        720.0           174.0   
3    -114.57     33.64                14.0       1501.0           337.0   
4    -114.57     33.57                20.0       1454.0           326.0   

   population  households  median_income  median_house_value  
0      1015.0       472.0         1.4936             66900.0  
1      1129.0       463.0         1.8200             80100.0  
2       333.0       117.0         1.6509             85700.0  
3       515.0       226.0         3.1917             73400.0  
4       624.0       262.0         1.9250             65500.0  
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.2

In [None]:
test_data.drop(columns=['latitude', 'longitude'], axis=1, inplace=True)
train_data.drop(columns=['latitude', 'longitude'], axis=1, inplace=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   housing_median_age  3000 non-null   float64
 1   total_rooms         3000 non-null   float64
 2   total_bedrooms      3000 non-null   float64
 3   population          3000 non-null   float64
 4   households          3000 non-null   float64
 5   median_income       3000 non-null   float64
 6   median_house_value  3000 non-null   float64
dtypes: float64(7)
memory usage: 164.2 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   housing_median_age  17000 non-null  float64
 1   total_rooms         17000 non-null  float64
 2   total_bedrooms      17000 non-null  float64
 3   population          17000 non-null  float64
 4  

In [None]:
feature_cols = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income'
]

X_train = train_data[feature_cols]
y_train = train_data['median_house_value']
X_test = test_data[feature_cols]
y_test = test_data['median_house_value']

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [None]:
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
y_pred = np.round(y_pred)

OLS(y_test, y_pred).fit().summary()

0,1,2,3
Dep. Variable:,median_house_value,R-squared (uncentered):,0.895
Model:,OLS,Adj. R-squared (uncentered):,0.895
Method:,Least Squares,F-statistic:,25450.0
Date:,"Fri, 29 Aug 2025",Prob (F-statistic):,0.0
Time:,10:38:34,Log-Likelihood:,-37982.0
No. Observations:,3000,AIC:,75970.0
Df Residuals:,2999,BIC:,75970.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.9860,0.006,159.541,0.000,0.974,0.998

0,1,2,3
Omnibus:,624.02,Durbin-Watson:,2.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1934.718
Skew:,1.053,Prob(JB):,0.0
Kurtosis:,6.323,Cond. No.,1.0


In [None]:
y_train_pred = model.predict(X_train)
y_train_pred = np.round(y_train_pred)

OLS(y_train, y_train_pred).fit().summary()

0,1,2,3
Dep. Variable:,median_house_value,R-squared (uncentered):,0.898
Model:,OLS,Adj. R-squared (uncentered):,0.898
Method:,Least Squares,F-statistic:,149400.0
Date:,"Fri, 29 Aug 2025",Prob (F-statistic):,0.0
Time:,10:42:35,Log-Likelihood:,-215160.0
No. Observations:,17000,AIC:,430300.0
Df Residuals:,16999,BIC:,430300.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.0000,0.003,386.459,0.000,0.995,1.005

0,1,2,3
Omnibus:,3327.534,Durbin-Watson:,1.184
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11037.56
Skew:,0.986,Prob(JB):,0.0
Kurtosis:,6.42,Cond. No.,1.0
