In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [11]:
# Load the dataset.
# The CSV is read from the notebook's working directory. The path is kept in a
# single variable so the location that is recorded and the location that is
# actually read cannot drift apart.
file_path = 'california_housing_train.csv'
data = pd.read_csv(file_path)

In [13]:
# Peek at the top five rows to see which columns exist and what their values look like
print(data.head(n=5))

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -114.31     34.19                15.0       5612.0          1283.0   
1    -114.47     34.40                19.0       7650.0          1901.0   
2    -114.56     33.69                17.0        720.0           174.0   
3    -114.57     33.64                14.0       1501.0           337.0   
4    -114.57     33.57                20.0       1454.0           326.0   

   population  households  median_income  median_house_value  
0      1015.0       472.0         1.4936             66900.0  
1      1129.0       463.0         1.8200             80100.0  
2       333.0       117.0         1.6509             85700.0  
3       515.0       226.0         3.1917             73400.0  
4       624.0       262.0         1.9250             65500.0  


In [15]:
# Predictors: the housing and demographic columns (longitude and latitude are left out)
X = data[
    [
        'housing_median_age',
        'total_rooms',
        'total_bedrooms',
        'population',
        'households',
        'median_income',
    ]
]

# Response the regression is asked to predict
y = data['median_house_value']

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training portion (fit() hands back the estimator)
model = LinearRegression().fit(X_train, y_train)

# Predict house values for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions now; the numbers are printed after the preview below
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Side-by-side preview of true and predicted values for the first few test rows
predictions_df = pd.DataFrame({'Actual': y_test.to_numpy(), 'Predicted': y_pred})
print(predictions_df.head())

# Report the error and the share of variance explained
print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {r2:.2f}')

# Fitted weights (one per column of X, in column order) and the constant term
print('Model Coefficients:', model.coef_)
print('Intercept:', model.intercept_)


     Actual      Predicted
0  142700.0  168910.337280
1  500001.0  436527.240023
2   61800.0  107848.501528
3  162800.0  156468.883156
4   90600.0  138092.631244
Mean Squared Error: 5558804420.35
R-squared: 0.60
Model Coefficients: [ 1.84979273e+03 -2.06522507e+01  9.59686929e+01 -3.25257775e+01
  1.28472574e+02  4.77056251e+04]
Intercept: -46224.835733641026
