In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the pre-split housing data (paths are relative to the notebook).
train_data = pd.read_csv('../house_data/train.csv')
test_data = pd.read_csv('../house_data/test.csv')

# Columns that must not be used as predictors:
#   - 'price' is the regression target.
#   - 'id', 'date' and 'zipcode' are deliberately left out of the model.
#   - 'Unnamed: 0' is the name pandas assigns to a header-less CSV column,
#     presumably the row index written when the split files were saved.
#     It describes the row's position, not the house, so it is excluded
#     rather than being fitted as a feature.
columns_to_exclude = ['id', 'date', 'zipcode', 'price', 'Unnamed: 0']
feature_columns = [col for col in train_data.columns if col not in columns_to_exclude]

# Separate features and target; the target is expressed in thousands of
# dollars, so every error metric below is in (thousands of dollars)
# units as well.
x_train = train_data[feature_columns]
y_train = train_data['price'] / 1000

x_test = test_data[feature_columns]
y_test = test_data['price'] / 1000

# Standardize the features. The scaler is fitted on the training split only
# and then re-used for the test split, so no test statistics leak into
# training.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Ordinary least squares on the standardized features.
model = LinearRegression()
model.fit(x_train_scaled, y_train)

# Get predictions for both splits
y_train_pred = model.predict(x_train_scaled)
y_test_pred = model.predict(x_test_scaled)

# Calculate metrics for the training and test data.
# MSE is in (thousands of dollars)^2; take its square root for an error in
# thousands of dollars.
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

# ---- Text report: coefficients, per-split metrics, train/test comparison ----
banner = '=' * 60

print(banner)
print("LINEAR REGRESSION MODEL RESULTS")
print(banner)
print("\n1. MODEL COEFFICIENTS:")
print(f"\nIntercept: {model.intercept_:.4f}")
print("\nFeature Coefficients:")
print("-" * 60)

# Coefficient table, ordered by absolute magnitude (largest first).
coef_df = (
    pd.DataFrame({'Feature': feature_columns, 'Coefficient': model.coef_})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values('Abs_Coefficient', ascending=False)
)

for feature_name, coefficient in zip(coef_df['Feature'], coef_df['Coefficient']):
    print(f"{feature_name:20s}: {coefficient:10.4f}")

# One section per data split: heading followed by MSE and R2.
for heading, mse_value, r2_value in (
    ("2. TRAINING SET METRICS", train_mse, train_r2),
    ("3. TESTING SET METRICS:", test_mse, test_r2),
):
    print("\n" + banner)
    print(heading)
    print(banner)
    print(f"Mean Squared Error: {mse_value:.4f}")
    print(f"R2 Score: {r2_value:.4f}")

# How much worse the model does on unseen data than on the data it was fit to.
mse_ratio = test_mse / train_mse
r2_gap = train_r2 - test_r2

print("\n" + banner)
print("COMPARISON:")
print(banner)
print(f"MSE Ratio (Test/Train): {mse_ratio:.4f}")
print(f"R2 Difference (Train - Test): {r2_gap:.4f}")

LINEAR REGRESSION MODEL RESULTS

1. MODEL COEFFICIENTS:

Intercept: 520.4148

Feature Coefficients:
------------------------------------------------------------
grade               :    92.5111
lat                 :    78.1299
yr_built            :   -68.0432
waterfront          :    64.2309
sqft_living         :    57.1616
sqft_above          :    48.4391
view                :    47.6103
sqft_living15       :    45.4791
sqft_basement       :    27.6888
bathrooms           :    18.4569
yr_renovated        :    17.3419
sqft_lot15          :   -12.9066
bedrooms            :   -12.8073
condition           :    12.6476
sqft_lot            :    11.1273
Unnamed: 0          :     8.4560
floors              :     8.1510
long                :    -1.4377

2. TRAINING SET METRICS
Mean Squared Error: 31415.7479
R2 Score: 0.7271

3. TESTING SET METRICS:
Mean Squared Error: 58834.6740
R2 Score: 0.6471

COMPARISON:
MSE Ratio (Test/Train): 1.8728
R2 Difference (Train - Test): 0.0800


# Interpretation of the results
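- The features with the largest coefficients are the grade of the house (92.51), latitude (78.13) and year built (-68.04). These values are regression coefficients on standardized features (thousands of dollars per one standard deviation of the feature), not correlations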
- The model has a moderate fit with the possibility of overfitting
    - The 0.08 difference in R2 score (0.7271 on training vs. 0.6471 on testing) suggests the model has picked up some patterns specific to the training data that don't generalize well
    - about 35.3% of the variance in the test data remains to be explained, suggesting there could be important factors not captured by the linear relationships with the features
- The root mean squared error (the square root of the MSE, with prices in thousands of dollars) was about $177,250 for the training data and about $242,560 for the testing data
    - This level of error represents significant uncertainty in price predictions from the model
- The MSE on the testing data is 1.87 times the MSE on the training data, which suggests the model is overfitting to the training data
