In [3]:
import pandas as pd
from linear_regression import LinearRegression1

# Load the raw housing data; all preprocessing below operates on a copy of this frame.
df = pd.read_csv("housing.csv")
matrix = df.copy()

# Impute the missing total_bedrooms entries with the column median.
bedrooms = matrix["total_bedrooms"]
matrix["total_bedrooms"] = bedrooms.fillna(bedrooms.median())

# y is the target variable (house values); X (built below) holds the predictors.
y = matrix["median_house_value"].to_numpy(dtype=float)

# Numeric predictors, taken from the frame unchanged.
numeric_cols = [
    "longitude",
    "latitude",
    "housing_median_age",
    "total_rooms",
    "total_bedrooms",
    "population",
    "households",
    "median_income",
]
X_num = matrix[numeric_cols]

# One-hot encode ocean_proximity. drop_first=True leaves the first category out,
# so it acts as the baseline level for the remaining dummy columns.
X_categorical = pd.get_dummies(
    matrix["ocean_proximity"],
    prefix="ocean",
    drop_first=True,
    dtype=float,
)

# Predictor table: numeric columns followed by the dummy columns, in that order.
X_df = pd.concat([X_num, X_categorical], axis=1)

# Plain float array for the model, plus the matching column labels for reporting.
X = X_df.to_numpy(dtype=float)
feature_names = list(X_df.columns)

### Let's create an instance of the class and then call its `fit` method. Fitting the model to the data is what performs the regression.

In [4]:
# Create the regression model.
# NOTE(review): alpha=0.01 is presumably the significance level used by the model's
# tests / confidence intervals — confirm against linear_regression.py.
model = LinearRegression1(alpha=0.01)
# Fit on the predictor matrix X and target vector y. This is deliberately the last
# expression of the cell, so the notebook displays whatever fit() returns (the stored
# output shows a LinearRegression1 object).
model.fit(X, y)

<linear_regression.LinearRegression1 at 0x2bd5b20de80>

### d - the number of features:

### n - the size of the sample

### What is the sample variance?

### What is the standard deviation?

### Calculating the RMSE:

### Let's run some tests on the regression and its parameters to analyse our linear regression. First up: a significance test on the entire regression, a so-called F-test

In [5]:
# Overall significance test of the regression: the method returns the F-statistic
# together with its p-value.
F, p = model.significance_regression()

# Report both numbers, one "label value" line each.
for label, value in (("F-statistic:", F), ("p-value:", p)):
    print(label, value)

F-statistic: 3129.2889229152033
p-value: 0.0


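Extremely strong evidence against the null hypothesis (that all slope coefficients are zero). The p-value is printed as 0.0 because it is too small to be represented as a floating-point number, not because it is exactly zero. This means the relationship captured by the regression is very unlikely to be a product of random chance.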

In [6]:
# R-squared (coefficient of determination) of the fitted model, as returned by r_squares().
R2 = model.r_squares()
print("R-squared:", R2)

R-squared: 0.6454530166046624


$R^2$ (the coefficient of determination) lies between 0 and 1. A value close to 1 means that most of the variation in our target variable is explained by the model. We got around 0.65, i.e. the model explains roughly 65% of the variance in median house value.


### Now let's test the significance of each coefficient/parameter/feature

In [7]:
# Per-coefficient significance test: one t-statistic and one p-value per coefficient.
t_test, p_values = model.t_test_coefficiants()

# Row labels for the table. Assumes the model lists the intercept first, followed by
# one coefficient per column of X (in feature_names order) — TODO confirm.
all_feature_names = ["Intercept"] + feature_names

# zip() silently stops at the shortest input, which would mislabel or drop rows.
# Fail loudly instead if labels and statistics do not line up.
assert len(all_feature_names) == len(t_test) == len(p_values), (
    "number of coefficient labels does not match the number of test statistics"
)

# Print t-values and p-values for each coefficient.
# The loop variables are deliberately not named `t` / `p`: `p` already holds the
# F-test p-value computed in the F-test cell and must not be overwritten here.
print(f"{'Feature':<20} {'t-value':>10} {'p-value':>10}")
print("-" * 40)
for coef_name, t_value, p_value in zip(all_feature_names, t_test, p_values):
    print(f"{coef_name:<20} {t_value:>10.4f} {p_value:>10.4e}")
    


Feature                 t-value    p-value
----------------------------------------
Intercept              -25.5272 1.5442e-141
longitude              -26.0683 2.0260e-147
latitude               -25.1777 8.4456e-138
housing_median_age      24.2031 1.2406e-127
total_rooms             -6.1386 8.4792e-10
total_bedrooms          12.0274 3.2953e-33
population             -36.9277 4.0872e-289
households              11.6848 1.9142e-31
median_income          116.6703 0.0000e+00
ocean_INLAND           -22.9036 1.1506e-114
ocean_ISLAND             5.0716 3.9790e-07
ocean_NEAR BAY          -1.9398 5.2414e-02
ocean_NEAR OCEAN         3.0452 2.3284e-03


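So median income is clearly a significant predictor of y (median house value): it has by far the largest t-value (116.67). Of all the coefficients, only `ocean_NEAR BAY` is not significant at the 5% level (p ≈ 0.052).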

### Now we want to see the Pearson correlation coefficient for every pair of features

In [13]:
# Pairwise Pearson correlations between the features.
# NOTE(review): rows/columns are assumed to follow the order of feature_names — confirm.
corr_matrix = model.Pearson_correlation()

# Walk only the strict upper triangle (j > i) so every unordered pair is printed once.
n_features = len(feature_names)
for i, first in enumerate(feature_names):
    for j in range(i + 1, n_features):
        print(f"{first} vs {feature_names[j]}: r = {corr_matrix[i, j]:.3f}")

longitude vs latitude: r = -0.925
longitude vs housing_median_age: r = -0.108
longitude vs total_rooms: r = 0.045
longitude vs total_bedrooms: r = 0.069
longitude vs population: r = 0.100
longitude vs households: r = 0.055
longitude vs median_income: r = -0.015
longitude vs ocean_INLAND: r = -0.056
longitude vs ocean_ISLAND: r = 0.009
longitude vs ocean_NEAR BAY: r = -0.474
longitude vs ocean_NEAR OCEAN: r = 0.046
latitude vs housing_median_age: r = 0.011
latitude vs total_rooms: r = -0.036
latitude vs total_bedrooms: r = -0.066
latitude vs population: r = -0.109
latitude vs households: r = -0.071
latitude vs median_income: r = -0.080
latitude vs ocean_INLAND: r = 0.351
latitude vs ocean_ISLAND: r = -0.017
latitude vs ocean_NEAR BAY: r = 0.359
latitude vs ocean_NEAR OCEAN: r = -0.161
housing_median_age vs total_rooms: r = -0.361
housing_median_age vs total_bedrooms: r = -0.319
housing_median_age vs population: r = -0.296
housing_median_age vs households: r = -0.303
housing_median_age v

### Let's check the confidence intervals for each parameter

In [9]:
# Lower and upper confidence bounds, one pair per coefficient.
lower, upper = model.confidence_intervals()

# Table header followed by a separator line.
ci_header = f"{'Feature':<20} {'Lower CI':>15} {'Upper CI':>15}"
print(ci_header)
print("-" * 50)

# One row per coefficient: label, lower bound, upper bound.
for coef_name, ci_low, ci_high in zip(all_feature_names, lower, upper):
    print(f"{coef_name:<20} {ci_low:>15.4f} {ci_high:>15.4f}")

Feature                     Lower CI        Upper CI
--------------------------------------------------
Intercept              -2458443.5410   -2007740.3989
longitude                -29042.2953     -23818.5940
latitude                 -27748.8973     -22597.6623
housing_median_age          945.2269       1170.4053
total_rooms                  -6.7158         -2.7454
total_bedrooms               56.0640         86.6259
population                  -42.0281        -36.5468
households                   60.6515         94.9574
median_income             37904.6211      39616.2736
ocean_INLAND             -44239.0969     -35293.7006
ocean_ISLAND              76793.7236     235337.7161
ocean_NEAR BAY            -8607.4923       1212.6890
ocean_NEAR OCEAN            733.0724       8784.4349
