In [21]:
import pandas as pd
from linear_regression import LinearRegression1
import numpy as np

df = pd.read_csv("housing.csv")

matrix = df.copy()

matrix["total_bedrooms"] = matrix["total_bedrooms"].fillna(matrix["total_bedrooms"].median())

y = matrix["median_house_value"].to_numpy(dtype=float)

numeric_cols = [ 
    "latitude", 
    "housing_median_age", 
    "total_bedrooms",  
    "median_income"   
]

X_num = matrix[numeric_cols]
X_categorical = pd.get_dummies(matrix["ocean_proximity"], prefix="ocean", drop_first=True, dtype=float)
X_df = pd.concat([X_num, X_categorical], axis=1)

X = X_df.to_numpy(dtype=float)
feature_names = X_df.columns.tolist()

### Let's create an instance of the class, and then call on the first method.

In [22]:
model = LinearRegression1(alpha=0.01)
model.fit(X, y)

<linear_regression.LinearRegression1 at 0x1fbe44aec10>

### d - the number of features:

In [23]:
model.d

8

### n - the size of the sample

In [24]:
model.n

20640

### What is the sample variance?

In [25]:
model.variance()

np.float64(5258494301.386951)

### What is the standard deviation?

In [26]:
model.standard_deviation()

np.float64(72515.47628876854)

### Calculating the RMSE:

In [27]:
model.root_mean_squared_error()

np.float64(72499.66450467489)

### Let's do some tests on the regression and its parameters to analyse our linear regression. 

First out: let's do a significance test on the entire regression, a so called F-test

In [51]:
F, p = model.significance_regression()
p

np.float64(0.0)

Extremely strong proof against the null hypothesis. This result means that the regression is a real pattern in the population, not random.

In [49]:
R2 = model.r_squares()
R2

np.float64(0.6052570186069963)

Result of the statistical test R^2 can be between 0 and 1. Close to 1 meaning a lot of the variation in our predicted variable is actually explained by the modell. We got 0.6.


### Now let's test the significance of each coefficient/parameter/feature

In [48]:
t_test, p_values = model.t_test_coefficiants()

t_test_results = [
    (name, round(t, 4), round(p, 4))
    for name, t, p in zip(feature_names, t_test, p_values)
]

t_test_results

[('latitude', np.float64(0.5272), np.float64(0.5981)),
 ('housing_median_age', np.float64(1.9894), np.float64(0.0467)),
 ('total_bedrooms', np.float64(27.3847), np.float64(0.0)),
 ('median_income', np.float64(20.8888), np.float64(0.0)),
 ('ocean_INLAND', np.float64(138.076), np.float64(0.0)),
 ('ocean_ISLAND', np.float64(-50.7046), np.float64(0.0)),
 ('ocean_NEAR BAY', np.float64(5.7124), np.float64(0.0)),
 ('ocean_NEAR OCEAN', np.float64(4.9368), np.float64(0.0))]

Having the significance level (alpha) at 5%, all the chosen features, except latitude and (almost) housing_median_age, seem to be significant. This means they have an effect on the prediction.

### Now we want to see the Pearson number for every pair of features

In [46]:
corr_matrix = model.Pearson_correlation()

pearson_pairs = [
    (feature_names[i], feature_names[j], round(corr_matrix[i, j], 3))
    for i in range(len(feature_names))
    for j in range(i + 1, len(feature_names))
]

pearson_pairs


[('latitude', 'housing_median_age', np.float64(0.011)),
 ('latitude', 'total_bedrooms', np.float64(-0.066)),
 ('latitude', 'median_income', np.float64(-0.08)),
 ('latitude', 'ocean_INLAND', np.float64(0.351)),
 ('latitude', 'ocean_ISLAND', np.float64(-0.017)),
 ('latitude', 'ocean_NEAR BAY', np.float64(0.359)),
 ('latitude', 'ocean_NEAR OCEAN', np.float64(-0.161)),
 ('housing_median_age', 'total_bedrooms', np.float64(-0.319)),
 ('housing_median_age', 'median_income', np.float64(-0.119)),
 ('housing_median_age', 'ocean_INLAND', np.float64(-0.237)),
 ('housing_median_age', 'ocean_ISLAND', np.float64(0.017)),
 ('housing_median_age', 'ocean_NEAR BAY', np.float64(0.255)),
 ('housing_median_age', 'ocean_NEAR OCEAN', np.float64(0.022)),
 ('total_bedrooms', 'median_income', np.float64(-0.008)),
 ('total_bedrooms', 'ocean_INLAND', np.float64(-0.006)),
 ('total_bedrooms', 'ocean_ISLAND', np.float64(-0.004)),
 ('total_bedrooms', 'ocean_NEAR BAY', np.float64(-0.02)),
 ('total_bedrooms', 'ocean_NEA

### Let's check the confidence intervals for each feature

In [37]:
feature_names

['latitude',
 'housing_median_age',
 'total_bedrooms',
 'median_income',
 'ocean_INLAND',
 'ocean_ISLAND',
 'ocean_NEAR BAY',
 'ocean_NEAR OCEAN']

In [35]:
lower, upper = model.confidence_intervals()

np.column_stack((lower, upper))

array([[-2.12596932e+04,  3.21994149e+04],
       [-1.70142396e+02,  1.32406263e+03],
       [ 1.13515980e+03,  1.37090420e+03],
       [ 2.35085662e+01,  3.01225065e+01],
       [ 3.78937325e+04,  3.93345722e+04],
       [-7.40580660e+04, -6.68967949e+04],
       [ 1.01763436e+05,  2.68933850e+05],
       [ 4.66415283e+03,  1.48433187e+04],
       [ 1.39342817e+04,  2.21786832e+04]])