#Simple Linear Regression
A practice exercise in simple linear regression modeling, using the California housing dataset from scikit-learn.

#1. Import libraries

In [None]:
#Basic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Dataset
from sklearn.datasets import fetch_california_housing

#Preprocessing
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

#Modeling
from sklearn.model_selection import train_test_split #splitting dataset into training and testing sets
from sklearn.linear_model import LinearRegression #linear regression model
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score #evaluation metrics

#2. Load the dataset

In [None]:
# Load the California housing data; as_frame=True requests pandas objects.
# NOTE: despite its name, `df` is the dataset container returned by scikit-learn,
# not a DataFrame -- the cells below read its .frame, .data, .target and .DESCR.
df = fetch_california_housing(as_frame=True)

In [None]:
# Preview the first rows of the combined frame (feature columns plus MedHouseVal).
df.frame.head()

In [None]:
# Target vector: the target series shipped with the dataset.
y = df.target

# Feature matrix: every feature column is kept for now, so that the feature
# selection step later on can choose among all of them.
x = df.data[
    [
        'MedInc', 'HouseAge', 'AveRooms', 'AveBedrms',
        'Population', 'AveOccup', 'Latitude', 'Longitude',
    ]
]

#3. EDA

In [None]:
# Print the description text that ships with the dataset.
print(df.DESCR)

In [None]:
# Show the first rows of the feature table (`df.data`).
print(df.data.head())

In [None]:
# Show the first values of the target (`df.target`), which is used as `y`.
print(df.target.head())

In [None]:
# Summary statistics for each feature column.
df.data.describe()

In [None]:
# Draw one histogram per column of the combined frame:
# 30 bins each, with black outlines around the bars.
df.frame.hist(
    bins=30,
    edgecolor="black",
    figsize=(12, 10),
)
# Add vertical and horizontal spacing between the subplots.
plt.subplots_adjust(wspace=0.4, hspace=0.7)

In [None]:
# Spatial scatter plot: position is longitude/latitude, while both the marker
# size and the marker colour encode the median house value.
ax = sns.scatterplot(
    x="Longitude",
    y="Latitude",
    hue="MedHouseVal",
    size="MedHouseVal",
    palette="viridis",
    alpha=0.5,
    data=df.frame,
)
# Anchor the legend's upper-left corner just outside the right edge of the axes.
ax.legend(title="Median House Value", loc="upper left", bbox_to_anchor=(1.05, 0.95))
_ = ax.set_title("Median house value depending of\n their spatial location")

In [None]:
# Pairwise relationships between the non-spatial columns, coloured by house value.
# Drop the spatial columns; they are not wanted in the pairwise comparison.
columns_drop = ["Longitude", "Latitude"]
subset = df.frame.drop(columns=columns_drop)
# Quantize the target into 6 quantile bins and keep the midpoint of each
# interval. Without this step `hue` receives the raw continuous target, whose
# many distinct values each become a separate hue level; that makes the
# pairplot extremely slow to draw and its legend unreadable.
subset["MedHouseVal"] = pd.qcut(subset["MedHouseVal"], 6, retbins=False)
subset["MedHouseVal"] = subset["MedHouseVal"].apply(lambda interval: interval.mid)
sns.pairplot(data=subset, hue="MedHouseVal", palette="viridis")

#4. Splitting the dataset

To avoid the data leakage that would result from fitting StandardScaler() and the ANOVA feature selector on the full dataset, the dataset is split into training and testing sets at this point, before any preprocessing.

Therefore:
1. `x` and `y` are passed to `train_test_split`,
2. with a test size of 30%, so that both the training and testing sets contain enough data,
3. and a fixed random state, so that the results are reproducible.

In [None]:
# Hold out 30% of the rows as the test set; random_state=42 fixes the shuffle
# so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

#5. Preprocessing

Feature selection is carried out first, to reduce the computational load of any further preprocessing.

##5.1 Feature Selection

Select the single best feature, since simple linear regression uses exactly one predictor.

In [None]:
# Keep the column labels: the selector returns plain arrays without names.
original_columns = X_train.columns

# Univariate feature selection keeping the single best-scoring feature (k=1).
# The selector is fitted on the training split only, then the same fitted
# selector is applied to both splits, so the test data never influences it.
anova_selector = SelectKBest(score_func=f_regression, k=1)
anova_selector.fit(X_train, y_train)
X_train_selected = anova_selector.transform(X_train)
X_test_selected = anova_selector.transform(X_test)

# Per-feature F-scores, plus the label(s) of the feature(s) that were kept.
scores = anova_selector.scores_
selected_features = original_columns[anova_selector.get_support()]

# Score table, highest F-score first.
scores_df = pd.DataFrame(
    {'Feature': original_columns, 'ANOVA F-Score': scores}
).sort_values(by='ANOVA F-Score', ascending=False)

# Wrap the selected arrays in labelled DataFrames for display.
X_train_selected_df = pd.DataFrame(X_train_selected, columns=selected_features)
X_test_selected_df = pd.DataFrame(X_test_selected, columns=selected_features)

# Report the scores, the chosen feature and a sample of each selected split.
print("\nANOVA F-Scores (training):\n", scores_df)
print("\nSelected Features:\n", selected_features)
print("\nTrain Selected Data Sample:\n", X_train_selected_df.head())
print("\nTest Selected Data Sample:\n", X_test_selected_df.head())

#6. Modeling

In [None]:
# Create an unfitted linear regression model with default settings
model = LinearRegression()

In [None]:
# Train the model on the single selected feature of the training split
# and the matching training targets
model.fit(X_train_selected, y_train)

In [None]:
# Predict target values for the test split from its selected feature;
# these predictions are reused to draw the regression line in the final plot.
predicted_outcome = model.predict(X_test_selected)

In [None]:
# Evaluate the fitted linear regression model on the held-out test split.
y_pred = model.predict(X_test_selected)

# Squared-error metrics: MSE, and its square root (RMSE), which is expressed
# in the same units as the target.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Absolute error and the coefficient of determination.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics.
print("Simple Linear Regression Test Metrics:")
print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2) Score:", r2)
print("Root Mean Squared Error (RMSE):", rmse)

#7. Visualization

In [None]:
# Plot the held-out test points together with the fitted regression line.
plt.xlabel('Median Income')
plt.ylabel('Median House Value')

# Actual test observations.
plt.scatter(X_test_selected, y_test, label='Data points', color='blue')

# Model predictions for the same inputs; they all lie on the fitted line.
plt.plot(
    X_test_selected,
    predicted_outcome,
    label='Regression line',
    color='red',
    linewidth=2,
)

plt.legend()
plt.show()