In [5]:
import pandas as pd

url = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
df = pd.read_csv(url)

print(df.head())
print(df.info())


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639

In [None]:
# ================================
# California Housing Preprocessing
# ================================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# --------------------------------
# 1. Load Original Dataset
# --------------------------------

df = pd.read_csv("student-mat.csv", delimiter=";")
print(df.columns) # G1-3 are targets; we only predict G3.

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')


In [None]:
# --------------------------------
# 2. Define Target (y) and Features (X)
# --------------------------------

y = df["G3"]   # final grade

# IMPORTANT:
# Drop G3 (target)
# Also drop G1 and G2 to avoid leakage (they are earlier grades)

X = df.drop(columns=["G1", "G2", "G3"])


# --------------------------------
# 4. One-Hot Encode Categorical Variables
# --------------------------------

X = pd.get_dummies(X, drop_first=True)


# --------------------------------
# 5. Train/Test Split
# --------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)
# 39 columns are not bad at all


# --------------------------------
# Handle Missing Values (Median Imputation)
# --------------------------------
# Compute medians from training data ONLY

train_medians = X_train.median(numeric_only=True)

# Fill missing values
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)


X_train shape: (296, 39)
X_test shape: (99, 39)
y_train shape: (296,)
y_test shape: (99,)


In [20]:
from sklearn.preprocessing import StandardScaler

# --------------------------------
# 6. Feature Scaling (Standardization)
# --------------------------------
# Only scale numerical columns (not dummy columns)

numerical_cols = [
    col for col in X_train.columns 
    if "ocean_proximity_" not in col
]

scaler = StandardScaler()

X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# --------------------------------
# 7. Check
# --------------------------------

print("\nFinal training shape:", X_train.shape)
print("Final test shape:", X_test.shape)

print("\nSample of processed training data:")
print(X_train.head())

print("\nTarget sample:")
print(y_train.head())



Final training shape: (296, 39)
Final test shape: (99, 39)

Sample of processed training data:
          age      Medu      Fedu  traveltime  studytime  failures    famrel  \
16  -0.578683  1.162655  1.358732   -0.613411   1.130799 -0.459199 -1.044767   
66  -1.364418  1.162655  1.358732   -0.613411   2.326214 -0.459199 -3.302074   
211  0.207052  1.162655  1.358732   -0.613411  -0.064617 -0.459199  1.212540   
7    0.207052  1.162655  1.358732    0.850860  -0.064617 -0.459199  0.083886   
19  -0.578683  1.162655  0.427749   -0.613411  -1.260033 -0.459199 -1.044767   

     freetime     goout      Dalc  ...  guardian_mother  guardian_other  \
16  -1.201583 -0.156106 -0.567091  ...         0.671551       -0.310316   
66  -0.229811 -0.156106  3.821376  ...         0.671551       -0.310316   
211 -0.229811  1.621103  2.724260  ...         0.671551       -0.310316   
7   -2.173356  0.732498 -0.567091  ...         0.671551       -0.310316   
19  -2.173356 -0.156106 -0.567091  ...        -1

In [21]:
# ==========================================
# Regression 
# ==========================================

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
# random forest
from sklearn.ensemble import RandomForestRegressor

# --------------------------------
# 1. Initialize (Optimal) Models
# --------------------------------

from sklearn.model_selection import GridSearchCV

alpha_grid = {
    "alpha": np.logspace(-4, 3, 20)   # 0.0001 to 1000
}

ridge = Ridge()

ridge_grid = GridSearchCV(
    ridge,
    alpha_grid,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1
)

ridge_grid.fit(X_train, y_train)
best_ridge = ridge_grid.best_estimator_
print("Best Ridge alpha:", ridge_grid.best_params_)


from sklearn.linear_model import RidgeCV, LassoCV

lasso = Lasso(max_iter=10000, tol=1e-4)
lasso_cv = LassoCV(
    alphas=np.logspace(-4, 3, 100),
    cv=4,
    max_iter=10000,
    n_jobs=-1,
    random_state=42
)

lasso_cv.fit(X_train, y_train)
best_lasso = Lasso(alpha=lasso_cv.alpha_, max_iter=20_000)
print("Best Lasso alpha:", lasso_cv.alpha_)

models = {
    "Linear Regression": LinearRegression(),
    "Ridge (Tuned)": best_ridge,
    "Lasso (Tuned)": best_lasso,
    "Decision Tree": DecisionTreeRegressor(max_depth=32, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=32, random_state=42),
}


# --------------------------------
# 2. Train
# --------------------------------

results = []

for name, model in models.items():
    
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred = model.predict(X_test)
    
    # Metrics
    # mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    results.append({
        "Model": name,
        # "MSE": mse,
        "MAE": mae
    })

# --------------------------------
# 3. Evaluate
# --------------------------------

results = []

for name, model in models.items():
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    # mse = mean_squared_error(y_test, y_pred)
    
    results.append({
        "Model": name,
        "MAE": mae,
        # "MSE": mse
    })

results_df = pd.DataFrame(results).sort_values(by="MAE")

print("\nModel Comparison (Sorted by MAE):\n")
print(results_df)

# --------------------------------
# 4. Show Results
# --------------------------------

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="MAE")

# print("\nModel Comparison (Sorted by MSE):\n")
print(results_df)

# the unit of MAE is US dollars. MAE preserves the interpretability.


Best Ridge alpha: {'alpha': 428.1332398719387}
Best Lasso alpha: 0.6579332246575675

Model Comparison (Sorted by MAE):

               Model       MAE
4      Random Forest  3.071515
3      Decision Tree  3.111111
0  Linear Regression  3.436654
1      Ridge (Tuned)  3.497791
2      Lasso (Tuned)  3.576290
               Model       MAE
4      Random Forest  3.071515
3      Decision Tree  3.111111
0  Linear Regression  3.436654
1      Ridge (Tuned)  3.497791
2      Lasso (Tuned)  3.576290
