# Lasso 回归
增加了L1正则化项 使得小作用β为0

Lasso回归：
在OLS的基础上，加上一个 $$L1 $$正则化项：
$$J_{\text{Lasso}}(\mathbf{w}) = \sum_{i=1}^n (y_i - \mathbf{x}_i^\top \mathbf{w})^2 + \lambda \sum_{j=1}^p |w_j|$$

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

# 加载数据
from sklearn.datasets import fetch_california_housing
data = pd.read_csv("./california_housing.csv")
df = data
# df['MedHouseVal'] = data.target

# 初步查看数据
print(df.head())
print(df.describe())

   Unnamed: 0  longitude  latitude  housing_median_age  total_rooms  \
0           0    -122.23     37.88                41.0        880.0   
1           1    -122.22     37.86                21.0       7099.0   
2           2    -122.24     37.85                52.0       1467.0   
3           3    -122.25     37.85                52.0       1274.0   
4           4    -122.25     37.85                52.0       1627.0   

   total_bedrooms  population  households  median_income  median_house_value  \
0           129.0       322.0       126.0         8.3252            452600.0   
1          1106.0      2401.0      1138.0         8.3014            358500.0   
2           190.0       496.0       177.0         7.2574            352100.0   
3           235.0       558.0       219.0         5.6431            341300.0   
4           280.0       565.0       259.0         3.8462            342200.0   

  ocean_proximity  
0        NEAR BAY  
1        NEAR BAY  
2        NEAR BAY  
3        NEA

In [13]:
# 检查缺失值
print(df.isnull().sum())

# 分离特征和目标变量
X = df.drop('MedHouseVal', axis=1)
y = df['MedHouseVal']

# 数据分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 特征缩放
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Unnamed: 0              0
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64


KeyError: "['MedHouseVal'] not found in axis"

In [3]:
# 特征相关性
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Feature Correlation Heatmap')
plt.show()

NameError: name 'df' is not defined

<Figure size 1000x800 with 0 Axes>

In [4]:
# 建立 Lasso 模型
lasso = Lasso()

# 定义超参数网格
param_grid = {'alpha': np.logspace(-4, 4, 50)}

# 网格搜索
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# 最佳超参数
best_alpha = grid_search.best_params_['alpha']
print(f'Best alpha: {best_alpha}')

# 训练最终模型
lasso_opt = Lasso(alpha=best_alpha)
lasso_opt.fit(X_train_scaled, y_train)

NameError: name 'X_train_scaled' is not defined

In [5]:
# 预测
y_pred_train = lasso_opt.predict(X_train_scaled)
y_pred_test = lasso_opt.predict(X_test_scaled)

# 评估
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print(f'MSE (Train): {mse_train}')
print(f'MSE (Test): {mse_test}')
print(f'R^2 (Train): {r2_train}')
print(f'R^2 (Test): {r2_test}')

NameError: name 'lasso_opt' is not defined

In [6]:
# 可视化
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_test, alpha=0.6, color='b')
plt.plot([0, 5], [0, 5], 'r--')
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted House Prices')
plt.show()

NameError: name 'y_test' is not defined

<Figure size 1000x600 with 0 Axes>