## 读取数据并进行预处理

In [2]:
from sklearn.datasets import load_iris
data = load_iris()
X, y = data.data, data.target
# 展示数据集的形状以及数据类型
print(f"特征数据形状：{X.shape}，标签数据形状：{y.shape}")
print(f"特征数据类型：{X.dtype}，标签数据类型：{y.dtype}")

# 展示前5个样本的特征数据和标签
print(f"前5个样本的特征数据：{X[:5]}，前5个样本的标签：{y[0:5]}")

# 展示特征名和标签名
print(f"特征名称：{data.feature_names}，\n标签名称：{data.target_names}")

# 划分数据集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"训练集形状：X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"测试集形状：X_test: {X_test.shape}, y_test: {y_test.shape}")


特征数据形状：(150, 4)，标签数据形状：(150,)
特征数据类型：float64，标签数据类型：int64
前5个样本的特征数据：[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]，前5个样本的标签：[0 0 0 0 0]
特征名称：['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']，
标签名称：['setosa' 'versicolor' 'virginica']
训练集形状：X_train: (120, 4), y_train: (120,)
测试集形状：X_test: (30, 4), y_test: (30,)


## 模型选择

In [3]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
print(f"模型的超参数：{model.get_params()}")

模型的超参数：{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


## 训练与预测

In [4]:
# 训练模型
model.fit(X_train, y_train)

# 预测
y_pred = model.predict(X_test)
print(y_pred[:5])  # 打印前5个预测结果

[1 0 2 1 1]


## 评估

In [5]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

print("Precision:", precision_score(y_test, y_pred, average='macro'))
print("Recall:", recall_score(y_test, y_pred, average='macro'))
print("F1 Score:", f1_score(y_test, y_pred, average='macro'))
print("Accuracy:", accuracy_score(y_test, y_pred))

# 综合以上的部分报告
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Precision: 1.0
Recall: 1.0
F1 Score: 1.0
Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## 提升模型性能

In [6]:
from sklearn.model_selection import GridSearchCV
# 超参数调优，这里选择50、100、200个树作为超参数
param_grid = {'n_estimators': [50, 100, 200]}

# 使用网格搜索进行超参数调优，通过5折交叉验证
grid = GridSearchCV(RandomForestClassifier(), param_grid, cv=5)
# 训练模型
grid.fit(X_train, y_train)

# 输出最佳超参数
print("最佳参数：", grid.best_params_)
print("最佳得分：", grid.best_score_)
best_model = grid.best_estimator_

最佳参数： {'n_estimators': 100}
最佳得分： 0.9583333333333334


In [7]:
y_pred = best_model.predict(X_test)

# 评估模型性能
print("最佳模型的精确率：", precision_score(y_test, y_pred, average='macro'))
print("最佳模型的召回率：", recall_score(y_test, y_pred, average='macro'))
print("最佳模型的F1分数：", f1_score(y_test, y_pred, average='macro'))
print("最佳模型的准确率：", accuracy_score(y_test, y_pred))

最佳模型的精确率： 1.0
最佳模型的召回率： 1.0
最佳模型的F1分数： 1.0
最佳模型的准确率： 1.0


## 保存模型

In [8]:
import joblib

# 保存之前训练的最佳模型
joblib.dump(best_model, 'best_model.pkl')

# 加载模型
loaded_model = joblib.load('best_model.pkl')

# 使用加载的模型进行预测
y_pred_loaded = loaded_model.predict(X_test)
print("加载模型的预测结果：", y_pred_loaded[:5])  # 打印前5个预测结果
print("加载模型的准确率：", accuracy_score(y_test, y_pred_loaded))

print("\nClassification Report:\n", classification_report(y_test, y_pred_loaded))


加载模型的预测结果： [1 0 2 1 1]
加载模型的准确率： 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



## sklearn实战：波士顿房价预测
本代码来自kaggle：https://www.kaggle.com/code/mariammostafa33/house-prices-advanced-regression#Random-Forest-Regression


In [None]:
import pandas as pd
import numpy as np

# 读取数据
train_data = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('house-prices-advanced-regression-techniques/test.csv')

# 提取训练集标签和测试集的 Id
y = train_data["SalePrice"]
test_ids = test_data["Id"]

# 删除不需要的列（Id， 标签列）
train_data.drop(["SalePrice", "Id"], axis=1, inplace=True)
test_data.drop("Id", axis=1, inplace=True)

# 对数化处理
y = np.log1p(y)
data = pd.concat([train_data, test_data], axis=0)

# 打印信息，初步查看信息的特征决定下一步需要做什么
data.head()
data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 2919 entries, 0 to 1458
Data columns (total 79 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     2919 non-null   int64  
 1   MSZoning       2915 non-null   object 
 2   LotFrontage    2433 non-null   float64
 3   LotArea        2919 non-null   int64  
 4   Street         2919 non-null   object 
 5   Alley          198 non-null    object 
 6   LotShape       2919 non-null   object 
 7   LandContour    2919 non-null   object 
 8   Utilities      2917 non-null   object 
 9   LotConfig      2919 non-null   object 
 10  LandSlope      2919 non-null   object 
 11  Neighborhood   2919 non-null   object 
 12  Condition1     2919 non-null   object 
 13  Condition2     2919 non-null   object 
 14  BldgType       2919 non-null   object 
 15  HouseStyle     2919 non-null   object 
 16  OverallQual    2919 non-null   int64  
 17  OverallCond    2919 non-null   int64  
 18  YearBuilt    

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,2919.0,2433.0,2919.0,2919.0,2919.0,2919.0,2919.0,2896.0,2918.0,2918.0,...,2918.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0,2919.0
mean,57.137718,69.305795,10168.11408,6.089072,5.564577,1971.312778,1984.264474,102.201312,441.423235,49.582248,...,472.874572,93.709832,47.486811,23.098321,2.602261,16.06235,2.251799,50.825968,6.213087,2007.792737
std,42.517628,23.344905,7886.996359,1.409947,1.113131,30.291442,20.894344,179.334253,455.610826,169.205611,...,215.394815,126.526589,67.575493,64.244246,25.188169,56.184365,35.663946,567.402211,2.714762,1.314964
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7478.0,5.0,5.0,1953.5,1965.0,0.0,0.0,0.0,...,320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,50.0,68.0,9453.0,6.0,5.0,1973.0,1993.0,0.0,368.5,0.0,...,480.0,0.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11570.0,7.0,6.0,2001.0,2004.0,164.0,733.0,0.0,...,576.0,168.0,70.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1526.0,...,1488.0,1424.0,742.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0


In [48]:
# 数据预处理
def fill_missing(col):
    if col.dtype == 'object':
        return col.fillna(col.mode()[0] )
    else:
        return col.fillna(col.mean())
    
data=data.apply(fill_missing)
data.isnull().sum()

MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
SaleCondition    0
HouseAge         0
HasPool          0
HasGarage        0
HasBsmt          0
Length: 83, dtype: int64

In [None]:
from sklearn.preprocessing import OrdinalEncoder

OE=OrdinalEncoder()
obj_data=data.select_dtypes('object')
# 对这些类别型特征进行拟合并转换，将每个类别映射为一个整数（从0开始）
obj_data=OE.fit_transform(obj_data) 
data[data.select_dtypes('object').columns]=obj_data
data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,HouseAge,HasPool,HasGarage,HasBsmt
0,60,3.0,65.0,8450,1.0,0.0,3.0,3.0,0.0,4.0,...,2.0,0,2,2008,8.0,4.0,5,0,1,1
1,20,3.0,80.0,9600,1.0,0.0,3.0,3.0,0.0,2.0,...,2.0,0,5,2007,8.0,4.0,31,0,1,1
2,60,3.0,68.0,11250,1.0,0.0,0.0,3.0,0.0,4.0,...,2.0,0,9,2008,8.0,4.0,7,0,1,1
3,70,3.0,60.0,9550,1.0,0.0,0.0,3.0,0.0,0.0,...,2.0,0,2,2006,8.0,0.0,91,0,1,1
4,60,3.0,84.0,14260,1.0,0.0,0.0,3.0,0.0,2.0,...,2.0,0,12,2008,8.0,4.0,8,0,1,1


In [55]:
# 划分数据集
from sklearn.model_selection import train_test_split

X = data.iloc[:len(y), :] 
X_test = data.iloc[len(y):, :]  

x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# 查看划分后的数据集形状
print(f"训练集形状：x_train: {x_train.shape}, y_train: {y_train.shape}")
print(f"验证集形状：x_valid: {x_valid.shape}, y_valid: {y_valid.shape}")
print(f"测试集形状：X_test: {X_test.shape}")

训练集形状：x_train: (1168, 83), y_train: (1168,)
验证集形状：x_valid: (292, 83), y_valid: (292,)
测试集形状：X_test: (1459, 83)


### 选择模型
房价预测是一个回归问题，常见模型有：
- 线性回归（基准模型，简单可解释）
- Ridge/Lasso（带正则化，防止过拟合）
- 随机森林 / Gradient Boosting（XGBoost / LightGBM / CatBoost）（效果好，适合非线性关系）
- 神经网络（可以尝试，但需要更多调参）

可以先用简单模型（Ridge、随机森林）快速跑出 baseline，再用强模型（XGBoost/LightGBM）提升分数。

In [56]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# 实例化线性回归模型
LR=LinearRegression()
# 训练模型
LR.fit(x_train,y_train)
# 预测
y_pred_lr=LR.predict(x_valid)
# 计算均方根误差（RMSE）
rmse_lr=np.sqrt(mean_squared_error(y_valid,y_pred_lr))
print(f"线性回归模型的RMSE：{rmse_lr}")

线性回归模型的RMSE：0.16575068239246504


In [59]:
from sklearn.linear_model import Ridge
# 实例化岭回归模型
ridge = Ridge(alpha=1.0)  # alpha是正则化参数
# 训练模型
ridge.fit(x_train, y_train)
# 预测
y_pred_ridge = ridge.predict(x_valid)
# 计算均方根误差（RMSE）
rmse_ridge = np.sqrt(mean_squared_error(y_valid, y_pred_ridge))
print(f"岭回归模型的RMSE：{rmse_ridge}")

岭回归模型的RMSE：0.15845075242802503


In [57]:
from sklearn.ensemble import RandomForestRegressor

# 实例化随机森林回归模型
RF = RandomForestRegressor(max_depth=100)
# 训练模型
RF.fit(x_train,y_train)
# 预测
y_pred_rf=RF.predict(x_valid)
# 计算均方根误差（RMSE）
rmse_rf=np.sqrt(mean_squared_error(y_valid,y_pred_rf))
print(f"随机森林回归模型的RMSE：{rmse_rf}")

随机森林回归模型的RMSE：0.14710417585780383


In [58]:
from xgboost import XGBRegressor

# 实例化XGBoost回归模型
xgb = XGBRegressor(
    n_estimators=1000, # n_estimators: 树的数量
    learning_rate=0.01, # learning_rate: 学习率
    max_depth=5, # max_depth: 树的最大深度
    subsample=0.7, # subsample: 用于训练的样本比例
    colsample_bytree=0.5, # colsample_bytree: 用于训练的特征比例
    random_state=42 # random_state: 随机种子
)

# 训练模型
xgb.fit(x_train,y_train)
# 预测
y_pred_xgb=xgb.predict(x_valid)
# 计算均方根误差（RMSE）
rmse_xgb=np.sqrt(mean_squared_error(y_valid,y_pred_xgb))
print(f"XGBoost回归模型的RMSE：{rmse_xgb}")

XGBoost回归模型的RMSE：0.13271680733542823


In [None]:
# 使用joblib保存模型
import joblib
# 保存之前训练的各个模型
joblib.dump(LR, 'house-prices-advanced-regression-techniques/lr_model.pkl')
joblib.dump(ridge, 'house-prices-advanced-regression-techniques/ridge_model.pkl')
joblib.dump(RF, 'house-prices-advanced-regression-techniques/rf_model.pkl')
joblib.dump(xgb, 'house-prices-advanced-regression-techniques/xgb_model.pkl')

# 使用pickle保存模型
import pickle
# 保存之前训练的各个模型
with open('house-prices-advanced-regression-techniques/lr_model.pkl', 'wb') as f:
    pickle.dump(LR, f)
with open('house-prices-advanced-regression-techniques/ridge_model.pkl', 'wb') as f:
    pickle.dump(ridge, f)
with open('house-prices-advanced-regression-techniques/rf_model.pkl', 'wb') as f:
    pickle.dump(RF, f)
with open('house-prices-advanced-regression-techniques/xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb, f)

['house-prices-advanced-regression-techniques/xgb_model.pkl']