In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

#显示简体中文

plt.style.use({'figure.figsize':(25,20)})

plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False

housing_data = pd.read_csv('C:\\Users\\18929\\AppData\\Local\\Temp\\MicrosoftEdgeDownloads\\dcef67c2-18b2-4cd1-b3fe-7b4d89280626\\boston_house_price_english.csv')

#将特征矩阵X除去数据中medv这一行（房价）
X = housing_data.drop('medv',axis=1).values.astype(float)
#将目标值（标签）（房价），提取列
y = housing_data["medv"].values.astype(float)


#划分数据集（训练集、测试集）
def train_test_split(X, y, test_size=0.2,random_state=None):

# 设置随机种子

    if random_state is not None:
        np.random.seed(random_state)

    # 获取样本数量
    n_samples = X.shape[0]

    #打乱索引
    shuffled_indices = np.random.permutation(n_samples)

    #计算测试集大小
    test_set_size = int(n_samples*test_size)

    #划分索引
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]

    #划分数据集
    X_train = X[train_indices]
    X_test = X[test_indices]
    y_train = y[train_indices]
    y_test = y[test_indices]

    return X_train,X_test,y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)


#自定义LinearRegression
class LinearRegression:
    #设置参数————学习率、迭代次数、权重、偏置、特征均值、特征标准差
    def __init__(self,learning_rate=0.01, n_iterations=100):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.w = None   #权重？ 斜率?
        self.b = 0      #偏置？ 截距？
        self.feature_mean = None    # 特征均值（用于标准化？）
        self.feature_std = None     # 特征标准差（用于标准化？）

    #自定义数据标准化
    def _normalize(self, X):
        #标准化特征
        return (X - self.feature_mean) / self.feature_std

    #w、b 修 正（梯度下降法求解）
    def fit(self,X,y):

        #数据标准化、转化为numpy数组
        self.feature_mean = np.mean(X,axis=0)
        self.feature_std = np.std(X,axis=0)
        #将特征矩阵标准化，有利于后续数据处理
        X_normalized = self._normalize(X)

        #初始化权重
        n_samples, n_features = X_normalized.shape
        #上面一行代码的意义是将X.shape的行数（表示样本的数量）赋值给n_samples，将X.shape的列数（表示特征的数量）赋值给n_features
        self.w = np.zeros(n_features)
        #将w设置为一个和特征数量相同的矩阵
        self.b = 0

        #梯度下降
        for _ in range(self.n_iterations):
            #预测值
            y_pred = np.dot(X_normalized, self.w) + self.b
            #np.dot用于计算X_normalized 点乘 w 的结果

        #计算梯度

            #权重梯度
            dw = (1 / n_samples) * np.dot(X_normalized.T, (y_pred - y))
            #X_normalized.T表示X_normalized矩阵的倒置
            #偏置梯度
            db = (1 / n_samples) * np.sum(y_pred - y)

            #更新参数
            # w = w - learning_rate(步长) * dw
            self.w -= self.learning_rate * dw
            self.b -= self.learning_rate * db

    def predict(self, X):

        X_normalized = self._normalize(X)
        return np.dot(X_normalized, self.w) + self.b

if __name__ == "__main__":

        # 转换为numpy数组
        X = housing_data.drop('medv',axis=1).values.astype(float)
        y = housing_data['medv'].values.astype(float)

        #生成随机数的种子
        np.random.seed(42)
        #np.random.permutation(len(X))用于生成一个随机排序的索引数组，长度与数据集x的样本数量相同。目的是随机打乱数据集的顺序
        indices = np.random.permutation(len(X))
        #计算训练集大小
        train_size = int(0.8 * len(X))
        train_idx, test_idx = indices[:train_size],indices[train_size:]
        X_train, y_train = X[train_idx],y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        #训练模型
        model = LinearRegression(learning_rate=0.01, n_iterations=1000)
        model.fit(X_train,y_train)

        predictions = model.predict(X_test)
        mse = np.mean((predictions - y_test) ** 2)
        print(f"模型权重（w）：\n{model.w}")
        print(f"模型偏置（b）：{model.b}")
        print(f"测试集均方误差：\n{mse:.2f}")


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\18929\\AppData\\Local\\Temp\\MicrosoftEdgeDownloads\\dcef67c2-18b2-4cd1-b3fe-7b4d89280626\\boston_house_price_english.csv'