In [2]:
# Optional one-time setup: uncomment the next line to install plotly with conda.
# !conda install plotly

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [1]:
# 加载必要库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import plotly.express as px
from statsmodels.graphics.gofplots import qqplot
import seaborn as sns
%matplotlib inline

## 数据加载

In [3]:
# Load data
# Reference: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html#sklearn.datasets.load_boston
# Alternative loader (disabled): download the raw file and assemble features/target by hand.
# NOTE(review): the [::2] / [1::2] slicing suggests each record spans two physical lines in the raw file; confirm before re-enabling.
# data_url = "http://lib.stat.cmu.edu/datasets/boston"  # data source
# raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)  # read the raw text file with pandas
# data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
# target = raw_df.values[1::2, 2]

In [3]:
# Read the Boston housing table from the CSV file next to the notebook.
house = pd.read_csv(filepath_or_buffer="boston.csv")
# Preview the first five rows.
house.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


特征：
- CRIM：按城镇划分的人均犯罪率
- ZN：划分为超过 25,000 平方英尺地块的住宅用地比例。
- INDUS：每个城镇的非零售商业用地的比例
- CHAS：Charles River 虚拟变量（如果区域以河流为界，则为 1；否则为 0）
- NOX：氮氧化物浓度（单位：千万分之一）[parts/10M]
- RM：每户住宅的平均房间数
- AGE：1940 年之前建造的自住单元的比例
- DIS：到波士顿五个就业中心的加权距离
- RAD：通往径向高速公路的便利程度指数
- TAX：每 10,000 美元的全额财产税率 [\$/10k]
- PTRATIO：按城镇划分的生师比（学生数与教师数之比）
- B：等式 $B=1000(Bk - 0.63)^2$ 的结果，其中 $Bk$ 是城镇中黑人的比例
- LSTAT：社会地位较低人口所占的百分比

标签：

MEDV：自住房屋价值的中位数，单位为 1000 美元 [k\$]

In [4]:
# Size of the dataset as (number of rows, number of columns)
house.shape

(506, 14)

可以看到，本次我们用到的数据集总共有 506 条样本（行），也就是 506 个区域（城镇）的房屋数据；同时有 14 列，其中 13 列是特征，也就是每条样本有 13 个属性特征，例如住宅的平均房间数、交通便利程度等信息，最后一列是标签——房价（该区域自住房屋价值的中位数）。这个数据集的主要用途是：利用这 13 个属性特征对房价进行预测。

In [5]:
# Basic descriptive statistics of the dataset in one call:
# count, mean, std, min, the three quartiles and max for each numeric column.
# The quartiles are written out explicitly; they are the ones describe() uses by default.
house.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [None]:
# Reference links:
# https://www.kaggle.com/code/weibbi/boston-housing-price-w-visual-eda-prediction
# https://www.kaggle.com/code/imakash3011/boston-house-prices-advanced-regression-technique
# plotly installation guide: https://zhuanlan.zhihu.com/p/328972379