In [37]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio

In [9]:
data = pd.read_csv('boston.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


In [10]:
data.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [11]:
data.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [26]:
pio.renderers.default = "browser"

In [None]:
fig_hist = make_subplots(rows=4, cols=4, subplot_titles=data.columns[:-1])
row, col = 1, 1
for col_name in data.columns[:-1]:
    fig_hist.add_trace(go.Histogram(
        x=data[col_name],
        name=col_name
    ), row=row, col=col)
    col += 1
    if col > 4:
        col = 1
        row += 1

fig_hist.update_layout(height=800, width=1000, title_text="uистограммы признаков")
fig_hist.show()


In [28]:
fig_box = make_subplots(rows=4, cols=4, subplot_titles=data.columns[:-1])
row, col = 1, 1
for col_name in data.columns[:-1]:
  fig_box.add_trace(go.Box(y=data[col_name], name=col_name), row=row, col=col)
  col += 1
  if col > 4:
    col = 1
    row += 1

fig_box.update_layout(height=800, width=1000, title_text="Boxplot признаков")
fig_box.show()

In [29]:
fig_medv = px.histogram(data, x='MEDV', nbins=50, title='Распределение цен на жильё (MEDV)')
fig_medv.update_layout(bargap=0.1)
fig_medv.show()

In [31]:
# Scatter plot: MEDV vs все признаки
fig_scatter = make_subplots(rows=4, cols=4, subplot_titles=[f'MEDV vs {col}' for col in data.columns[:-1]])
row, col = 1, 1
for feature in data.columns[:-1]:
    fig_scatter.add_trace(
        go.Scatter(x=data[feature], y=data['MEDV'], mode='markers', name=feature),
        row=row, col=col
    )
    col += 1
    if col > 4:
        col = 1
        row += 1

fig_scatter.update_layout(title_text="Зависимость MEDV от признаков")
fig_scatter.show()

In [32]:
# Тепловая карта корреляции
corr_matrix = data.corr()
fig_heatmap = px.imshow(corr_matrix, 
                        text_auto=True, 
                        color_continuous_scale='RdBu_r', 
                        title='Тепловая карта корреляции признаков')
fig_heatmap.update_layout(width=800, height=700)
fig_heatmap.show()

In [34]:
# Топ-10 корреляций с MEDV
print("="*60)
print("Топ-10 корреляций с MEDV:")
print("="*60)
medv_corr = corr_matrix['MEDV'].abs().sort_values(ascending=False)
print(medv_corr.head(11))
print("\n")

Топ-10 корреляций с MEDV:
MEDV       1.000000
LSTAT      0.737663
RM         0.695360
PTRATIO    0.507787
INDUS      0.483725
TAX        0.468536
NOX        0.427321
CRIM       0.388305
RAD        0.381626
AGE        0.376955
ZN         0.360445
Name: MEDV, dtype: float64




In [39]:
# Scatter plot с линией тренда для топ-3 коррелирующих признаков (кроме MEDV)
top_features = medv_corr.index[1:4]  

fig_top = make_subplots(rows=1, cols=3, subplot_titles=[f'MEDV vs {feat}' for feat in top_features])

for i, feat in enumerate(top_features, 1):
    fig = px.scatter(data, x=feat, y='MEDV', trendline='ols', title=f'MEDV vs {feat}')
    fig_top.add_trace(fig.data[0], row=1, col=i)
    if len(fig.data) > 1:
        fig_top.add_trace(fig.data[1], row=1, col=i)  # добавляем линию тренда

fig_top.update_layout(height=400, width=1200, title_text="Топ-3 признака vs MEDV с линией тренда")
fig_top.show()