In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Apply the 'fivethirtyeight' style sheet to every figure drawn below.
plt.style.use('fivethirtyeight')

In [None]:
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Show Matplotlib figures directly in the cell output.
%matplotlib inline

In [None]:
# Load the Boston housing data as `boston` (an object exposing data, target,
# feature_names and DESCR) and keep the feature matrix in `data`.
#
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where
# importing it raises ImportError. In that case rebuild the same arrays from
# the original StatLib file, following the recipe given in scikit-learn's own
# deprecation notice (this path needs network access).
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    # The file opens with a 22-line text header; every record is then wrapped
    # over two physical lines (11 values followed by 3 values).
    raw = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None).values
    boston = Bunch(
        data=np.hstack([raw[::2, :], raw[1::2, :2]]),
        target=raw[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
        DESCR="Boston house prices dataset; source: " + BOSTON_URL,
    )

data = boston.data
data.shape

In [None]:
# Values to predict, taken from the dataset's `target` attribute.
target = boston.target

In [None]:
# Column labels for the feature matrix; displayed as the cell output.
feature_names = boston.feature_names
feature_names
array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [None]:
# Show the dataset description text in full.
print(boston.DESCR)

In [None]:
# Feature table: one named column per entry of feature_names.
X = pd.DataFrame(data=data, columns=feature_names)
# Preview the first rows.
X.head()
CRIM	ZN	INDUS	CHAS	NOX	RM	AGE	DIS	RAD	TAX	PTRATIO	B	LSTAT
0	0.00632	18.0	2.31	0.0	0.538	6.575	65.2	4.0900	1.0	296.0	15.3	396.90	4.98
1	0.02731	0.0	7.07	0.0	0.469	6.421	78.9	4.9671	2.0	242.0	17.8	396.90	9.14
2	0.02729	0.0	7.07	0.0	0.469	7.185	61.1	4.9671	2.0	242.0	17.8	392.83	4.03
3	0.03237	0.0	2.18	0.0	0.458	6.998	45.8	6.0622	3.0	222.0	18.7	394.63	2.94
4	0.06905	0.0	2.18	0.0	0.458	7.147	54.2	6.0622	3.0	222.0	18.7	396.90	5.33

In [None]:
# Target values as a single-column frame named 'price'.
y = pd.DataFrame({'price': target})
# Preview the first rows.
y.head()
price
0	24.0
1	21.6
2	34.7
3	33.4
4	36.2

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible. The stray `from matplotlib import test` that used to sit in
# this cell was dropped: nothing in the visible cells references `test`, and
# the name is not importable from recent Matplotlib releases.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Unfitted scaler; it is fitted on the training features in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features only, then reuse the fitted scaler
# for the test features. Both results are wrapped back into DataFrames with
# the original column names (the row index is a fresh 0..n-1 range).
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=feature_names,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=feature_names,
)

In [None]:
from sklearn.manifold import TSNE
# Two-component t-SNE of the scaled training features; random_state is fixed
# so repeated runs in the same environment give the same embedding.
tsne = TSNE(n_components=2, learning_rate=250, random_state=42)
X_train_tsne = tsne.fit_transform(X_train_scaled)
# Display the embedded coordinates.
X_train_tsne

In [None]:
# Sanity check: one row per training sample, one column per t-SNE component.
X_train_tsne.shape
(404, 2)

In [None]:
# Scatter plot of the two t-SNE components (first on x, second on y).
plt.scatter(x=X_train_tsne[:, 0], y=X_train_tsne[:, 1])

In [None]:
from sklearn.cluster import KMeans

# Partition the scaled training rows into three clusters.
# random_state pins the centroid initialisation so the cluster numbering that
# later cells refer to (0, 1, 2) is the same on every run. n_init is spelled
# out because its default changed between scikit-learn releases.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
labels_train = kmeans.fit_predict(X_train_scaled)

# Cluster sizes. The top-level pd.value_counts helper is deprecated in recent
# pandas, so count through a Series instead.
pd.Series(labels_train).value_counts()
0    191
2    127
1     86
dtype: int64

In [None]:
# Assign each scaled test row to a cluster using the already fitted KMeans model.
labels_test = kmeans.predict(X_test_scaled)

In [None]:
# NOTE(review): this 3-component TSNE is created but neither fitted nor used in
# this cell; the scatter below still draws the 2-D embedding in X_train_tsne.
# Kept in case cells further down rely on `tsne` — confirm and remove if not.
tsne = TSNE(n_components=3, learning_rate=100, random_state=42)

# Training points in t-SNE space, coloured by their KMeans cluster.
plt.scatter(X_train_tsne[:, 0], X_train_tsne[:, 1], c=labels_train)

# Caption every cluster at the centre of its own points. Fixed coordinates
# would point at the wrong cluster whenever the embedding or the cluster
# numbering differs from the run they were picked for.
for cluster_id in np.unique(labels_train):
    center_x, center_y = X_train_tsne[labels_train == cluster_id].mean(axis=0)
    plt.text(center_x, center_y, f'кластер {cluster_id}')

In [None]:
# Mean price over all training rows — the baseline for the per-cluster means below.
y_train.mean()
price    22.796535
dtype: float64

In [None]:
# Mean price of the training rows assigned to cluster 0.
y_train.loc[labels_train == 0].mean()
price    24.958115
dtype: float64

In [None]:
# Mean price of the training rows assigned to cluster 1.
y_train.loc[labels_train == 1].mean()
price    27.788372
dtype: float64

In [None]:
# Mean price of the training rows assigned to cluster 2.
y_train.loc[labels_train == 2].mean()
price    16.165354
dtype: float64

In [None]:
# Mean CRIM value of the training rows assigned to cluster 1.
X_train['CRIM'][labels_train == 1].mean()
0.07356558139534884

In [None]:
# Mean CRIM value of the training rows assigned to cluster 0.
X_train['CRIM'][labels_train == 0].mean()
0.4216602094240837

In [None]:
# Mean CRIM value of the training rows assigned to cluster 2.
X_train['CRIM'][labels_train == 2].mean()
10.797028425196851