# 读取数据

In [12]:
import pandas as pd
import numpy as np
pd.set_option('max_rows', None)
pd.set_option('max_columns', None)

In [13]:
housing=pd.read_csv('Ziroom_shanghai.csv')

In [14]:
print(housing.shape)
housing.drop('Unnamed: 0',axis=1,inplace=True)
housing.head()

(1290, 10)


Unnamed: 0,title,type,metro_distance,square,price,floor,floors,district,station
0,合租·地杰国际城F欧香四季4居室-南卧,合租,191,18,3560,14,18,浦东新区,御桥
1,整租·华升公寓2室1厅-南,整租,150,96,9460,20,34,杨浦区,江浦路
2,合租·博爱家园4居室-南卧,合租,234,17,3290,6,6,浦东新区,高科西路
3,合租·紫叶花园东园4居室-南卧,合租,216,17,3030,5,6,浦东新区,北蔡
4,合租·仁和苑4居室-南卧,合租,243,17,3990,7,12,杨浦区,江湾体育场


In [15]:
housing_num = housing[['metro_distance','square','price']]
housing_num.head()

Unnamed: 0,metro_distance,square,price
0,191,18,3560
1,150,96,9460
2,234,17,3290
3,216,17,3030
4,243,17,3990


In [16]:
housing_cat = housing[['district','type']]
housing_cat.head()

Unnamed: 0,district,type
0,浦东新区,合租
1,杨浦区,整租
2,浦东新区,合租
3,浦东新区,合租
4,杨浦区,合租


# 聚类分析

**- 流水线特征缩放**

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([('imputer', SimpleImputer(strategy="median")), ('std_scaler', StandardScaler())])

In [18]:
ss=StandardScaler()
housing_num_tr = ss.fit_transform(housing_num)
ss.inverse_transform(housing_num_tr,copy=True)[:5]

array([[ 191.,   18., 3560.],
       [ 150.,   96., 9460.],
       [ 234.,   17., 3290.],
       [ 216.,   17., 3030.],
       [ 243.,   17., 3990.]])

**- 数值&类别流水线**

In [20]:
from sklearn.compose import ColumnTransformer

In [21]:
num_attribs = list(housing_num)
cat_attribs = ['district','type']

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing).toarray()
housing_prepared

array([[-1.16786351,  0.06475401,  0.45139146, ...,  0.        ,
         1.        ,  0.        ],
       [-1.2894726 ,  7.44363124,  5.90315914, ...,  0.        ,
         0.        ,  1.        ],
       [-1.04032226, -0.02984698,  0.20190379, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.32253197, -0.31364995,  0.63619715, ...,  0.        ,
         1.        ,  0.        ],
       [-1.18862604, -0.50285193, -0.13074644, ...,  0.        ,
         1.        ,  0.        ],
       [-0.25727831, -0.50285193, -0.26011042, ...,  0.        ,
         1.        ,  0.        ]])

In [22]:
from sklearn.cluster import KMeans # 导入Kmeans
k=5 # 聚类类别数
kmodel = KMeans(n_clusters=k,n_jobs=4)
kmodel.fit(housing_prepared)



KMeans(n_clusters=5, n_jobs=4)

In [23]:
ss=StandardScaler()
housing_num_tr = ss.fit_transform(housing_num)
ss.inverse_transform(housing_num_tr,copy=True)[:5]

array([[ 191.,   18., 3560.],
       [ 150.,   96., 9460.],
       [ 234.,   17., 3290.],
       [ 216.,   17., 3030.],
       [ 243.,   17., 3990.]])

**- 聚类结果前三列**

In [91]:
from sklearn.cluster import KMeans # 导入Kmeans
k=5 # 聚类类别数
kmodel = KMeans(n_clusters=k, n_jobs=4)
kmodel.fit(housing_prepared)



KMeans(n_clusters=3, n_jobs=4)

In [25]:
kmodel.cluster_centers_[0:5,:3]

array([[ 1.62129282,  0.06600424, -0.48720974],
       [-0.51512862, -0.22916671, -0.73526856],
       [-0.34032843,  7.79247239,  5.09578931],
       [-0.78067842, -0.15743048,  0.60188323],
       [ 0.39247822,  0.01417526,  0.26850703]])

**- 聚类结果反特征缩放**

In [26]:
ss.inverse_transform(kmodel.cluster_centers_[0:5,:3],copy=True)

array([[1131.35242291,   18.01321586, 2544.22907489],
       [ 411.06684492,   14.89304813, 2275.77540107],
       [ 470.        ,   99.6875    , 8586.25      ],
       [ 321.53783784,   15.65135135, 3722.86486486],
       [ 717.06270627,   17.46534653, 3362.07920792]])

三项分别为：离地铁的距离，房屋面积，租房价格

**- 将聚类类别插入df**

In [29]:
from pandas import Series
housing_data = housing.copy()

label = Series(kmodel.labels_,index=housing_data.index)
housing_data.insert(9,'聚类类别',label)
housing_data.head()

Unnamed: 0,title,type,metro_distance,square,price,floor,floors,district,station,聚类类别
0,合租·地杰国际城F欧香四季4居室-南卧,合租,191,18,3560,14,18,浦东新区,御桥,3
1,整租·华升公寓2室1厅-南,整租,150,96,9460,20,34,杨浦区,江浦路,2
2,合租·博爱家园4居室-南卧,合租,234,17,3290,6,6,浦东新区,高科西路,3
3,合租·紫叶花园东园4居室-南卧,合租,216,17,3030,5,6,浦东新区,北蔡,3
4,合租·仁和苑4居室-南卧,合租,243,17,3990,7,12,杨浦区,江湾体育场,3


**- 转换为房屋等级**

In [30]:
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(2)
c['房屋等级']=['交通不便普通','交通一般普通','交通便利普通','高端房源','豪宅']

In [32]:
labels = c
expensive = labels[labels['房屋等级']=='豪宅'].index[0]
high = labels[labels['房屋等级']=='高端房源'].index[0]
mid = labels[labels['房屋等级']=='交通便利普通'].index[0]
normal = labels[labels['房屋等级']=='交通一般普通'].index[0]
low = labels[labels['房屋等级']=='交通不便普通'].index[0]

In [33]:
kmeans_type_shift = {
    expensive:'豪宅',
    high:'高端房源',
    mid:'交通便利普通',
    normal:'交通一般普通',
    low:'交通不便普通'
}
housing_data['聚类类别'] = housing_data['聚类类别'].map(kmeans_type_shift)

In [38]:
housing_data=housing_data[['district','type','metro_distance','square','price','聚类类别']]
housing_data.head(10)

Unnamed: 0,district,type,metro_distance,square,price,聚类类别
0,浦东新区,合租,191,18,3560,高端房源
1,杨浦区,整租,150,96,9460,豪宅
2,浦东新区,合租,234,17,3290,高端房源
3,浦东新区,合租,216,17,3030,高端房源
4,杨浦区,合租,243,17,3990,高端房源
5,静安区,合租,686,19,3490,交通便利普通
6,虹口区,合租,444,18,3290,高端房源
7,浦东新区,合租,197,19,2630,交通不便普通
8,松江区,合租,150,21,2460,交通不便普通
9,松江区,合租,150,26,3960,高端房源


In [39]:
housing_data.to_csv('housing_type.csv')

**- 上海各行政区房源**

In [40]:
#grouped_house
grouped_house = housing_data[['metro_distance','square','price']].groupby(housing_data['聚类类别'])

In [41]:
grouped_house.mean().sort_values(by='price',ascending=True).round(2)

Unnamed: 0_level_0,metro_distance,square,price
聚类类别,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
交通不便普通,411.07,14.89,2275.78
交通一般普通,1131.35,18.01,2544.23
交通便利普通,716.23,17.46,3361.64
高端房源,321.15,15.65,3724.2
豪宅,470.0,99.69,8586.25


In [43]:
housing_data['district'].value_counts() # 各区房源总数

浦东新区    364
普陀区     163
闵行区     138
宝山区     103
嘉定区      85
长宁区      85
徐汇区      69
松江区      62
杨浦区      61
静安区      59
虹口区      47
黄浦区      38
青浦区      16
Name: district, dtype: int64

In [44]:
low_type = housing_data[housing_data['聚类类别']=='交通不便普通']['district'].value_counts()
normal_type = housing_data[housing_data['聚类类别']=='交通一般普通']['district'].value_counts()
mid_type = housing_data[housing_data['聚类类别']=='交通便利普通']['district'].value_counts()
high_type = housing_data[housing_data['聚类类别']=='高端房源']['district'].value_counts()
expensive_type = housing_data[housing_data['聚类类别']=='豪宅']['district'].value_counts()

In [45]:
low_type = Series(low_type)
normal_type = Series(normal_type)
mid_type = Series(mid_type)
high_type = Series(high_type)
expensive_type = Series(expensive_type)

In [47]:
temp_pd = pd.DataFrame(housing_data['district'].value_counts())

In [48]:
# '交通不便普通','交通一般普通','交通便利普通','高端房源','豪宅'
temp_pd.insert(1,'交通不便普通',low_type)
temp_pd.insert(2,'交通一般普通',normal_type)
temp_pd.insert(3,'交通便利普通',mid_type)
temp_pd.insert(4,'高端房源',high_type)
temp_pd.insert(5,'豪宅',expensive_type)

In [49]:
shanghai_housing = temp_pd.fillna(0).astype(int)

In [50]:
shanghai_housing

Unnamed: 0,district,交通不便普通,交通一般普通,交通便利普通,高端房源,豪宅
浦东新区,364,91,65,116,91,1
普陀区,163,18,16,39,87,3
闵行区,138,75,39,12,10,2
宝山区,103,60,36,5,0,2
嘉定区,85,53,29,1,0,2
长宁区,85,0,5,45,34,1
徐汇区,69,1,2,17,48,1
松江区,62,43,15,2,2,0
杨浦区,61,6,10,19,25,1
静安区,59,3,4,28,22,2


## 占比分析

**- 上海各行政区房源分析**

In [51]:
housing_rate = shanghai_housing.copy()

In [52]:
housing_rate.insert(6,'豪宅占比',(shanghai_housing['豪宅']/shanghai_housing['district']).round(3))
housing_rate.insert(7,'高端房源占比',(shanghai_housing['高端房源']/shanghai_housing['district']).round(3))
housing_rate.insert(8,'交通便利普通房源占比',(shanghai_housing['交通便利普通']/shanghai_housing['district']).round(3))
housing_rate.insert(9,'交通一般普通房源占比',(shanghai_housing['交通一般普通']/shanghai_housing['district']).round(3))
housing_rate.insert(10,'交通不便普通房源占比',(shanghai_housing['交通不便普通']/shanghai_housing['district']).round(3))
housing_rate

Unnamed: 0,district,交通不便普通,交通一般普通,交通便利普通,高端房源,豪宅,豪宅占比,高端房源占比,交通便利普通房源占比,交通一般普通房源占比,交通不便普通房源占比
浦东新区,364,91,65,116,91,1,0.003,0.25,0.319,0.179,0.25
普陀区,163,18,16,39,87,3,0.018,0.534,0.239,0.098,0.11
闵行区,138,75,39,12,10,2,0.014,0.072,0.087,0.283,0.543
宝山区,103,60,36,5,0,2,0.019,0.0,0.049,0.35,0.583
嘉定区,85,53,29,1,0,2,0.024,0.0,0.012,0.341,0.624
长宁区,85,0,5,45,34,1,0.012,0.4,0.529,0.059,0.0
徐汇区,69,1,2,17,48,1,0.014,0.696,0.246,0.029,0.014
松江区,62,43,15,2,2,0,0.0,0.032,0.032,0.242,0.694
杨浦区,61,6,10,19,25,1,0.016,0.41,0.311,0.164,0.098
静安区,59,3,4,28,22,2,0.034,0.373,0.475,0.068,0.051


In [117]:
data1 = housing_rate['豪宅占比'].values
data2 = housing_rate['高端房源占比'].values
data3 = housing_rate['交通便利普通房源占比'].values
data4 = housing_rate['交通一般普通房源占比'].values
data5 = housing_rate['交通不便普通房源占比'].values
print(data1)
print(data2)
print(data3)
print(data4)
print(data5)

[0.003 0.018 0.014 0.019 0.024 0.012 0.014 0.    0.016 0.034 0.    0.026
 0.   ]
[0.25  0.534 0.072 0.    0.    0.4   0.696 0.032 0.41  0.373 0.383 0.842
 0.   ]
[0.319 0.239 0.087 0.049 0.012 0.529 0.246 0.032 0.311 0.475 0.319 0.132
 0.   ]
[0.179 0.098 0.283 0.35  0.341 0.059 0.029 0.242 0.164 0.068 0.128 0.
 0.   ]
[0.25  0.11  0.543 0.583 0.624 0.    0.014 0.694 0.098 0.051 0.17  0.
 1.   ]


In [139]:
pyecharts.charts import Bar
from pyecharts import options as opts

bar=Bar()
bar.add_xaxis(["浦东新区", "普陀区", "闵行区", "宝山区", "嘉定区", "长宁区", "徐汇区", "松江区", "杨浦区", "静安区", "虹口区", "黄浦区","青浦区"])

bar.add_yaxis('交通便利普通房源占比',[0.319, 0.239, 0.087, 0.049, 0.012, 0.529, 0.246, 0.032, 0.311,
       0.475, 0.319, 0.132, 0.   ],stack='stack1')
bar.add_yaxis('交通一般普通房源占比',[0.179, 0.098, 0.283, 0.35 , 0.341, 0.059, 0.029, 0.242, 0.164,
       0.068, 0.128, 0.   , 0.   ],stack='stack1')
bar.add_yaxis('交通不便普通房源占比',[0.25 , 0.11 , 0.543, 0.583, 0.624, 0.   , 0.014, 0.694, 0.098,
       0.051, 0.17 , 0.   , 1.   ],stack='stack1')
bar.add_yaxis('豪宅占比',[0.003, 0.018, 0.014, 0.019, 0.024, 0.012, 0.014, 0.   , 0.016,
       0.034, 0.   , 0.026, 0.   ],stack='stack1')
bar.add_yaxis('高端房源占比',[0.25,0.534,0.072, 0. , 0.  ,  0.4 ,  0.696 ,0.032, 0.41 , 0.373 ,0.383, 0.842,
 0.   ],stack='stack1')

bar.set_series_opts(label_opts=opts.LabelOpts(is_show=False),
                    markpoint_opts=opts.MarkPointOpts(
                    data=[opts.MarkPointItem(type_='max',name='最大值'),
                        opts.MarkPointItem(type_='min',name='最小值'),
                        opts.MarkPointItem(type_='average',name='平均值')]))
bar.set_global_opts(title_opts=opts.TitleOpts(title='上海各区房源'))
bar.render()
#上海各区房源占比分析

'/Users/jazmin/render.html'