In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

1、读取数据集

In [2]:
listings = pd.read_csv('../data/listings.csv.gz')

2、数据集预处理

In [3]:
listings['price'] = listings.price.str.replace('$', '').str.replace(',', '')
listings['price'] = listings.price.astype(float)

In [4]:
amenities_count

NameError: name 'amenities_count' is not defined

In [5]:
listings.amenities = listings.amenities.str.replace('[{}]', '').str.replace('"', '')

In [6]:
listings.amenities = listings.amenities.map(lambda item: item.split(','))

In [7]:
amenities_count = pd.Series(np.concatenate(listings.amenities))

In [8]:
listings = listings.loc[(listings.price <= 600) & (listings.price > 0)]

In [9]:
columns = ['host_is_superhost', 'host_identity_verified', 'host_has_profile_pic',
           'is_location_exact', 'requires_license', 'instant_bookable', 
           'require_guest_profile_picture', 'require_guest_phone_verification'
]
for col in columns:
    listings[col] = listings[col].replace('f', 0, regex=True)
    listings[col] = listings[col].replace('t', 1, regex=True)

In [10]:
listings['security_deposit'] = listings['security_deposit'].fillna(value=0)
listings['security_deposit'] = listings['security_deposit'].replace('[\$,)]', '', regex=True)
listings['security_deposit'] = listings['security_deposit'].astype(float)

listings['cleaning_fee'] = listings['cleaning_fee'].fillna(0)
listings['cleaning_fee'] = listings['cleaning_fee'].replace('[\$,)]', '', regex=True)
listings['cleaning_fee'] = listings['cleaning_fee'].astype(float)

In [11]:
new_columns = columns + ['security_deposit', 'cleaning_fee', 'host_listings_count', 'host_total_listings_count', 'minimum_nights', 'bathrooms', 'bedrooms', 'guests_included', 'number_of_reviews', 'review_scores_rating', 'price']

In [12]:
listings_new = listings[new_columns]

In [13]:
for col in listings_new.columns[listings_new.isnull().any()]:
    print(col)

host_is_superhost
host_identity_verified
host_has_profile_pic
host_listings_count
host_total_listings_count
bathrooms
bedrooms
review_scores_rating


In [14]:
for col in listings_new.columns[listings_new.isnull().any()]:
    listings_new[col] = listings_new[col].fillna(listings_new[col].median())

In [15]:
# features = ['zipcode', 'property_type', 'room_type', 'cancellation_policy', 'neighbourhood_cleansed', 'bed_type']
features = ['property_type', 'room_type', 'cancellation_policy', 'neighbourhood_cleansed', 'bed_type']
for cat_feature in features:
    listings_new = pd.concat([listings_new, pd.get_dummies(listings[cat_feature])], axis=1)

In [16]:
listings_new = pd.concat([listings_new, pd.get_dummies(amenities_count)], axis=1, join='inner')

In [17]:
listings_new.head()

Unnamed: 0,host_is_superhost,host_identity_verified,host_has_profile_pic,is_location_exact,requires_license,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,security_deposit,cleaning_fee,...,Wide clearance to shower,Wide doorway to guest bathroom,Wide entrance,Wide entrance for guests,Wide entryway,Wide hallways,Wifi,Window guards,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50
0,1.0,0.0,1.0,0,0,1,0,0,200.0,60.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,1.0,1,0,1,0,0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,1.0,0.0,1.0,0,0,0,0,0,300.0,40.0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,1.0,1.0,1,0,1,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,1.0,1.0,1,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


#### 1、随机森林建模

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [19]:
y = listings_new.price
x = listings_new.drop('price', axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=22)

In [20]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
x_fit = std.fit(x_train)
x_train = x_fit.transform(x_train)
x_test = x_fit.transform(x_test)

In [21]:
rf = RandomForestRegressor(n_estimators=500, n_jobs=-1)
rf.fit(x_train, y_train)
y_test_predict = rf.predict(x_test)
rmse_rf = mean_squared_error(y_test, y_test_predict)**(1/2)

In [22]:
rmse_rf

61.06698944169245

In [23]:
r2_score(y_test, y_test_predict)

0.48341392950665707

#### 2、LightGBM建模----提升机器算法
pip install lightgbm

In [24]:
from lightgbm import LGBMRegressor

In [27]:
fit_params = {
    'early_stopping_rounds': 10, 
    'eval_metric': 'rmse', 
    'eval_set': [(x_test, y_test)],
    'eval_names': ['valid'],
    'verbose': 1,
}

In [28]:
lgb = LGBMRegressor(max_depth=20, learning_rate=0.01, n_estimators=1000)
lgb.fit(x_train, y_train, **fit_params)

[1]	valid's rmse: 84.6235	valid's l2: 7161.13
Training until validation scores don't improve for 10 rounds
[2]	valid's rmse: 84.284	valid's l2: 7103.8
[3]	valid's rmse: 83.9506	valid's l2: 7047.71
[4]	valid's rmse: 83.6222	valid's l2: 6992.67
[5]	valid's rmse: 83.301	valid's l2: 6939.06
[6]	valid's rmse: 82.9816	valid's l2: 6885.94
[7]	valid's rmse: 82.6622	valid's l2: 6833.04
[8]	valid's rmse: 82.3564	valid's l2: 6782.58
[9]	valid's rmse: 82.0438	valid's l2: 6731.18
[10]	valid's rmse: 81.7388	valid's l2: 6681.23
[11]	valid's rmse: 81.4416	valid's l2: 6632.73
[12]	valid's rmse: 81.1457	valid's l2: 6584.62
[13]	valid's rmse: 80.8568	valid's l2: 6537.83
[14]	valid's rmse: 80.5742	valid's l2: 6492.19
[15]	valid's rmse: 80.2925	valid's l2: 6446.88
[16]	valid's rmse: 80.0176	valid's l2: 6402.82
[17]	valid's rmse: 79.7452	valid's l2: 6359.3
[18]	valid's rmse: 79.4797	valid's l2: 6317.02
[19]	valid's rmse: 79.2178	valid's l2: 6275.46
[20]	valid's rmse: 78.9593	valid's l2: 6234.57
[21]	valid's

[174]	valid's rmse: 63.4001	valid's l2: 4019.58
[175]	valid's rmse: 63.3751	valid's l2: 4016.4
[176]	valid's rmse: 63.3447	valid's l2: 4012.55
[177]	valid's rmse: 63.3077	valid's l2: 4007.87
[178]	valid's rmse: 63.273	valid's l2: 4003.47
[179]	valid's rmse: 63.2371	valid's l2: 3998.94
[180]	valid's rmse: 63.2026	valid's l2: 3994.56
[181]	valid's rmse: 63.1719	valid's l2: 3990.69
[182]	valid's rmse: 63.1376	valid's l2: 3986.36
[183]	valid's rmse: 63.1108	valid's l2: 3982.97
[184]	valid's rmse: 63.0772	valid's l2: 3978.73
[185]	valid's rmse: 63.0451	valid's l2: 3974.69
[186]	valid's rmse: 63.0139	valid's l2: 3970.75
[187]	valid's rmse: 62.9822	valid's l2: 3966.76
[188]	valid's rmse: 62.9528	valid's l2: 3963.05
[189]	valid's rmse: 62.9196	valid's l2: 3958.87
[190]	valid's rmse: 62.889	valid's l2: 3955.03
[191]	valid's rmse: 62.8606	valid's l2: 3951.46
[192]	valid's rmse: 62.833	valid's l2: 3947.99
[193]	valid's rmse: 62.8058	valid's l2: 3944.57
[194]	valid's rmse: 62.7768	valid's l2: 3940

[352]	valid's rmse: 60.3523	valid's l2: 3642.4
[353]	valid's rmse: 60.3417	valid's l2: 3641.13
[354]	valid's rmse: 60.3328	valid's l2: 3640.05
[355]	valid's rmse: 60.3232	valid's l2: 3638.89
[356]	valid's rmse: 60.3176	valid's l2: 3638.22
[357]	valid's rmse: 60.308	valid's l2: 3637.06
[358]	valid's rmse: 60.2991	valid's l2: 3635.98
[359]	valid's rmse: 60.2907	valid's l2: 3634.97
[360]	valid's rmse: 60.2827	valid's l2: 3634.01
[361]	valid's rmse: 60.2724	valid's l2: 3632.76
[362]	valid's rmse: 60.2651	valid's l2: 3631.88
[363]	valid's rmse: 60.2595	valid's l2: 3631.21
[364]	valid's rmse: 60.25	valid's l2: 3630.06
[365]	valid's rmse: 60.2437	valid's l2: 3629.3
[366]	valid's rmse: 60.2356	valid's l2: 3628.33
[367]	valid's rmse: 60.2291	valid's l2: 3627.55
[368]	valid's rmse: 60.2194	valid's l2: 3626.38
[369]	valid's rmse: 60.2136	valid's l2: 3625.68
[370]	valid's rmse: 60.2047	valid's l2: 3624.61
[371]	valid's rmse: 60.1983	valid's l2: 3623.83
[372]	valid's rmse: 60.1899	valid's l2: 3622.

[530]	valid's rmse: 59.4	valid's l2: 3528.36
[531]	valid's rmse: 59.3953	valid's l2: 3527.8
[532]	valid's rmse: 59.3922	valid's l2: 3527.44
[533]	valid's rmse: 59.3896	valid's l2: 3527.13
[534]	valid's rmse: 59.3848	valid's l2: 3526.56
[535]	valid's rmse: 59.3819	valid's l2: 3526.22
[536]	valid's rmse: 59.3805	valid's l2: 3526.05
[537]	valid's rmse: 59.3778	valid's l2: 3525.72
[538]	valid's rmse: 59.3748	valid's l2: 3525.37
[539]	valid's rmse: 59.3738	valid's l2: 3525.25
[540]	valid's rmse: 59.3729	valid's l2: 3525.14
[541]	valid's rmse: 59.3708	valid's l2: 3524.89
[542]	valid's rmse: 59.3688	valid's l2: 3524.65
[543]	valid's rmse: 59.3653	valid's l2: 3524.24
[544]	valid's rmse: 59.3626	valid's l2: 3523.92
[545]	valid's rmse: 59.3598	valid's l2: 3523.59
[546]	valid's rmse: 59.3577	valid's l2: 3523.33
[547]	valid's rmse: 59.3523	valid's l2: 3522.7
[548]	valid's rmse: 59.3488	valid's l2: 3522.28
[549]	valid's rmse: 59.3492	valid's l2: 3522.33
[550]	valid's rmse: 59.3474	valid's l2: 3522.

[704]	valid's rmse: 59.115	valid's l2: 3494.59
[705]	valid's rmse: 59.1149	valid's l2: 3494.57
[706]	valid's rmse: 59.1136	valid's l2: 3494.41
[707]	valid's rmse: 59.113	valid's l2: 3494.35
[708]	valid's rmse: 59.1121	valid's l2: 3494.25
[709]	valid's rmse: 59.1115	valid's l2: 3494.16
[710]	valid's rmse: 59.1101	valid's l2: 3494
[711]	valid's rmse: 59.1102	valid's l2: 3494.02
[712]	valid's rmse: 59.1092	valid's l2: 3493.89
[713]	valid's rmse: 59.1081	valid's l2: 3493.77
[714]	valid's rmse: 59.1068	valid's l2: 3493.61
[715]	valid's rmse: 59.1036	valid's l2: 3493.24
[716]	valid's rmse: 59.1015	valid's l2: 3492.99
[717]	valid's rmse: 59.0985	valid's l2: 3492.63
[718]	valid's rmse: 59.0973	valid's l2: 3492.49
[719]	valid's rmse: 59.0969	valid's l2: 3492.44
[720]	valid's rmse: 59.0955	valid's l2: 3492.28
[721]	valid's rmse: 59.0948	valid's l2: 3492.19
[722]	valid's rmse: 59.0925	valid's l2: 3491.92
[723]	valid's rmse: 59.0898	valid's l2: 3491.6
[724]	valid's rmse: 59.0877	valid's l2: 3491.3

[878]	valid's rmse: 58.9722	valid's l2: 3477.72
[879]	valid's rmse: 58.9702	valid's l2: 3477.48
[880]	valid's rmse: 58.9684	valid's l2: 3477.27
[881]	valid's rmse: 58.9672	valid's l2: 3477.13
[882]	valid's rmse: 58.9644	valid's l2: 3476.8
[883]	valid's rmse: 58.9637	valid's l2: 3476.72
[884]	valid's rmse: 58.9641	valid's l2: 3476.76
[885]	valid's rmse: 58.9638	valid's l2: 3476.73
[886]	valid's rmse: 58.9623	valid's l2: 3476.55
[887]	valid's rmse: 58.9616	valid's l2: 3476.47
[888]	valid's rmse: 58.9599	valid's l2: 3476.28
[889]	valid's rmse: 58.9596	valid's l2: 3476.23
[890]	valid's rmse: 58.959	valid's l2: 3476.16
[891]	valid's rmse: 58.9597	valid's l2: 3476.25
[892]	valid's rmse: 58.9609	valid's l2: 3476.39
[893]	valid's rmse: 58.9626	valid's l2: 3476.59
[894]	valid's rmse: 58.9605	valid's l2: 3476.34
[895]	valid's rmse: 58.9607	valid's l2: 3476.37
[896]	valid's rmse: 58.9615	valid's l2: 3476.46
[897]	valid's rmse: 58.9629	valid's l2: 3476.62
[898]	valid's rmse: 58.9611	valid's l2: 34

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=20,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [29]:
y_test_predict = lgb.predict(x_test)

In [30]:
r2_score(y_test, y_test_predict)

0.5184633174669431