In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

1、读取数据集

In [2]:
listings = pd.read_csv('../data/listings.csv.gz')

2、数据集预处理

In [3]:
listings['price'] = listings.price.str.replace('$', '').str.replace(',', '')
listings['price'] = listings.price.astype(float)

In [4]:
listings.amenities = listings.amenities.str.replace('[{}]', '').str.replace('"', '')

In [5]:
listings.amenities = listings.amenities.map(lambda item: item.split(','))

In [6]:
amenities_count = pd.Series(np.concatenate(listings.amenities))

In [7]:
listings = listings.loc[(listings.price <= 600) & (listings.price > 0)]

In [8]:
columns = ['host_is_superhost', 'host_identity_verified', 'host_has_profile_pic',
           'is_location_exact', 'requires_license', 'instant_bookable', 
           'require_guest_profile_picture', 'require_guest_phone_verification'
]
for col in columns:
    listings[col] = listings[col].replace('f', 0, regex=True)
    listings[col] = listings[col].replace('t', 1, regex=True)

In [9]:
listings['security_deposit'] = listings['security_deposit'].fillna(value=0)
listings['security_deposit'] = listings['security_deposit'].replace('[\$,)]', '', regex=True)
listings['security_deposit'] = listings['security_deposit'].astype(float)

listings['cleaning_fee'] = listings['cleaning_fee'].fillna(0)
listings['cleaning_fee'] = listings['cleaning_fee'].replace('[\$,)]', '', regex=True)
listings['cleaning_fee'] = listings['cleaning_fee'].astype(float)

In [10]:
new_columns = columns + ['security_deposit', 'cleaning_fee', 'host_listings_count', 'host_total_listings_count', 'minimum_nights', 'bathrooms', 'bedrooms', 'guests_included', 'number_of_reviews', 'review_scores_rating', 'price']

In [11]:
listings_new = listings[new_columns]

In [12]:
for col in listings_new.columns[listings_new.isnull().any()]:
    print(col)

host_is_superhost
host_identity_verified
host_has_profile_pic
host_listings_count
host_total_listings_count
bathrooms
bedrooms
review_scores_rating


In [13]:
for col in listings_new.columns[listings_new.isnull().any()]:
    listings_new[col] = listings_new[col].fillna(listings_new[col].median())

In [14]:
# features = ['zipcode', 'property_type', 'room_type', 'cancellation_policy', 'neighbourhood_cleansed', 'bed_type']
features = ['property_type', 'room_type', 'cancellation_policy', 'neighbourhood_cleansed', 'bed_type']
for cat_feature in features:
    listings_new = pd.concat([listings_new, pd.get_dummies(listings[cat_feature])], axis=1)

In [15]:
listings_new = pd.concat([listings_new, pd.get_dummies(amenities_count)], axis=1, join='inner')

In [16]:
listings_new.head()

Unnamed: 0,host_is_superhost,host_identity_verified,host_has_profile_pic,is_location_exact,requires_license,instant_bookable,require_guest_profile_picture,require_guest_phone_verification,security_deposit,cleaning_fee,...,Wide clearance to shower,Wide doorway to guest bathroom,Wide entrance,Wide entrance for guests,Wide entryway,Wide hallways,Wifi,Window guards,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50
0,1.0,0.0,1.0,0,0,1,0,0,200.0,60.0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,1.0,1,0,1,0,0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,1.0,0.0,1.0,0,0,0,0,0,300.0,40.0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,1.0,1.0,1,0,1,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,1.0,1.0,1,0,0,0,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


#### 1、随机森林建模

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

In [18]:
y = listings_new.price
x = listings_new.drop('price', axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=22)

In [19]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
x_fit = std.fit(x_train)
x_train = x_fit.transform(x_train)
x_test = x_fit.transform(x_test)

In [20]:
rf = RandomForestRegressor(n_estimators=500, n_jobs=-1)
rf.fit(x_train, y_train)
y_test_predict = rf.predict(x_test)
rmse_rf = mean_squared_error(y_test, y_test_predict)**(1/2)

In [21]:
rmse_rf

60.78573912429199

In [22]:
r2_score(y_test, y_test_predict)

0.488161352740902

#### 2、LightGBM建模----提升机器算法
pip install lightgbm

- Api文档: https://lightgbm.readthedocs.io/en/latest/

In [23]:
from lightgbm import LGBMRegressor

In [25]:
fit_params = {
    'early_stopping_rounds': 10, 
    'eval_metric': 'rmse', 
    'eval_set': [(x_test, y_test)],
    'eval_names': ['valid'],
    'verbose': 100,
}

In [26]:
lgb = LGBMRegressor(max_depth=20, learning_rate=0.01, n_estimators=1000)
lgb.fit(x_train, y_train, **fit_params)

Training until validation scores don't improve for 10 rounds
[100]	valid's rmse: 66.9629	valid's l2: 4484.03
[200]	valid's rmse: 62.615	valid's l2: 3920.64
[300]	valid's rmse: 60.8635	valid's l2: 3704.37
[400]	valid's rmse: 59.999	valid's l2: 3599.89
[500]	valid's rmse: 59.5031	valid's l2: 3540.62
[600]	valid's rmse: 59.2559	valid's l2: 3511.26
[700]	valid's rmse: 59.121	valid's l2: 3495.29
[800]	valid's rmse: 59.0207	valid's l2: 3483.45
[900]	valid's rmse: 58.9611	valid's l2: 3476.42
Early stopping, best iteration is:
[890]	valid's rmse: 58.959	valid's l2: 3476.16


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.01, max_depth=20,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=1000, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [27]:
y_test_predict = lgb.predict(x_test)

In [28]:
r2_score(y_test, y_test_predict)

0.5184633174669431