In [1]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

# データ読み込み

In [2]:
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")
stations = pd.read_csv("station_list.csv")

In [3]:
#欠損値削除
train["reviews_per_month"] = train["reviews_per_month"].fillna(0)
train["last_review"] = train["last_review"].fillna("1900-01-01")
test["reviews_per_month"] = test["reviews_per_month"].fillna(0)
test["last_review"] = test["last_review"].fillna("1900-01-01")

In [4]:
!station_distance.py

distance.csv already existed


In [5]:
distance = pd.read_csv("distance.csv")
#adversarial validationのためのラベル付け
train["label"] = 1
test["label"] = 0
all_data = pd.concat([train,test],axis=0)
all_data["near_station"] = distance["near_station"]
all_data["dist_from_sta"] = distance["dist_from_sta"]
train.drop("label",axis=1,inplace=True)
test.drop("label",axis=1,inplace=True)

# 特徴量追加

In [6]:
#180日以上かどうか
train["180_over"] = (train["availability_365"] > 180).astype(int)
test["180_over"] = (test["availability_365"] > 180).astype(int)
all_data["180_over"] = (all_data["availability_365"] > 180).astype(int)
model = lgb.LGBMRegressor()
one_hot_list = ["room_type","neighbourhood"]

In [7]:
all_data.head()

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,y,label,near_station,dist_from_sta,180_over
0,1,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,242899459,Koto Ku,35.68185,139.8031,Entire home/apt,1,55,2020-04-25,2.21,173,12008.0,1,清澄白河,385.676932,0
1,2,Downtown Tokyo Iriya next to Ueno,308879948,Taito Ku,35.72063,139.78536,Entire home/apt,6,72,2020-03-25,2.11,9,6667.0,1,入谷,155.380783,0
2,3,"Japan Style,Private,Affordable,4min to Sta.",300877823,Katsushika Ku,35.74723,139.82349,Entire home/apt,1,18,2020-03-23,3.46,288,9923.0,1,堀切菖蒲園,369.710584,1
3,4,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,236935461,Shibuya Ku,35.68456,139.68077,Entire home/apt,1,2,2020-04-02,1.76,87,8109.0,1,初台,626.117356,0
4,5,LICENSED SHINJUKU HOUSE: Heart of the action!,243408889,Shinjuku Ku,35.6984,139.70467,Entire home/apt,1,86,2020-01-30,2.0,156,100390.0,1,東新宿,265.953843,0


# 特徴量削除

In [8]:
id = test["id"]
drop_list = ["id","host_id","latitude","longitude","180_over"]
train.drop(drop_list,axis=1,inplace=True)
test.drop(drop_list,axis=1,inplace=True)
all_data.drop(["y"],axis=1,inplace=True)
all_data.drop(drop_list,axis=1,inplace=True)

In [9]:
all_data.columns

Index(['name', 'neighbourhood', 'room_type', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'availability_365', 'label', 'near_station', 'dist_from_sta'],
      dtype='object')

# Adversarial validation

In [10]:
#one_hot_list = ["room_type","neighbourhood",'near_station']
all_data = pd.get_dummies(all_data,columns=one_hot_list)

In [11]:
X_all = all_data.drop(["label"],axis=1).select_dtypes(exclude="object")
y_all = all_data["label"]
X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(X_all, y_all, test_size=0.33, random_state=42, shuffle=True)
#model = RFR()
model.fit(X_train_adv, y_train_adv)
y_pred = model.predict(X_valid_adv)
current_roc_auc_score = roc_auc_score(y_valid_adv,y_pred)
print(current_roc_auc_score)

0.7216248023372918


In [12]:
for column in all_data.drop("label",axis=1).select_dtypes(exclude="object").columns:
    X_all = all_data.drop(["label",column],axis=1).select_dtypes(exclude="object")
    y_all = all_data["label"]
    X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(X_all, y_all, test_size=0.33, random_state=42, shuffle=True)
    scores = []
    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    for tr_idx, va_idx in kf.split(X_all):
        tr_x, va_x = X_all.iloc[tr_idx], X_all.iloc[va_idx]
        tr_y, va_y = y_all.iloc[tr_idx], y_all.iloc[va_idx]
        model.fit(tr_x, tr_y)
        va_pred = model.predict(va_x)
        roc_auc = roc_auc_score(va_y,va_pred)
        scores.append(roc_auc)
    # クロスバリデーションの平均のスコアを出力する
    if current_roc_auc_score - 0.01 > np.mean(scores):
        print("{0}:".format(column),np.mean(scores))


minimum_nights: 0.7044952588217919
availability_365: 0.687978640043642


# name編集

In [10]:
print(all_data[all_data["name"].duplicated()].shape[0])
all_data[all_data["name"].duplicated()].head()

1192


Unnamed: 0,name,neighbourhood,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,label,near_station,dist_from_sta
158,Shin-Ōkubo Station 10min/Cozy&Clean/Free-Wifi,Shinjuku Ku,Private room,1,3,2020-01-22,0.48,259,1,西早稲田,300.421608
427,[Male Only] RUMa INN NishiSugamo☆2019 opened,Toshima Ku,Shared room,1,0,1900-01-01,0.0,248,1,新庚申塚,271.59124
485,JR YAMANOTE Line Ueno/Shinjuku Directly! MAX 4ppl,Arakawa Ku,Private room,1,5,2020-03-20,1.09,337,1,西日暮里,317.00275
523,15mins to Shinjuku/Ikebukuro/Ginza/Tokyo! 3ppl,Shinjuku Ku,Entire home/apt,1,3,2020-02-18,0.94,301,1,都庁前,316.946528
538,newly renovate house near Ikebukuro STa,Toshima Ku,Entire home/apt,12,15,2020-03-18,2.85,270,1,池袋,686.988343


In [11]:
corpus = []
#
#name_df = all_data[all_data["name"].str.contains("[a-z]",regex=True) & ~all_data["name"].str.contains("[一-龥ぁ-んァ-ン]",regex=True)]["name"].str.lower()
name_df = all_data["name"].str.lower()
name_df = name_df.str.replace('[0-9!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〜★〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]'," ",regex=True).str.replace("   "," ").str.replace("  "," ")
name_df.apply(lambda x: corpus.append(x))
#コーパスの作成
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
name_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names()).iloc[:,:]
name_df.shape



(14986, 7729)

In [12]:
total_data = pd.concat([train,test],axis=0).reset_index()
total_data = pd.concat([total_data,name_df],axis=1)
total_data.shape
train = total_data.iloc[:train.shape[0],:]
test = total_data.iloc[train.shape[0]:,:]

In [13]:
all_data.head()

Unnamed: 0,name,neighbourhood,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,label,near_station,dist_from_sta
0,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,Koto Ku,Entire home/apt,1,55,2020-04-25,2.21,173,1,清澄白河,385.676932
1,Downtown Tokyo Iriya next to Ueno,Taito Ku,Entire home/apt,6,72,2020-03-25,2.11,9,1,入谷,155.380783
2,"Japan Style,Private,Affordable,4min to Sta.",Katsushika Ku,Entire home/apt,1,18,2020-03-23,3.46,288,1,堀切菖蒲園,369.710584
3,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,Shibuya Ku,Entire home/apt,1,2,2020-04-02,1.76,87,1,初台,626.117356
4,LICENSED SHINJUKU HOUSE: Heart of the action!,Shinjuku Ku,Entire home/apt,1,86,2020-01-30,2.0,156,1,東新宿,265.953843


# 駅情報等を追加し、モデル作成

In [14]:
#total_data = pd.concat([train,test],axis=0)
total_data["near_station"] = distance["near_station"]
#total_data["dist_from_sta"] = distance["dist_from_sta"]
total_data = pd.get_dummies(total_data,columns=one_hot_list)
total_data.drop("index",axis=1,inplace=True)
train,test = total_data[:train.shape[0]],total_data[train.shape[0]:].drop("y",axis=1)

In [15]:
total_data.head()

Unnamed: 0,name,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,y,aa,aakusa,ab,...,neighbourhood_Setagaya Ku,neighbourhood_Shibuya Ku,neighbourhood_Shinagawa Ku,neighbourhood_Shinjuku Ku,neighbourhood_Suginami Ku,neighbourhood_Sumida Ku,neighbourhood_Taito Ku,neighbourhood_Toshima Ku,neighbourhood_0.0,neighbourhood_0.6464790174151639
0,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,1,55,2020-04-25,2.21,173,12008.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,Downtown Tokyo Iriya next to Ueno,6,72,2020-03-25,2.11,9,6667.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,1,0
2,"Japan Style,Private,Affordable,4min to Sta.",1,18,2020-03-23,3.46,288,9923.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,1,2,2020-04-02,1.76,87,8109.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
4,LICENSED SHINJUKU HOUSE: Heart of the action!,1,86,2020-01-30,2.0,156,100390.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0


In [16]:
train.columns

Index(['name', 'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'availability_365', 'y', 'aa', 'aakusa', 'ab',
       ...
       'neighbourhood_Setagaya Ku', 'neighbourhood_Shibuya Ku',
       'neighbourhood_Shinagawa Ku', 'neighbourhood_Shinjuku Ku',
       'neighbourhood_Suginami Ku', 'neighbourhood_Sumida Ku',
       'neighbourhood_Taito Ku', 'neighbourhood_Toshima Ku',
       'neighbourhood_0.0', 'neighbourhood_0.6464790174151639'],
      dtype='object', length=7765)

In [17]:
int_columns = train.select_dtypes(exclude="object").columns
#yを対数化
X_train,y_train = train[int_columns].drop(["y"],axis=1),train["y"]
X_test = test.select_dtypes(exclude="object")

In [18]:
X_train.columns

Index(['minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'availability_365', 'aa', 'aakusa', 'ab', 'abc', 'abstract', 'acc',
       ...
       'neighbourhood_Setagaya Ku', 'neighbourhood_Shibuya Ku',
       'neighbourhood_Shinagawa Ku', 'neighbourhood_Shinjuku Ku',
       'neighbourhood_Suginami Ku', 'neighbourhood_Sumida Ku',
       'neighbourhood_Taito Ku', 'neighbourhood_Toshima Ku',
       'neighbourhood_0.0', 'neighbourhood_0.6464790174151639'],
      dtype='object', length=7761)

In [19]:
X_train.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,availability_365,aa,aakusa,ab,abc,abstract,acc,...,neighbourhood_Setagaya Ku,neighbourhood_Shibuya Ku,neighbourhood_Shinagawa Ku,neighbourhood_Shinjuku Ku,neighbourhood_Suginami Ku,neighbourhood_Sumida Ku,neighbourhood_Taito Ku,neighbourhood_Toshima Ku,neighbourhood_0.0,neighbourhood_0.6464790174151639
0,1,55,2.21,173,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,6,72,2.11,9,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,1,0
2,1,18,3.46,288,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
3,1,2,1.76,87,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
4,1,86,2.0,156,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0


In [20]:
#定義したX_train,y_trainを入れる
scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for tr_idx, va_idx in kf.split(X_train):
    tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]
    model.fit(tr_x, np.log1p(tr_y))
    va_pred = np.expm1(model.predict(va_x))
    rmsle = np.sqrt(mean_squared_error(np.log1p(va_y), np.log1p(va_pred)))
    scores.append(rmsle)
    print(rmsle)

# クロスバリデーションの平均のスコアを出力する
print(f'RMSLE: {np.mean(scores):.4f}')

0.6350124219854646
0.6336107272610481
0.5934167525023801
0.5939028592154291
0.614322147770816
RMSLE: 0.6141


In [21]:
from sklearn.model_selection import GridSearchCV
cv = KFold(n_splits=3, shuffle=True, random_state=42)
scoring = 'neg_root_mean_squared_error'
fit_params = {#'callbacks': [early_stopping(
              #    stopping_rounds=10, # 学習時、評価指標がこの回数連続で改善しなくなった時点でストップ
              #    verbose=0)],  # 学習中のコマンドライン出力
              'eval_metric': 'rmse',  # early_stopping_roundsの評価指標
              'eval_set': [(X_train,np.log1p(y_train))]  # early_stopping_roundsの評価指標算出用データ
              }
cv_params = {'reg_alpha': [0.0001, 0.003, 0.1],
             'reg_lambda': [0.0001, 0.1],
             'num_leaves': [2, 3, 4, 6],
             'colsample_bytree': [0.4, 0.7, 1.0],
             #'subsample': [0.4, 1.0],
             #'subsample_freq': [0, 7],
             'min_child_samples': [0, 2, 5, 10]
             }
# グリッドサーチのインスタンス作成
gridcv = GridSearchCV(model, cv_params, cv=cv,
                      scoring=scoring, n_jobs=-1)
# グリッドサーチ実行（学習実行）
gridcv.fit(X_train,np.log1p(y_train), **fit_params)

best_params = gridcv.best_params_
best_score = gridcv.best_score_
print(f'最適パラメータ {best_params}\nスコア {best_score}')



MemoryError: Unable to allocate 393. MiB for an array with shape (7729, 6660) and data type float64

In [21]:
#model = RFR()
model.fit(X_train,np.log1p(y_train))
pred = np.expm1(model.predict(X_test))

In [22]:
submission = pd.DataFrame({"id":id,"y":pred})
submission

Unnamed: 0,id,y
0,1,13962.862752
1,2,28583.548606
2,3,11841.506528
3,4,13965.295658
4,5,9193.301903
...,...,...
4991,4992,19040.225473
4992,4993,5476.715084
4993,4994,11093.940434
4994,4995,4854.994312


In [23]:
submission.to_csv("submission.csv",index=False)