In [1]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

# データ読み込み

In [2]:
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")
stations = pd.read_csv("station_list.csv")

In [3]:
#欠損値削除
train["reviews_per_month"] = train["reviews_per_month"].fillna(0)
train["last_review"] = train["last_review"].fillna("1900-01-01")
test["reviews_per_month"] = test["reviews_per_month"].fillna(0)
test["last_review"] = test["last_review"].fillna("1900-01-01")

In [4]:
!station_distance.py

distance.csv already existed


In [5]:
distance = pd.read_csv("distance.csv")
#adversarial validationのためのラベル付け
train["label"] = 1
test["label"] = 0
all_data = pd.concat([train,test],axis=0)
all_data["near_station"] = distance["near_station"]
all_data["dist_from_sta"] = distance["dist_from_sta"]
train.drop("label",axis=1,inplace=True)
test.drop("label",axis=1,inplace=True)

# 特徴量追加

In [6]:
train["180_over"] = (train["availability_365"] > 180).astype(int)
test["180_over"] = (test["availability_365"] > 180).astype(int)
all_data["180_over"] = (all_data["availability_365"] > 180).astype(int)

In [7]:
all_data.head()

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,y,label,near_station,dist_from_sta,180_over
0,1,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,242899459,Koto Ku,35.68185,139.8031,Entire home/apt,1,55,2020-04-25,2.21,173,12008.0,1,清澄白河,385.676932,0
1,2,Downtown Tokyo Iriya next to Ueno,308879948,Taito Ku,35.72063,139.78536,Entire home/apt,6,72,2020-03-25,2.11,9,6667.0,1,入谷,155.380783,0
2,3,"Japan Style,Private,Affordable,4min to Sta.",300877823,Katsushika Ku,35.74723,139.82349,Entire home/apt,1,18,2020-03-23,3.46,288,9923.0,1,堀切菖蒲園,369.710584,1
3,4,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,236935461,Shibuya Ku,35.68456,139.68077,Entire home/apt,1,2,2020-04-02,1.76,87,8109.0,1,初台,626.117356,0
4,5,LICENSED SHINJUKU HOUSE: Heart of the action!,243408889,Shinjuku Ku,35.6984,139.70467,Entire home/apt,1,86,2020-01-30,2.0,156,100390.0,1,東新宿,265.953843,0


# 特徴量削除

In [8]:
id = test["id"]
drop_list = ["id","host_id","latitude","longitude"]
train.drop(drop_list,axis=1,inplace=True)
test.drop(drop_list,axis=1,inplace=True)
all_data.drop(["y"],axis=1,inplace=True)
all_data.drop(drop_list,axis=1,inplace=True)

In [9]:
all_data.columns

Index(['name', 'neighbourhood', 'room_type', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'availability_365', 'label', 'near_station', 'dist_from_sta',
       '180_over'],
      dtype='object')

In [10]:
one_hot_list = ["room_type","neighbourhood"]
all_data = pd.get_dummies(all_data,columns=one_hot_list)

In [11]:
X_all = all_data.drop(["label"],axis=1).select_dtypes(exclude="object")
y_all = all_data["label"]
X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(X_all, y_all, test_size=0.33, random_state=42, shuffle=True)
model = RFR()
model.fit(X_train_adv, y_train_adv)
y_pred = model.predict(X_valid_adv)
current_roc_auc_score = roc_auc_score(y_valid_adv,y_pred)
print(current_roc_auc_score)

0.7798816552643362


In [29]:
for column in all_data.drop("label",axis=1).select_dtypes(exclude="object").columns:
    X_all = all_data.drop(["label",column],axis=1).select_dtypes(exclude="object")
    y_all = all_data["label"]
    X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(X_all, y_all, test_size=0.33, random_state=42, shuffle=True)
    scores = []
    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    for tr_idx, va_idx in kf.split(X_all):
        tr_x, va_x = X_all.iloc[tr_idx], X_all.iloc[va_idx]
        tr_y, va_y = y_all.iloc[tr_idx], y_all.iloc[va_idx]
        model.fit(tr_x, tr_y)
        va_pred = model.predict(va_x)
        roc_auc = roc_auc_score(va_y,va_pred)
        scores.append(roc_auc)
    # クロスバリデーションの平均のスコアを出力する
    if current_roc_auc_score - 0.01 > np.mean(scores):
        print("{0}:".format(column),np.mean(scores))

KeyboardInterrupt: 

# 駅情報等を追加し、モデル作成

In [12]:
total_data = pd.concat([train,test],axis=0)
total_data["near_station"] = distance["near_station"]
total_data["dist_from_sta"] = distance["dist_from_sta"]
total_data = pd.get_dummies(total_data,columns=one_hot_list)
train,test = total_data[:train.shape[0]],total_data[train.shape[0]:].drop("y",axis=1)

In [13]:
train.columns

Index(['name', 'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'availability_365', 'y', '180_over',
       'near_station', 'dist_from_sta', 'room_type_Entire home/apt',
       'room_type_Hotel room', 'room_type_Private room',
       'room_type_Shared room', 'neighbourhood_Adachi Ku',
       'neighbourhood_Arakawa Ku', 'neighbourhood_Bunkyo Ku',
       'neighbourhood_Chiyoda Ku', 'neighbourhood_Chuo Ku',
       'neighbourhood_Edogawa Ku', 'neighbourhood_Itabashi Ku',
       'neighbourhood_Katsushika Ku', 'neighbourhood_Kita Ku',
       'neighbourhood_Koto Ku', 'neighbourhood_Meguro Ku',
       'neighbourhood_Minato Ku', 'neighbourhood_Nakano Ku',
       'neighbourhood_Nerima Ku', 'neighbourhood_Ota Ku',
       'neighbourhood_Setagaya Ku', 'neighbourhood_Shibuya Ku',
       'neighbourhood_Shinagawa Ku', 'neighbourhood_Shinjuku Ku',
       'neighbourhood_Suginami Ku', 'neighbourhood_Sumida Ku',
       'neighbourhood_Taito Ku', 'neighbourhood_Toshima Ku'],

In [14]:
int_columns = train.select_dtypes(exclude="object").columns
#yを対数化
X_train,y_train = train[int_columns].drop(["y"],axis=1),train["y"]
X_test = test.select_dtypes(exclude="object")

In [15]:
X_train.columns

Index(['minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'availability_365', '180_over', 'dist_from_sta',
       'room_type_Entire home/apt', 'room_type_Hotel room',
       'room_type_Private room', 'room_type_Shared room',
       'neighbourhood_Adachi Ku', 'neighbourhood_Arakawa Ku',
       'neighbourhood_Bunkyo Ku', 'neighbourhood_Chiyoda Ku',
       'neighbourhood_Chuo Ku', 'neighbourhood_Edogawa Ku',
       'neighbourhood_Itabashi Ku', 'neighbourhood_Katsushika Ku',
       'neighbourhood_Kita Ku', 'neighbourhood_Koto Ku',
       'neighbourhood_Meguro Ku', 'neighbourhood_Minato Ku',
       'neighbourhood_Nakano Ku', 'neighbourhood_Nerima Ku',
       'neighbourhood_Ota Ku', 'neighbourhood_Setagaya Ku',
       'neighbourhood_Shibuya Ku', 'neighbourhood_Shinagawa Ku',
       'neighbourhood_Shinjuku Ku', 'neighbourhood_Suginami Ku',
       'neighbourhood_Sumida Ku', 'neighbourhood_Taito Ku',
       'neighbourhood_Toshima Ku'],
      dtype='object')

In [16]:
#定義したX_train,y_trainを入れる
scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for tr_idx, va_idx in kf.split(X_train):
    tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]
    model.fit(tr_x, np.log1p(tr_y))
    va_pred = np.expm1(model.predict(va_x))
    rmsle = np.sqrt(mean_squared_error(np.log1p(va_y), np.log1p(va_pred)))
    scores.append(rmsle)
    print(rmsle)

# クロスバリデーションの平均のスコアを出力する
print(f'RMSLE: {np.mean(scores):.4f}')

0.7249437197716418
0.6987048742063697
0.6706906917559333
0.6739947097645538
0.7117998688497065
RMSLE: 0.6960


In [17]:
model = RFR()
model.fit(X_train,np.log1p(y_train))
pred = np.expm1(model.predict(X_test))

In [18]:
submission = pd.DataFrame({"id":id,"y":pred})
submission

Unnamed: 0,id,y
0,1,15180.149493
1,2,12124.621287
2,3,18437.061251
3,4,16720.485013
4,5,12674.078856
...,...,...
4991,4992,22668.118245
4992,4993,4548.498286
4993,4994,11053.226268
4994,4995,4423.083663


In [19]:
submission.to_csv("submission.csv",index=False)