In [1]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

In [2]:
# Load the training set, the test set, and the station list from CSV files
# (paths are relative to the notebook's working directory).
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")
# NOTE(review): `staions` looks like a typo for `stations`; it is not
# referenced by any of the visible cells.
staions = pd.read_csv("station_list.csv")

In [3]:
# Handle missing values by filling them (no rows are removed):
#   - reviews_per_month -> 0
#   - last_review       -> the sentinel date string "1900-01-01"
# The same treatment is applied to both the training and the test frame.
for frame in (train, test):
    frame["reviews_per_month"] = frame["reviews_per_month"].fillna(0)
    frame["last_review"] = frame["last_review"].fillna("1900-01-01")

In [4]:
# Labeling for adversarial validation: rows from the training set get
# label 1 and rows from the test set get label 0, then both are stacked
# row-wise into `all_data`. `assign` returns labeled copies, so `train`
# and `test` themselves never carry the temporary "label" column.
all_data = pd.concat(
    [train.assign(label=1), test.assign(label=0)],
    axis=0,
)

In [5]:
# Keep the test ids before the "id" column is dropped; a later cell uses
# them to build the submission frame.
# NOTE(review): the name `id` shadows the Python builtin; it is kept
# because the submission cell reads it under this name.
id = test["id"]

# Identifier and coordinate columns are removed from every frame.
drop_list = ["id", "host_id", "latitude", "longitude"]
for frame in (train, test):
    frame.drop(columns=drop_list, inplace=True)

# The combined frame additionally loses the target column "y".
all_data.drop(columns=["y", *drop_list], inplace=True)

In [6]:
# Inspect which columns remain in the combined frame after the drops above.
all_data.columns

Index(['name', 'neighbourhood', 'room_type', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'availability_365', 'label'],
      dtype='object')

In [7]:
# One-hot encode the "room_type" column of the combined frame; the original
# column is replaced by one indicator column per category.
all_data = pd.get_dummies(all_data,columns=["room_type"])

In [8]:
# Adversarial validation: try to tell training rows (label 1) from test rows
# (label 0) using only the non-object columns. An AUC near 0.5 means the two
# sets are hard to distinguish on these features.
X_all = all_data.drop(["label"], axis=1).select_dtypes(exclude="object")
y_all = all_data["label"]
X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(
    X_all, y_all, test_size=0.33, random_state=42, shuffle=True
)

# A regressor is fit on the 0/1 label and its continuous output is used as
# the ranking score for ROC AUC. The forest is seeded so that the reported
# AUC is reproducible across runs (an unseeded forest gives a different
# number every time the cell is executed).
model = RFR(random_state=42)
model.fit(X_train_adv, y_train_adv)
y_pred = model.predict(X_valid_adv)
print(roc_auc_score(y_valid_adv, y_pred))

0.6654883142848217


In [9]:
# Leave-one-feature-out adversarial validation: refit the train-vs-test
# discriminator with each non-object feature removed in turn and print the
# resulting AUC, labeled with the removed feature.
#
# The label vector and the numeric feature frame do not change between
# iterations, so they are computed once outside the loop.
y_all = all_data["label"]
numeric_features = all_data.drop("label", axis=1).select_dtypes(exclude="object")

for column in numeric_features.columns:
    X_all = numeric_features.drop(column, axis=1)
    X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(
        X_all, y_all, test_size=0.33, random_state=42, shuffle=True
    )
    # Seed the forest: with an unseeded model the AUC differences between
    # features are mixed with run-to-run randomness and cannot be compared.
    model = RFR(random_state=42)
    model.fit(X_train_adv, y_train_adv)
    y_pred = model.predict(X_valid_adv)
    print("{0}:".format(column), roc_auc_score(y_valid_adv, y_pred))

minimum_nights: 0.627546743604505
number_of_reviews: 0.6296911887431552
reviews_per_month: 0.6359016654989134
availability_365: 0.5926314763823745
room_type_Entire home/apt: 0.6648487461899257
room_type_Hotel room: 0.6659502090799145
room_type_Private room: 0.6626425663202085
room_type_Shared room: 0.6641493028438141


In [10]:
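# Disabled experiment: "host_id" has already been dropped from all_data in an
# earlier cell, so this line would raise a KeyError if re-enabled as written.
#all_data.groupby("host_id").mean()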

In [11]:
# One-hot encode "room_type" in the training and test frames.
# The two frames are encoded independently of each other, so each one only
# gets indicator columns for the categories that occur in that frame.
train = pd.get_dummies(train, columns=["room_type"])
test = pd.get_dummies(test, columns=["room_type"])

In [12]:
# Check the training columns after one-hot encoding "room_type".
train.columns

Index(['name', 'neighbourhood', 'minimum_nights', 'number_of_reviews',
       'last_review', 'reviews_per_month', 'availability_365', 'y',
       'room_type_Entire home/apt', 'room_type_Hotel room',
       'room_type_Private room', 'room_type_Shared room'],
      dtype='object')

In [13]:
# Build the model inputs from the non-object columns only ("name",
# "neighbourhood" and "last_review" are strings and are left out).
# `int_columns` holds every non-object training column, including the
# target "y", which is split off below.
int_columns = train.select_dtypes(exclude="object").columns
X_train, y_train = train[int_columns].drop(["y"], axis=1), train["y"]

# Align the test features to the training feature set. train and test were
# one-hot encoded separately, so a room type missing from one of them would
# otherwise yield a different column set (or order) and break or silently
# misalign model.predict. Only indicator columns may be back-filled with 0;
# any other missing feature is a real data problem and must fail loudly.
missing_in_test = X_train.columns.difference(test.columns)
unexpected_missing = [c for c in missing_in_test if not str(c).startswith("room_type_")]
assert not unexpected_missing, f"test is missing feature columns: {unexpected_missing}"
X_test = test.reindex(columns=X_train.columns, fill_value=0)

In [14]:
# Confirm the feature columns that make up the training matrix.
X_train.columns

Index(['minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'availability_365', 'room_type_Entire home/apt', 'room_type_Hotel room',
       'room_type_Private room', 'room_type_Shared room'],
      dtype='object')

In [15]:
# 5-fold cross-validation on the X_train / y_train defined above, scored
# with RMSLE (root mean squared error between log1p of the targets and
# log1p of the predictions).
scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for tr_idx, va_idx in kf.split(X_train):
    tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]

    # Create a fresh, seeded model for every fold. Previously this cell
    # reused whatever `model` object an earlier cell had left in the kernel,
    # so it depended on hidden state and failed if that cell was skipped.
    model = RFR(random_state=42)
    model.fit(tr_x, tr_y)
    va_pred = model.predict(va_x)

    rmsle = np.sqrt(mean_squared_error(np.log1p(va_y), np.log1p(va_pred)))
    scores.append(rmsle)
    print(rmsle)

# Print the mean cross-validation score.
print(f'RMSLE: {np.mean(scores):.4f}')

0.9858424551061419
0.907043401758408
0.9329821920009248
0.9249016352270018
0.9666780476000008
RMSLE: 0.9435


In [16]:
# Fit the final model on the full training set and predict the test set.
# The forest is seeded so that re-running the notebook reproduces the same
# predictions (and therefore the same submission file).
model = RFR(random_state=42)
model.fit(X_train, y_train)
pred = model.predict(X_test)

In [17]:
# Pair the test ids saved earlier with the model predictions and display
# the resulting frame.
submission_columns = {"id": id, "y": pred}
submission = pd.DataFrame(submission_columns)
submission

Unnamed: 0,id,y
0,1,21220.320641
1,2,14828.660000
2,3,90643.610000
3,4,44074.560000
4,5,17720.880000
...,...,...
4991,4992,10954.580000
4992,4993,6699.720000
4993,4994,10793.520000
4994,4995,8509.820000


In [18]:
# Write the submission to disk; index=False keeps the DataFrame index out of
# the file so it only contains the "id" and "y" columns.
submission.to_csv("submission.csv",index=False)