In [1]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from sklearn.decomposition import PCA

# データ読み込み

In [2]:
train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")
stations = pd.read_csv("station_list.csv")

In [3]:
#欠損値削除
train["reviews_per_month"] = train["reviews_per_month"].fillna(0)
train["last_review"] = train["last_review"].fillna("1900-01-01")
test["reviews_per_month"] = test["reviews_per_month"].fillna(0)
test["last_review"] = test["last_review"].fillna("1900-01-01")

In [4]:
!station_distance.py

distance.csv already existed


In [5]:
distance = pd.read_csv("distance.csv")
#adversarial validationのためのラベル付け
train["label"] = 1
test["label"] = 0
all_data = pd.concat([train,test],axis=0)
all_data["near_station"] = distance["near_station"]
all_data["dist_from_sta"] = distance["dist_from_sta"]
train.drop("label",axis=1,inplace=True)
test.drop("label",axis=1,inplace=True)

# 特徴量追加

In [6]:
#180日以上かどうか
train["180_over"] = (train["availability_365"] > 180).astype(int)
test["180_over"] = (test["availability_365"] > 180).astype(int)
all_data["180_over"] = (all_data["availability_365"] > 180).astype(int)
model = lgb.LGBMRegressor()
one_hot_list = ["room_type","neighbourhood","minimum_nights"]

In [7]:
all_data.head()

Unnamed: 0,id,name,host_id,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,availability_365,y,label,near_station,dist_from_sta,180_over
0,1,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,242899459,Koto Ku,35.68185,139.8031,Entire home/apt,1,55,2020-04-25,2.21,173,12008.0,1,清澄白河,385.676932,0
1,2,Downtown Tokyo Iriya next to Ueno,308879948,Taito Ku,35.72063,139.78536,Entire home/apt,6,72,2020-03-25,2.11,9,6667.0,1,入谷,155.380783,0
2,3,"Japan Style,Private,Affordable,4min to Sta.",300877823,Katsushika Ku,35.74723,139.82349,Entire home/apt,1,18,2020-03-23,3.46,288,9923.0,1,堀切菖蒲園,369.710584,1
3,4,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,236935461,Shibuya Ku,35.68456,139.68077,Entire home/apt,1,2,2020-04-02,1.76,87,8109.0,1,初台,626.117356,0
4,5,LICENSED SHINJUKU HOUSE: Heart of the action!,243408889,Shinjuku Ku,35.6984,139.70467,Entire home/apt,1,86,2020-01-30,2.0,156,100390.0,1,東新宿,265.953843,0


# 特徴量削除

In [8]:
id = test["id"]
drop_list = ["id","host_id","latitude","longitude","180_over"]
train.drop(drop_list,axis=1,inplace=True)
test.drop(drop_list,axis=1,inplace=True)
all_data.drop(["y"],axis=1,inplace=True)
all_data.drop(drop_list,axis=1,inplace=True)

In [9]:
all_data.columns

Index(['name', 'neighbourhood', 'room_type', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'availability_365', 'label', 'near_station', 'dist_from_sta'],
      dtype='object')

# Adversarial validation

In [10]:
all_data = pd.get_dummies(all_data,columns=one_hot_list)

In [11]:
X_all = all_data.drop(["label"],axis=1).select_dtypes(exclude="object")
y_all = all_data["label"]
X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(X_all, y_all, test_size=0.33, random_state=42, shuffle=True)
model.fit(X_train_adv, y_train_adv)
y_pred = model.predict(X_valid_adv)
current_roc_auc_score = roc_auc_score(y_valid_adv,y_pred)
print(current_roc_auc_score)

0.723498135313605


In [12]:
for column in all_data.drop("label",axis=1).select_dtypes(exclude="object").columns:
    X_all = all_data.drop(["label",column],axis=1).select_dtypes(exclude="object")
    y_all = all_data["label"]
    X_train_adv, X_valid_adv, y_train_adv, y_valid_adv = train_test_split(X_all, y_all, test_size=0.33, random_state=42, shuffle=True)
    scores = []
    kf = KFold(n_splits=5, shuffle=True, random_state=1)
    for tr_idx, va_idx in kf.split(X_all):
        tr_x, va_x = X_all.iloc[tr_idx], X_all.iloc[va_idx]
        tr_y, va_y = y_all.iloc[tr_idx], y_all.iloc[va_idx]
        model.fit(tr_x, tr_y)
        va_pred = model.predict(va_x)
        roc_auc = roc_auc_score(va_y,va_pred)
        scores.append(roc_auc)
    # クロスバリデーションの平均のスコアを出力する
    if current_roc_auc_score - 0.01 > np.mean(scores):
        print("{0}:".format(column),np.mean(scores))


availability_365: 0.6866002963638395


# name編集

In [13]:
corpus = []
#
#name_df = all_data[all_data["name"].str.contains("[a-z]",regex=True) & ~all_data["name"].str.contains("[一-龥ぁ-んァ-ン]",regex=True)]["name"].str.lower()
name_df = all_data["name"].str.lower()
name_df = name_df.str.replace('[0-9!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〜★〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]'," ",regex=True).str.replace("   "," ").str.replace("  "," ")
name_df.apply(lambda x: corpus.append(x))
#コーパスの作成
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
name_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names()).iloc[:,:]
name_df.shape



(14986, 7729)

# 次元圧縮

In [14]:
# 行列の標準化
dfs = name_df.iloc[:, :].apply(lambda x: (x-x.mean())/x.std(), axis=0)
dfs.head()

Unnamed: 0,aa,aakusa,ab,abc,abstract,acc,accepted,access,accessed,accessible,...,ﾙｰﾌﾄｯﾌﾟにｼﾞｬｸﾞｼﾞｰあり,ﾙｰﾌﾊﾞﾙｺﾆｰとﾃﾚﾋﾞ付きお風呂で旅の疲れを吹っ飛ばす,ﾙｰﾑ,ﾚｽﾄﾗﾝ,ﾚﾄﾛｷｭｰﾄな一軒家,ﾚﾝﾀﾙ着物,ﾛｰｶﾙﾗｲﾌ,ﾛﾌﾄ付き,ﾛﾌﾄﾙｰﾑ,ﾜﾝﾌﾛｱ貸切
0,-0.024999,-0.011553,-0.021564,-0.011553,-0.008169,-0.02311,-0.008169,-0.163433,-0.008169,-0.054504,...,-0.008169,-0.008169,-0.008169,-0.008169,-0.008169,-0.008169,-0.011553,-0.011549,-0.008169,-0.008169
1,-0.024999,-0.011553,-0.021564,-0.011553,-0.008169,-0.02311,-0.008169,-0.163433,-0.008169,-0.054504,...,-0.008169,-0.008169,-0.008169,-0.008169,-0.008169,-0.008169,-0.011553,-0.011549,-0.008169,-0.008169
2,-0.024999,-0.011553,-0.021564,-0.011553,-0.008169,-0.02311,-0.008169,-0.163433,-0.008169,-0.054504,...,-0.008169,-0.008169,-0.008169,-0.008169,-0.008169,-0.008169,-0.011553,-0.011549,-0.008169,-0.008169
3,-0.024999,-0.011553,-0.021564,-0.011553,-0.008169,-0.02311,-0.008169,-0.163433,-0.008169,-0.054504,...,-0.008169,-0.008169,-0.008169,-0.008169,-0.008169,-0.008169,-0.011553,-0.011549,-0.008169,-0.008169
4,-0.024999,-0.011553,-0.021564,-0.011553,-0.008169,-0.02311,-0.008169,-0.163433,-0.008169,-0.054504,...,-0.008169,-0.008169,-0.008169,-0.008169,-0.008169,-0.008169,-0.011553,-0.011549,-0.008169,-0.008169


In [15]:
"""from sklearn.decomposition import PCA
#主成分分析の実行
pca = PCA(n_components=3450)
features = pca.fit_transform(dfs)
# データを主成分空間に写像
pca_df = pd.DataFrame(features)"""

In [16]:
#pca.explained_variance_ratio_[:3450].sum()

0.9002269776332812

In [17]:
#主成分分析の実行
pca = PCA(n_components=3860)
features = pca.fit_transform(dfs)
# データを主成分空間に写像
pca_df = pd.DataFrame(features)

'from sklearn.decomposition import PCA\n#主成分分析の実行\npca = PCA(n_components=3860)\nfeatures = pca.fit_transform(dfs)\n# データを主成分空間に写像\npca_df = pd.DataFrame(features)'

In [18]:
pca.explained_variance_ratio_[:3860].sum()

In [19]:
pca_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3440,3441,3442,3443,3444,3445,3446,3447,3448,3449
0,-0.06492,-0.027888,-0.048365,-0.001878,-0.061959,-0.070099,-0.021363,-0.027043,-0.197053,-0.049179,...,-1.453037,0.073147,-1.903173,0.613837,0.20507,-0.042842,-0.913574,-1.713028,0.616476,-0.044628
1,-0.065992,-0.028716,-0.049287,-0.003876,-0.066142,-0.072154,-0.018858,-0.027474,-0.2184,-0.049521,...,-0.189521,-0.826157,-0.354668,-0.463706,-0.276317,1.283111,0.594666,0.597338,1.122335,0.417863
2,-0.070302,-0.030739,-0.05451,-0.004086,-0.070263,-0.078099,-0.001755,-0.024265,-0.230869,-0.010094,...,0.349841,0.247762,1.279089,-0.659497,0.979016,0.296184,-0.691118,0.521736,-0.670789,1.515905
3,-0.111994,-0.052057,-0.093409,-0.007855,-0.126574,-0.14933,-0.008359,-0.049043,-0.411388,-0.102277,...,-0.099892,0.351063,0.37213,-0.262068,-0.390302,-0.041187,0.575261,0.139879,-0.443263,-0.029537
4,-0.062313,-0.026222,-0.046553,-0.002713,-0.054978,-0.06072,-0.020913,-0.02492,-0.186652,-0.041252,...,-1.666309,-2.748309,-0.083537,-1.160172,-2.30897,1.627729,-1.266699,-0.221426,0.379149,4.74145


# 駅情報等を追加し、モデル作成

In [20]:
total_data = pd.concat([train,test],axis=0).reset_index()
#total_data = pd.concat([total_data,name_df],axis=1)
total_data = pd.concat([total_data,pca_df],axis=1)
total_data.shape
train = total_data.iloc[:train.shape[0],:]
test = total_data.iloc[train.shape[0]:,:]

In [21]:
#total_data = pd.concat([train,test],axis=0)
total_data["near_station"] = distance["near_station"]
#total_data["dist_from_sta"] = distance["dist_from_sta"]
total_data = pd.get_dummies(total_data,columns=one_hot_list)
total_data.drop("index",axis=1,inplace=True)
train,test = total_data[:train.shape[0]],total_data[train.shape[0]:].drop("y",axis=1)

In [22]:
total_data.head()

Unnamed: 0,name,number_of_reviews,last_review,reviews_per_month,availability_365,y,0,1,2,3,...,minimum_nights_32,minimum_nights_33,minimum_nights_35,minimum_nights_60,minimum_nights_90,minimum_nights_99,minimum_nights_100,minimum_nights_180,minimum_nights_360,minimum_nights_365
0,KiyosumiShirakawa 3min|★SkyTree★|WIFI|Max4|Tre...,55,2020-04-25,2.21,173,12008.0,-0.06492,-0.027888,-0.048365,-0.001878,...,0,0,0,0,0,0,0,0,0,0
1,Downtown Tokyo Iriya next to Ueno,72,2020-03-25,2.11,9,6667.0,-0.065992,-0.028716,-0.049287,-0.003876,...,0,0,0,0,0,0,0,0,0,0
2,"Japan Style,Private,Affordable,4min to Sta.",18,2020-03-23,3.46,288,9923.0,-0.070302,-0.030739,-0.05451,-0.004086,...,0,0,0,0,0,0,0,0,0,0
3,4 min to Shinjuku Sta. by train / 2 ppl / Wi-fi,2,2020-04-02,1.76,87,8109.0,-0.111994,-0.052057,-0.093409,-0.007855,...,0,0,0,0,0,0,0,0,0,0
4,LICENSED SHINJUKU HOUSE: Heart of the action!,86,2020-01-30,2.0,156,100390.0,-0.062313,-0.026222,-0.046553,-0.002713,...,0,0,0,0,0,0,0,0,0,0


In [23]:
train.columns

Index([              'name',  'number_of_reviews',        'last_review',
        'reviews_per_month',   'availability_365',                  'y',
                          0,                    1,                    2,
                          3,
       ...
        'minimum_nights_32',  'minimum_nights_33',  'minimum_nights_35',
        'minimum_nights_60',  'minimum_nights_90',  'minimum_nights_99',
       'minimum_nights_100', 'minimum_nights_180', 'minimum_nights_360',
       'minimum_nights_365'],
      dtype='object', length=3518)

In [24]:
int_columns = train.select_dtypes(exclude="object").columns
#yを対数化
X_train,y_train = train[int_columns].drop(["y"],axis=1),train["y"]
X_test = test.select_dtypes(exclude="object")

In [25]:
X_train.columns

Index([ 'number_of_reviews',  'reviews_per_month',   'availability_365',
                          0,                    1,                    2,
                          3,                    4,                    5,
                          6,
       ...
        'minimum_nights_32',  'minimum_nights_33',  'minimum_nights_35',
        'minimum_nights_60',  'minimum_nights_90',  'minimum_nights_99',
       'minimum_nights_100', 'minimum_nights_180', 'minimum_nights_360',
       'minimum_nights_365'],
      dtype='object', length=3514)

In [26]:
X_train.head()

Unnamed: 0,number_of_reviews,reviews_per_month,availability_365,0,1,2,3,4,5,6,...,minimum_nights_32,minimum_nights_33,minimum_nights_35,minimum_nights_60,minimum_nights_90,minimum_nights_99,minimum_nights_100,minimum_nights_180,minimum_nights_360,minimum_nights_365
0,55,2.21,173,-0.06492,-0.027888,-0.048365,-0.001878,-0.061959,-0.070099,-0.021363,...,0,0,0,0,0,0,0,0,0,0
1,72,2.11,9,-0.065992,-0.028716,-0.049287,-0.003876,-0.066142,-0.072154,-0.018858,...,0,0,0,0,0,0,0,0,0,0
2,18,3.46,288,-0.070302,-0.030739,-0.05451,-0.004086,-0.070263,-0.078099,-0.001755,...,0,0,0,0,0,0,0,0,0,0
3,2,1.76,87,-0.111994,-0.052057,-0.093409,-0.007855,-0.126574,-0.14933,-0.008359,...,0,0,0,0,0,0,0,0,0,0
4,86,2.0,156,-0.062313,-0.026222,-0.046553,-0.002713,-0.054978,-0.06072,-0.020913,...,0,0,0,0,0,0,0,0,0,0


In [27]:
#定義したX_train,y_trainを入れる
scores = []
kf = KFold(n_splits=5, shuffle=True, random_state=1)
for tr_idx, va_idx in kf.split(X_train):
    tr_x, va_x = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    tr_y, va_y = y_train.iloc[tr_idx], y_train.iloc[va_idx]
    model.fit(tr_x, np.log1p(tr_y))
    va_pred = np.expm1(model.predict(va_x))
    rmsle = np.sqrt(mean_squared_error(np.log1p(va_y), np.log1p(va_pred)))
    scores.append(rmsle)
    print(rmsle)

# クロスバリデーションの平均のスコアを出力する
print(f'RMSLE: {np.mean(scores):.4f}')

0.5769367186299041
0.5879105384781059
0.5524966711233051
0.5326592690373959
0.563528387774397
RMSLE: 0.5627


In [28]:
#model = RFR()
model.fit(X_train,np.log1p(y_train))
pred = np.expm1(model.predict(X_test))

In [29]:
submission = pd.DataFrame({"id":id,"y":pred})
submission

Unnamed: 0,id,y
0,1,13083.430511
1,2,38179.872427
2,3,16982.310500
3,4,14427.673366
4,5,12898.900548
...,...,...
4991,4992,14670.165851
4992,4993,9870.697196
4993,4994,11870.357770
4994,4995,4368.330200


In [30]:
submission.to_csv("submission.csv",index=False)