In [None]:
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

In [None]:
def _flag_to_number(value):
    """Map the flags ``"t"``/``"f"`` to 1/0; any other value is passed through unchanged."""
    boolean_dic = {"t": 1, "f": 0}
    return boolean_dic.get(value, value)


def _date_to_int(value):
    """Turn a ``YYYY-MM-DD`` value into the integer ``YYYYMMDD``; missing values become 0."""
    if pd.isnull(value):
        return 0
    return int(str(value).replace("-", ""))


def _percent_to_int(value):
    """Turn a percentage string such as ``"95%"`` into ``95``; missing values become 0."""
    if pd.isnull(value):
        return 0
    return int(str(value).rstrip("%"))


def _count_amenities(value):
    """Count the items of an amenities string such as ``'{TV,Kitchen}'``.

    An empty set (``"{}"``) or a missing value counts as 0 items.
    """
    if pd.isnull(value):
        return 0
    inner = str(value).replace("{", "").replace("}", "")
    # Only non-blank tokens are items: "".split(",") yields [""] and must not count as 1.
    return sum(1 for item in inner.split(",") if item.strip())


def data_pre(df):
    """Turn the raw listing table into a numeric feature table.

    The input frame is not modified; a new DataFrame is returned.

    Steps:
      * drop columns that are not used as features: free text / URLs
        ("description", "name", "thumbnail_url"), "first_review" (noted by the
        original author as strongly correlated with "host_since"), and
        "latitude" / "longitude" / "zipcode" (noted by the original author as
        having VIF >= 10);
      * convert the "t"/"f" flag columns to 1.0/0.0 floats (missing stays NaN);
      * convert "host_since" / "last_review" dates to YYYYMMDD integers
        (missing -> 0);
      * convert "host_response_rate" ("95%") to an integer (missing -> 0);
      * encode "city" as an integer code;
      * replace "amenities" by the number of listed items.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns named above. Any other
        column is passed through untouched.

    Returns
    -------
    pandas.DataFrame
        The transformed copy.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a flag, date, rate or city value cannot be converted to a number.
    """
    # "zipcode" is dropped right away: converting it to int first would be
    # wasted work because the column is never used afterwards.
    df = df.drop(
        columns=[
            "description",
            "name",
            "thumbnail_url",
            "first_review",
            "latitude",
            "longitude",
            "zipcode",
        ]
    )

    # Flag columns -> 1.0 / 0.0. Values other than "t"/"f" (e.g. real booleans)
    # are left for astype(float) to convert.
    flag_columns = (
        "cleaning_fee",
        "host_has_profile_pic",
        "host_identity_verified",
        "instant_bookable",
    )
    for col in flag_columns:
        df[col] = df[col].map(_flag_to_number).astype(float)

    # Date columns -> YYYYMMDD integers. Series.map does one pass over the
    # column instead of replace() with one dictionary entry per distinct value.
    for col in ("host_since", "last_review"):
        df[col] = df[col].map(_date_to_int).astype(int)

    # "95%" -> 95
    df["host_response_rate"] = df["host_response_rate"].map(_percent_to_int).astype(int)

    # Encode the city as an integer code. Unknown values are passed through so
    # that astype(int) rejects them instead of silently producing a wrong code.
    city_dic = {
        "NYC": 0,
        "LA": 1,
        "SF": 2,
        "DC": 3,
        "Chicago": 4,
        "Boston": 5,
    }
    df["city"] = df["city"].map(lambda value: city_dic.get(value, value)).astype(int)

    # Score "amenities" by the number of listed items; astype(int) guarantees an
    # integer column regardless of how pandas infers the mapped dtype.
    df["amenities"] = df["amenities"].map(_count_amenities).astype(int)

    return df

In [None]:
def model_lgb(df, test_size=0.2, random_state=None):
    """Train a LightGBM regressor on ``df`` with a hold-out validation split.

    The frame is split into training and validation parts, the column "y" is
    used as the target, and boosting stops early once the validation RMSE has
    not improved for 20 rounds. The validation RMSE is reported by LightGBM
    itself through the configured metric and the logging callback.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table including the target column "y".
    test_size : float, default 0.2
        Share of rows held out for validation (forwarded to
        ``train_test_split``).
    random_state : int or None, default None
        Seed for the split. ``None`` keeps the split random on every call;
        pass an integer for a reproducible split.

    Returns
    -------
    lightgbm.Booster
        The trained model.
    """
    target_col = "y"

    # Columns handed to LightGBM as categorical features. Only those actually
    # present in ``df`` are used.
    categorical_list = ["bed_type", "cancellation_policy", "property_type", "room_type"]
    categorical_list = [name for name in categorical_list if name in df.columns]

    # LightGBM does not accept object/string columns. Cast them to the pandas
    # "category" dtype before splitting so that the training and validation
    # parts share the same category codes. Work on a copy to leave the
    # caller's frame untouched.
    df = df.copy()
    for name in categorical_list:
        if not pd.api.types.is_numeric_dtype(df[name]):
            df[name] = df[name].astype("category")

    df_train, df_val = train_test_split(df, test_size=test_size, random_state=random_state)

    train_y = df_train[target_col]
    train_x = df_train.drop(target_col, axis=1)

    val_y = df_val[target_col]
    val_x = df_val.drop(target_col, axis=1)

    # Categorical features are declared on the Dataset; the validation set is
    # built with reference= so it reuses the training set's binning and
    # categorical settings.
    trains = lgb.Dataset(train_x, train_y, categorical_feature=categorical_list)
    valids = lgb.Dataset(val_x, val_y, reference=trains)

    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
    # https://knknkn.hatenablog.com/entry/2021/06/29/125226
    params = {
        "objective": "regression",
        "metrics": "rmse"
    }

    # https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html
    # https://lightgbm.readthedocs.io/en/latest/Python-API.html
    model = lgb.train(
        params,
        trains,
        valid_sets=[valids],
        num_boost_round=10000,
        callbacks=[
            lgb.early_stopping(stopping_rounds=20, verbose=True),
            lgb.log_evaluation(100),
        ],
    )
    return model