In [4]:
import glob
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae

# Load every training CSV under csv/train/ and stack them into one frame.
# glob.glob() returns paths in arbitrary (filesystem-dependent) order, so the
# list is sorted to make the row order of `df` the same on every machine/run.
files = sorted(glob.glob("csv/train/*.csv"))
if not files:
    # Fail early with a clear message instead of pd.concat's
    # "No objects to concatenate" error.
    raise FileNotFoundError("no training CSV files matched csv/train/*.csv")

# The first CSV column is used as the row index of each frame.
data_list = [pd.read_csv(file, index_col=0) for file in files]
df = pd.concat(data_list)

  data_list.append(pd.read_csv(file, index_col=0))
  data_list.append(pd.read_csv(file, index_col=0))


In [7]:
# Print the column names, non-null counts and dtypes of the combined training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 637351 entries, 1060685 to 47003572
Data columns (total 27 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   種類            637351 non-null  object 
 1   地域            0 non-null       float64
 2   市区町村コード       637351 non-null  int64  
 3   都道府県名         637351 non-null  object 
 4   市区町村名         637351 non-null  object 
 5   地区名           637060 non-null  object 
 6   最寄駅：名称        634732 non-null  object 
 7   最寄駅：距離（分）     614306 non-null  object 
 8   間取り           615609 non-null  object 
 9   面積（㎡）         637351 non-null  object 
 10  土地の形状         0 non-null       float64
 11  間口            0 non-null       float64
 12  延床面積（㎡）       0 non-null       float64
 13  建築年           619117 non-null  object 
 14  建物の構造         623423 non-null  object 
 15  用途            591214 non-null  object 
 16  今後の利用目的       275091 non-null  object 
 17  前面道路：方位       0 non-null       float64
 18  前

In [8]:
"""
前処理
XGBoostの例と同じなので、コピペ
"""
def data_pre(df):
    """Clean a raw transaction frame so it can be fed to a gradient-boosting model.

    The input frame is not modified; a new frame is returned.

    Steps, in order:
      1. Drop every column that has no non-null value.
      2. Drop the "市区町村名" and "種類" columns.
      3. Convert "最寄駅：距離（分）" to float, replacing the range labels
         (e.g. "30分?60分") with a representative number of minutes.
      4. Convert "面積（㎡）" to float, replacing "2000㎡以上" with 2000.
      5. Convert "建築年" (Japanese era labels) to a building age in years.
         Labels that cannot be parsed become NaN.
      6. Convert "取引時点" ("<year>年第N四半期") to a float such as 2020.25.
      7. Cast the remaining text columns to the pandas "category" dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.

    Raises
    ------
    KeyError
        If a required column is missing (including when it was dropped in
        step 1 because it was entirely null).
    ValueError
        If a distance, area or "取引時点" value cannot be converted to float.
    """
    # 1. Columns without a single non-null value carry no information.
    empty_cols = [name for name in df.columns if df[name].count() == 0]
    df = df.drop(empty_cols, axis=1)

    # 2. Columns that are not used as features.
    df = df.drop("市区町村名", axis=1)
    df = df.drop("種類", axis=1)

    # 3. Range labels for station distance -> representative minutes.
    dis = {
        "30分?60分":45,
        "1H?1H30":75,
        "2H?":120,
        "1H30?2H":105
    }
    df["最寄駅：距離（分）"] = df["最寄駅：距離（分）"].replace(dis).astype(float)

    # 4. The open-ended top bucket is capped at 2000.
    df["面積（㎡）"] = df["面積（㎡）"].replace("2000㎡以上", 2000).astype(float)

    # 5. Era label -> building age. The offsets (Heisei 33, Reiwa 3, Showa 96)
    #    all correspond to the year 2021 -- presumably the dataset's reference
    #    year; confirm before reusing with newer data.
    era_offsets = {"平成": 33, "令和": 3, "昭和": 96}
    age_by_label = {}
    for label in df["建築年"].dropna().unique():
        if not isinstance(label, str):
            continue
        for era, offset in era_offsets.items():
            if era in label:
                era_year = label.split(era)[1].split("年")[0]
                # "元年" is the first year of an era and is not numeric.
                num = 1.0 if era_year == "元" else float(era_year)
                age_by_label[label] = offset - num
                break
        # Labels matching no era are left out of the mapping on purpose:
        # previously they silently inherited the age of the preceding label.
    age_by_label["戦前"] = 76
    # map() turns unmapped labels into NaN and always yields a float column.
    df["建築年"] = df["建築年"].map(age_by_label).astype(float)

    # 6. Quarter suffix -> decimal fraction appended to the year.
    quarter_fraction = {
        "年第１四半期": ".25",
        "年第２四半期": ".50",
        "年第３四半期": ".75",
        "年第４四半期": ".99"
    }
    period_by_label = {}
    for label in df["取引時点"].dropna().unique():
        if not isinstance(label, str):
            continue
        for suffix, fraction in quarter_fraction.items():
            if suffix in label:
                period_by_label[label] = label.replace(suffix, fraction)
                break
    # An unrecognised label stays a string and makes astype(float) raise,
    # rather than silently reusing the value of another label.
    df["取引時点"] = df["取引時点"].replace(period_by_label).astype(float)

    # 7. Text features are stored as pandas categoricals.
    for col in ["都道府県名", "地区名", "最寄駅：名称", "間取り", "建物の構造", "用途", "今後の利用目的", "都市計画", "改装", "取引の事情等"]:
        df[col] = df[col].astype("category")

    return df
    
# Preprocess the concatenated training frame and rebind `df` to the result.
# NOTE(review): this overwrites the raw frame, so the cell cannot be re-run on its own --
# data_pre drops "市区町村名", and dropping it a second time raises KeyError.
df = data_pre(df)


In [19]:
# Hold out 20% of the rows for validation. A fixed random_state makes the
# split -- and therefore the reported validation MAE -- repeatable across runs.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=0)

# Target column; every other column is used as a feature.
col = '取引価格（総額）_log'
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

trains = lgb.Dataset(train_x, train_y)
# The validation set references the training set so both are built with the
# same feature binning.
valids = lgb.Dataset(val_x, val_y, reference=trains)

params = {
    'objective':'regression',
    'metrics':'mae'
}

# Train for at most 1000 rounds, stopping once the validation metric has not
# improved for 100 consecutive rounds.
model = lgb.train(
    params,
    trains,
    valid_sets=valids,
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=True)],
)

"""
lgb.train() の early_stopping_rounds 引数は廃止されたので、callbacks 引数に lgb.early_stopping() を渡すことで early stopping を有効にする。

"""

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11646
[LightGBM] [Info] Number of data points in the train set: 509880, number of used features: 17
[LightGBM] [Info] Start training from score 7.217676
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[953]	valid_0's l1: 0.0767385


In [42]:
# Compute predictions for the test data: read the test CSV (first column as
# the index) and run it through the data_pre preprocessing function.
df_test = data_pre(pd.read_csv('csv/test.csv', index_col=0))

In [44]:
# Predict on the feature columns only. The prediction is written back into
# df_test below, so on a re-run of this cell the previous prediction column
# must be excluded, otherwise it would be passed to the model as a feature.
target_col = '取引価格（総額）_log'
test_features = df_test.drop(columns=[target_col], errors='ignore')
predict = model.predict(test_features)

# Store the prediction next to the features and write it out, indexed by the
# test frame's index, as the submission file.
df_test[target_col] = predict
df_test[target_col].to_csv('submit_test.csv')

## 画像認識タスクでの実装

In [47]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from tensorflow.keras.datasets import mnist
from sklearn.model_selection import train_test_split

In [48]:
# Seed NumPy's global random number generator with 0.
np.random.seed(0)
# Load MNIST through Keras as (images, labels) pairs for the training and test sets.
(X_train, labels_train), (X_test, labels_test) = mnist.load_data()

In [52]:
# Split the training set 8:2 into training data (X_train, labels_train) and
# validation data (X_validation, labels_validation). random_state is passed
# explicitly so the split does not depend on the global NumPy RNG state left
# behind by whichever cells ran before this one.
# NOTE: this cell overwrites X_train / labels_train / X_test in place, so
# re-running it without first re-running the MNIST load cell would split and
# rescale the data a second time.
X_train, X_validation, labels_train, labels_validation = train_test_split(
    X_train, labels_train, test_size=0.2, random_state=0
)

# Each image is a matrix; flatten it to one row per image (28 x 28 = 784 for
# MNIST), cast to float32 and normalize by dividing by 255.
X_train = X_train.reshape(len(X_train), -1).astype('float32') / 255
X_validation = X_validation.reshape(len(X_validation), -1).astype('float32') / 255
X_test = X_test.reshape(len(X_test), -1).astype('float32') / 255

In [55]:
# Wrap the training and validation arrays as LightGBM datasets.
train_data = lgb.Dataset(X_train, label=labels_train)
eval_data = lgb.Dataset(X_validation, label=labels_validation)

# Start the timer; it covers training, prediction and scoring below.
import time
start = time.time()

# 10-class gradient-boosted decision tree classifier.
params = dict(
    task='train',
    boosting='gbdt',
    objective='multiclass',
    num_class=10,
)

# NOTE(review): stopping_rounds equals num_boost_round (both 100), so early
# stopping cannot trigger before the last round -- confirm whether a smaller
# stopping_rounds or a larger num_boost_round was intended.
early_stop = lgb.early_stopping(stopping_rounds=100, verbose=True)
gbm = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=eval_data,
    callbacks=[early_stop],
)

# Predicted score for each class, one row per test sample.
preds = gbm.predict(X_test)

# For every sample keep the index of the class with the highest score.
y_pred = [np.argmax(class_scores) for class_scores in preds]

from sklearn.metrics import accuracy_score
test_accuracy = accuracy_score(labels_test, y_pred)
print('accuracy_score:{}'.format(test_accuracy))

# Elapsed time
print('elapsed_timetime:{}'.format(time.time()-start))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.245346 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 106940
[LightGBM] [Info] Number of data points in the train set: 48000, number of used features: 621
[LightGBM] [Info] Start training from score -2.319816
[LightGBM] [Info] Start training from score -2.191677
[LightGBM] [Info] Start training from score -2.304253
[LightGBM] [Info] Start training from score -2.277283
[LightGBM] [Info] Start training from score -2.325982
[LightGBM] [Info] Start training from score -2.401945
[LightGBM] [Info] Start training from score -2.317699
[LightGBM] [Info] Start training from score -2.254390
[LightGBM] [Info] Start training from score -2.332186
[LightGBM] [Info] Start training from score -2.314320
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's multi_logloss: 0.0750335
accuracy_score:0.

In [58]:
# Show the per-class prediction scores for the first row of X_test.
preds[0]

array([1.03946033e-06, 3.34329002e-07, 8.98868827e-06, 9.00528217e-05,
       2.98981244e-06, 4.52592236e-06, 1.69127782e-07, 9.99849520e-01,
       2.76789952e-06, 3.96116291e-05])

In [61]:
# Sample count, and a float vector of ones with one entry per sample.
n_samples = 20_000
y = np.full(n_samples, 1.0)

In [66]:
# Shape of the integer range 0 .. n_samples-1 (a 1-D array with n_samples entries).
np.arange(n_samples).shape

(20000,)