In [1]:
from google.colab import drive 
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
%cd '/content/drive/My Drive/Google colab Save/kaggle/2.house'

/content/drive/My Drive/Google colab Save/kaggle/2.house


TUTORIAL CODE from [this link](https://www.nishika.com/competitions/11/topics/79)

In [None]:
! pip install xfeat

### import libraries

In [5]:
import re
from functools import partial
from glob import glob

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from xfeat import (ArithmeticCombinations, ConcatCombination,
                   GBDTFeatureExplorer, GBDTFeatureSelector, LabelEncoder,
                   LambdaEncoder, Pipeline, SelectCategorical, SelectNumerical,
                   TargetEncoder, aggregation)

### Functinos for preprocessing
1. normalize_moyori
  - Change string variables having time information.(How many times take to the nearest station)
2. normalize_area
  - Extract integer info from string.
3. convert_wareki_to_seireki
  - Convert Wareki(Japanese expression) into Seireki(Standard expression)

In [6]:
sns.set()


# データ前処理用関数を作成

def normalize_moyori(moyori):
    if moyori == moyori:
        if moyori == '30分?60分':
            moyori = 45
        elif moyori == '1H?1H30':
            moyori = 75
        elif moyori == '1H30?2H':
            moyori = 105
        elif moyori == '2H?':
            moyori = 120
        moyori = int(moyori)
    return moyori


def normalize_area(area):
    if area == area:
        area = int(re.sub('m\^2未満|㎡以上', '', str(area)))
    return area


def convert_wareki_to_seireki(wareki):
    if wareki == wareki:
        if wareki == '戦前':
            wareki = '昭和20年'
        value = wareki[2:-1]
        if value == '元':
            value = 1
        else:
            value = int(value)
        if '昭和' in wareki:
            seireki = 1925+value
        elif '平成' in wareki:
            seireki = 1988+value
        elif '令和' in wareki:
            seireki = 2018+value
    else:
        seireki = wareki
    return seireki

### Read csv

1. Read all train files, test and sample file for submission.
  - There are multiple files for training, a test file and a sample_submission file.
2. Concatenate train and test file.
3. Drop null columns

In [7]:
# データ読み込み

paths = glob('data/train/*')
train_dfs = []
for path in paths:
    train_df = pd.read_csv(path)
    train_dfs.append(train_df)
train_df = pd.concat(train_dfs)
train_df.reset_index(drop=True, inplace=True)
test_df = pd.read_csv('data/test.csv')

sub_df = pd.read_csv('data/sample_submission.csv')

ID = 'ID'
TARGET = '取引価格（総額）_log'
rm_cols = []

test_df[TARGET] = np.nan
df = pd.concat([train_df, test_df])

rm_cols += ['市区町村コード']

for i, v in df.nunique().iteritems():
    if v <= 1:
        rm_cols.append(i)

train_df.drop(rm_cols, axis=1, inplace=True)
test_df.drop(rm_cols, axis=1, inplace=True)
df = pd.concat([train_df, test_df])


Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.



In [12]:
## We can see why the normalizing functions above are needed.
## ex : 1H?1H30 // 平成4年
df.head()

Unnamed: 0,ID,都道府県名,市区町村名,地区名,最寄駅：名称,最寄駅：距離（分）,間取り,面積（㎡）,建築年,建物の構造,用途,今後の利用目的,都市計画,建ぺい率（％）,容積率（％）,取引時点,改装,取引の事情等,取引価格（総額）_log
0,36000732,徳島県,徳島市,川内町,吉成,1H?1H30,３ＬＤＫ,70,平成4年,ＲＣ,,住宅,市街化調整区域,70.0,200.0,2019年第１四半期,改装済,,7.041393
1,36017660,徳島県,徳島市,南末広町,徳島,30分?60分,３ＬＤＫ,70,平成8年,ＳＲＣ,住宅,,準工業地域,60.0,200.0,2013年第４四半期,改装済,,7.079181
2,36002389,徳島県,徳島市,八万町,二軒屋,16,３ＬＤＫ,75,平成21年,ＲＣ,住宅,住宅,第２種中高層住居専用地域,60.0,200.0,2015年第２四半期,未改装,,7.255273
3,36002867,徳島県,徳島市,南田宮,佐古,12,４ＬＤＫ,85,平成20年,ＲＣ,住宅,住宅,準工業地域,60.0,200.0,2018年第１四半期,未改装,,7.431364
4,36001885,徳島県,徳島市,住吉,徳島,20,３ＬＤＫ,95,,ＲＣ,住宅,住宅,,,,2016年第２四半期,,,7.447158


### Feature Encoding
1. 取引時点_enc
  - 取引時点 column : The date of last transection.
  - 取引時点_enc : Labeled integer variable, Ordered by date(取引時点)

In [13]:
# 特徴量生成

enc_dic = {}
for i, e in enumerate(sorted(list(set(df['取引時点'].values)))):
    enc_dic[e] = i
df['取引時点_enc'] = df['取引時点'].map(enc_dic)

In [14]:
df.head()

Unnamed: 0,ID,都道府県名,市区町村名,地区名,最寄駅：名称,最寄駅：距離（分）,間取り,面積（㎡）,建築年,建物の構造,用途,今後の利用目的,都市計画,建ぺい率（％）,容積率（％）,取引時点,改装,取引の事情等,取引価格（総額）_log,取引時点_enc
0,36000732,徳島県,徳島市,川内町,吉成,1H?1H30,３ＬＤＫ,70,平成4年,ＲＣ,,住宅,市街化調整区域,70.0,200.0,2019年第１四半期,改装済,,7.041393,54
1,36017660,徳島県,徳島市,南末広町,徳島,30分?60分,３ＬＤＫ,70,平成8年,ＳＲＣ,住宅,,準工業地域,60.0,200.0,2013年第４四半期,改装済,,7.079181,33
2,36002389,徳島県,徳島市,八万町,二軒屋,16,３ＬＤＫ,75,平成21年,ＲＣ,住宅,住宅,第２種中高層住居専用地域,60.0,200.0,2015年第２四半期,未改装,,7.255273,39
3,36002867,徳島県,徳島市,南田宮,佐古,12,４ＬＤＫ,85,平成20年,ＲＣ,住宅,住宅,準工業地域,60.0,200.0,2018年第１四半期,未改装,,7.431364,50
4,36001885,徳島県,徳島市,住吉,徳島,20,３ＬＤＫ,95,,ＲＣ,住宅,住宅,,,,2016年第２四半期,,,7.447158,43


In [18]:
te_dic = {}
time_col = '取引時点_enc'
group_col = '都道府県名'

for i in set(df[time_col].values):
    tmp_df = df[df[time_col] < i]
    te_dic[i] = tmp_df.groupby(group_col)[TARGET].agg('mean').to_dict()


In [None]:
# Target Encoding
# ある行の特徴量として平均値を計算するのに、その時点で過去に登場したデータの集計を用いる
# 以下例では、都道府県ごとに各取引時点より過去の値の平均値を用いる
te_dic = {}
time_col = '取引時点_enc'
group_col = '都道府県名'

for i in set(df[time_col].values):
    tmp_df = df[df[time_col] < i]
    te_dic[i] = tmp_df.groupby(group_col)[TARGET].agg('mean').to_dict()


def calc_te(row):
    if row[time_col] in te_dic and row[group_col] in te_dic[row[time_col]]:
        return te_dic[row[time_col]][row[group_col]]
    else:
        return 0


df[group_col+'_te'] = df.apply(calc_te, axis=1)

df['取引時点_何年前'] = df['取引時点'].apply(lambda x: 2020-int(x[:4]))
df.drop(['取引時点'], axis=1, inplace=True)
df['建築年'] = df['建築年'].apply(lambda x: convert_wareki_to_seireki(x))
df['面積（㎡）'] = df['面積（㎡）'].apply(lambda x: normalize_area(x))
df['最寄駅：距離（分）'] = df['最寄駅：距離（分）'].apply(lambda x: normalize_moyori(x))

# 数値データを抽出しておく
num_df = SelectNumerical().fit_transform(df)

# カテゴリカルデータを抽出
encoder = Pipeline([
    SelectCategorical(),
    LabelEncoder(output_suffix=""),
])

le_df = encoder.fit_transform(df)

# 数値データを組み合わせた特徴量生成
encoder = Pipeline(
    [
        SelectNumerical(),
        ArithmeticCombinations(
            input_cols=["面積（㎡）", "容積率（％）"],
            drop_origin=True,
            operator="*",
            r=2,
        ),
    ]
)

num_comb_df = encoder.fit_transform(df)/100

# 集約特徴量生成
agg_dfs = []


def get_agg_df(df, group_col):

    agg_df, agg_cols = aggregation(df,
                                   group_key=group_col,
                                   # '前面道路：幅員（ｍ）',
                                   group_values=['最寄駅：距離（分）',
                                                 '面積（㎡）', '建ぺい率（％）', '容積率（％）'],
                                   agg_methods=['count', 'mean', 'min', 'max'],
                                   )

    return agg_df[agg_cols]


group_col = '市区町村名'
agg_dfs.append(get_agg_df(df, group_col))

# 生成した特徴量を結合
feat_df = pd.concat([num_df, le_df, num_comb_df]+agg_dfs, axis=1)

# モデル構築
train_df = feat_df[feat_df['取引時点_何年前'] > 1]
val_df = feat_df[feat_df['取引時点_何年前'] == 1]
test_df = feat_df[feat_df['取引時点_何年前'] == 0]

feat_cols = [col for col in train_df.columns if col not in rm_cols+[ID, TARGET]]

cat_cols = list(le_df.columns) + ['取引時点_enc']

train_x = train_df[feat_cols]
train_y = train_df[TARGET]
val_x = val_df[feat_cols]
val_y = val_df[TARGET]
test_x = test_df[feat_cols]
test_y = test_df[TARGET]

SEED = 0

params = {
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 42,
    'max_depth': 7,
    "feature_fraction": 0.8,
    'subsample_freq': 1,
    "bagging_fraction": 0.95,
    'min_data_in_leaf': 2,
    'learning_rate': 0.1,
    "boosting": "gbdt",
    "lambda_l1": 0.1,
    "lambda_l2": 10,
    "verbosity": -1,
    "random_state": 42,
    "num_boost_round": 50000,
    "early_stopping_rounds": 100
}

train_data = lgb.Dataset(train_x, label=train_y)
val_data = lgb.Dataset(val_x, label=val_y)

model = lgb.train(
    params,
    train_data,
    categorical_feature=cat_cols,
    valid_names=['train', 'valid'],
    valid_sets=[train_data, val_data],
    verbose_eval=100,
)

val_pred = model.predict(val_x, num_iteration=model.best_iteration)
score = mean_absolute_error(val_y, val_pred)

pred_df = pd.DataFrame(sorted(zip(val_x.index, val_pred, val_y)), columns=[
                       'index', 'predict', 'actual'])

feature_imp = pd.DataFrame(sorted(zip(model.feature_importance(
), train_x.columns)), columns=['importance', 'feature'])

print(f'score: {score:.4f}')
# score: 0.0918

# 特徴量の重要度可視化
lgb.plot_importance(model, figsize=(
    12, 8), max_num_features=50, importance_type='gain')
plt.tight_layout()
plt.show()

test_pred = model.predict(test_x, num_iteration=model.best_iteration)

# 投稿ファイル作成
sub_df[TARGET] = test_pred
sub_df.to_csv('../data/output/test_submission.csv', index=False)