In [20]:
import os
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from contextlib import contextmanager
from time import time

class Timer:
    def __init__(self, logger=None, format_str="{:.3f}[s]", prefix=None, suffix=None, sep=" "):

        if prefix: format_str = str(prefix) + sep + format_str
        if suffix: format_str = format_str + sep + str(suffix)
        self.format_str = format_str
        self.logger = logger
        self.start = None
        self.end = None

    @property
    def duration(self):
        if self.end is None:
            return 
        return self.end - self.start

    def __enter__(self):
        self.start = time()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end = time()
        out_str = self.format_str.format(self.duration)
        if self.logger:
            self.logger.info(out_str)
        else:
            print(out_str)

%matplotlib inline

In [21]:
train_df = pd.read_csv("../data/train_utf.csv")
test_df = pd.read_csv("../data/test_utf.csv")

In [22]:
def merge_by_id(left_df, right_df):
    return pd.merge(left_df["ID"], right_df, on="ID", how="left").drop(columns=["ID"])

In [23]:
def create_rent_feature(input_df: pd.DataFrame):
    """input_dfは train or test.csv のデータが入ってくることを想定しています."""

    use_columns = [
        "賃料",
        "共益費",
        "敷金/礼金",
        "保証金",
        "専有面積(m2)"
    ]

    return merge_by_id(input_df, train_df)[use_columns]

In [24]:
create_rent_feature(train_df)

Unnamed: 0,賃料,共益費,敷金/礼金,保証金,専有面積(m2)
0,65000,0,1.0,0.000000,18.23
1,38000,0,0.0,0.000000,18.98
2,118000,10000,1.0,1.000000,30.09
3,78000,2000,1.0,1.000000,22.96
4,47000,0,0.0,2.127660,72.06
...,...,...,...,...,...
65613,62100,6820,0.0,1.288245,21.66
65614,99000,11000,3.0,1.000000,59.76
65615,132000,3000,1.0,1.000000,46.35
65616,59000,7000,0.0,0.000000,24.85


In [25]:
def one_hot_encoding(input_df,val):
    out_df = pd.DataFrame()
    target_colname = val
    target_series = input_df[target_colname]
    unique_values = target_series.unique()

    for value in unique_values:
        is_value = target_series == value
        out_df[value] = is_value.astype(int)
    if len (out_df) == len(input_df):
        return out_df
    else:
        return print("something went wrong")

In [26]:
one_hot_encoding(train_df,"種類")

Unnamed: 0,賃貸アパート,賃貸マンション,賃貸一戸建て,賃貸テラス・タウンハウス,賃貸その他
0,1,0,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,1,0,0,0,0
4,0,0,1,0,0
...,...,...,...,...,...
65613,0,1,0,0,0
65614,0,1,0,0,0
65615,1,0,0,0,0
65616,0,1,0,0,0


In [27]:
one_hot_encoding(test_df,"種類")

Unnamed: 0,賃貸アパート,賃貸マンション,賃貸テラス・タウンハウス,賃貸一戸建て,賃貸その他
0,1,0,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
43744,0,1,0,0,0
43745,0,1,0,0,0
43746,0,1,0,0,0
43747,0,1,0,0,0


In [28]:
def create_room_type_count_encoding(input_df):
    count = input_df["間取り"].map(input_df["間取り"].value_counts())
    encoded_df = pd.DataFrame({
        "間取り_count": count
    })

    return encoded_df

In [29]:
create_room_type_count_encoding(test_df)

Unnamed: 0,間取り_count
0,5684
1,3420
2,7459
3,5684
4,5684
...,...
43744,19602
43745,5684
43746,19602
43747,19602


In [30]:
create_room_type_count_encoding(train_df)

Unnamed: 0,間取り_count
0,29324
1,29324
2,1827
3,29324
4,3071
...,...
65613,29324
65614,3618
65615,8353
65616,29324


In [31]:
def impute(var01):
    walk = var01[0]
    if pd.isnull(walk):
        return 10
    else:
        return walk


In [32]:
def change_floor_year(input_df, floor, year,station,impute):
    input_df[floor] = input_df[floor].replace("平屋","1階建")
    input_df[floor] = input_df[floor].str.extract(r'(\d+)階建').astype(int)
    input_df[year] = input_df[year].replace("新築","築1年")
    input_df[year] = input_df[year].str.extract(r'(\d+)年')
    input_df[year] = input_df[year].astype(int)
    input_df[station] = input_df[station].str.extract(r'(\d+)分')
    input_df[station] = input_df[station].apply(impute,axis=1)
    input_df[station] = input_df[station].astype(int)



In [33]:
change_floor_year(train_df,"階数","築年数","最寄り駅1",impute)

TypeError: impute() got an unexpected keyword argument 'axis'