In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.optimizers import Adam
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestRegressor as RFR
import re

In [5]:
# Load the raw training and test tables from the working directory.
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")
# Remove rows that have no target value. dropna returns a new frame, so the
# result must be bound back to `train` for the filter to take effect.
train = train.dropna(subset=["賃料"])
# Separate the target column ("賃料") from the feature columns.
temp_y = train["賃料"]
temp_x = train.drop(["賃料"],axis = 1)

# Hold out a validation split; the fixed seed keeps the split reproducible.
x_train,x_valid,y_train,y_valid = train_test_split(temp_x,temp_y,random_state=7777)

In [None]:
class drop_id:
    """Pipeline step that removes the ``id`` column from a DataFrame."""

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return a new frame equal to ``x`` without its ``id`` column.

        ``y`` is accepted for call compatibility and ignored.
        Raises ``KeyError`` when ``x`` has no ``id`` column.
        """
        return x.drop(columns=["id"])
class parse_area_size:
    """Pipeline step that converts the textual "面積" column to floats."""

    # Leading decimal number of a cell; anything after it (the unit suffix)
    # is ignored.  NOTE(review): assumes cells look like "<number><unit>",
    # e.g. "20.01m2" -- confirm against the data.
    _NUMBER_PATTERN = r"^\s*([0-9]+(?:\.[0-9]+)?)"

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return a copy of ``x`` whose "面積" column holds the numeric area.

        The input frame is left untouched.  Cells without a leading number
        (including missing values) become NaN.  ``y`` is ignored.
        """
        hoge = x.copy()
        # astype(str) lets the .str accessor work even when the column
        # contains missing values or non-string objects.
        numbers = x["面積"].astype(str).str.extract(self._NUMBER_PATTERN, expand=False)
        hoge["面積"] = numbers.astype(float)
        return hoge
class parse_how_old:
    """Pipeline step that splits the "築年数" text into ``year`` and ``month``."""

    def __init__(self):
        # Each pattern captures the digits that precede its unit marker.
        self.year_pat = re.compile(r"([0-9]+)年")
        self.month_pat = re.compile(r"([0-9]+)ヶ月")

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    @staticmethod
    def _count(pattern, text):
        """Return the integer captured by ``pattern`` in ``text``, or 0 if absent."""
        if not isinstance(text, str):
            return 0
        found = pattern.search(text)
        return int(found.group(1)) if found else 0

    def transform(self, x, y=None):
        """Return ``x`` with "築年数" replaced by integer ``year``/``month`` columns.

        A component that does not appear in a cell (for example a label with
        no digits, or a missing value) is reported as 0.  ``y`` is ignored.
        """
        ages = x["築年数"].values
        add_year = [self._count(self.year_pat, age) for age in ages]
        add_month = [self._count(self.month_pat, age) for age in ages]
        # drop/assign return new frames, so their results must be chained.
        return x.drop("築年数", axis=1).assign(year=add_year, month=add_month)
    
class parse_contract_time:
    """Pipeline step that parses the "契約期間" text into numeric columns.

    Adds ``is_teiki`` (1 when the cell contains a tab-separated remark),
    ``cont_year`` and ``cont_month``, and removes the source column.

    Parameters
    ----------
    base_year, base_month : int
        Reference date used to turn an end date ("<year>年<month>月まで")
        into a remaining duration.  Defaults are the values that were
        previously hard-coded (2019, 9).
    """

    def __init__(self, base_year=2019, base_month=9):
        self.base_year = base_year
        self.base_month = base_month
        self.teiki_pat = re.compile(r".*\t.*")
        self.year_pat = re.compile(r"([0-9]+)年間")
        self.month_pat = re.compile(r"([0-9]+)ヶ月間")
        self.due_year_pat = re.compile(r"([0-9]+)年")
        self.due_month_pat = re.compile(r"([0-9]+)月まで")
        self.double_pat = re.compile(r"([0-9]+)年([0-9]+)ヶ月間")

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def _parse(self, text):
        """Return ``(is_teiki, years, months)`` for one cell."""
        if not isinstance(text, str):
            # Missing value: fall back to a 2-year term, not flagged.
            return 0, 2, 0
        is_teiki = 1 if self.teiki_pat.match(text) else 0

        both = self.double_pat.match(text)          # "<n>年<m>ヶ月間"
        if both:
            return is_teiki, int(both.group(1)), int(both.group(2))

        due_month = self.due_month_pat.search(text)  # "... <m>月まで"
        if due_month:
            due_year = self.due_year_pat.search(text)
            if due_year is None:
                return is_teiki, 0, 0
            # Work in whole months so that borrowing a year when the end
            # month precedes the reference month falls out of divmod.
            total = ((int(due_year.group(1)) - self.base_year) * 12
                     + int(due_month.group(1)) - self.base_month)
            years, months = divmod(total, 12)
            return is_teiki, years, months

        only_years = self.year_pat.match(text)       # "<n>年間"
        if only_years:
            return is_teiki, int(only_years.group(1)), 0
        only_months = self.month_pat.match(text)     # "<m>ヶ月間"
        if only_months:
            return is_teiki, 0, int(only_months.group(1))
        # Unrecognised wording: no duration information.
        return is_teiki, 0, 0

    def transform(self, x, y=None):
        """Return ``x`` with "契約期間" replaced by the three parsed columns.

        ``y`` is accepted for call compatibility and ignored.
        """
        parsed = [self._parse(term) for term in x["契約期間"].values]
        return x.drop("契約期間", axis=1).assign(
            is_teiki=[p[0] for p in parsed],
            cont_year=[p[1] for p in parsed],
            cont_month=[p[2] for p in parsed],
        )
        
class parse_rooms:
    """Pipeline step that parses the "間取り" layout code.

    Produces ``room`` (the leading integer of the code) and one 0/1 column
    per letter in ``R``, ``L``, ``D``, ``K``, ``S`` telling whether that
    letter occurs in the code; the source column is removed.
    """

    _LETTERS = ("R", "L", "D", "K", "S")
    _LEADING_DIGITS = re.compile(r"[0-9]+")

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "間取り" replaced by the parsed columns.

        Cells that are missing or do not start with digits get ``room`` 0.
        ``y`` is accepted for call compatibility and ignored.
        """
        room = []
        flags = {letter: [] for letter in self._LETTERS}
        for layout in x["間取り"].values:
            text = layout if isinstance(layout, str) else ""
            # Read every leading digit, not only the first character.
            count = self._LEADING_DIGITS.match(text)
            room.append(int(count.group()) if count else 0)
            for letter in self._LETTERS:
                flags[letter].append(1 if letter in text else 0)
        return x.drop("間取り", axis=1).assign(room=room, **flags)

class structure_label_encoder:
    """Pipeline step that one-hot encodes the "建物構造" column.

    The encoded columns are named ``str0``, ``str1``, ... and replace the
    source column.
    """

    def __init__(self):
        # The result is densified in transform(), so the constructor does not
        # need the sparse/sparse_output keyword, whose name differs between
        # scikit-learn releases.
        self.encoder = OneHotEncoder(handle_unknown="ignore")

    def fit(self, x, y=None):
        """Learn the categories of "建物構造" and return ``self``."""
        # Double brackets keep the input 2-D, as the encoder requires.
        self.encoder.fit(x[["建物構造"]])
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "建物構造" replaced by its indicator columns.

        ``y`` is accepted for call compatibility and ignored.
        """
        encoded = self.encoder.transform(x[["建物構造"]])
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        # Reuse x's index so concat lines rows up even when the index is not
        # 0..n-1 (for example after a shuffled train/validation split).
        tmp = pd.DataFrame(encoded, index=x.index)
        tmp.columns = ["str" + str(i) for i in range(len(tmp.columns))]
        hoge = x.drop("建物構造", axis=1)
        return pd.concat([hoge, tmp], axis=1)
    
class height_encoder:
    """Pipeline step that parses the "所在階" text into two integer columns.

    ``what_floor`` is the floor number found before the "／" separator and
    ``height_bld`` is the number in front of "階建"; the source column is
    removed.
    NOTE(review): assumes cells look like "<n>階／<m>階建", possibly with one
    side missing or extra remarks appended -- confirm against the data.  A
    basement prefix on the floor is not interpreted.
    """

    # A floor is digits followed by 階 that is not part of "階建".
    _FLOOR_PAT = re.compile(r"([0-9]+)階(?!建)")
    _HEIGHT_PAT = re.compile(r"([0-9]+)階建")

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def _parse(self, text):
        """Return ``(floor, building_height)`` for one cell; 0 where unknown."""
        if not isinstance(text, str):
            return 0, 0
        # Only the part before the separator describes the unit's own floor;
        # without a separator the whole text is searched.
        floor_part = text.partition("／")[0]
        floor = self._FLOOR_PAT.search(floor_part)
        height = self._HEIGHT_PAT.search(text)
        return (int(floor.group(1)) if floor else 0,
                int(height.group(1)) if height else 0)

    def transform(self, x, y=None):
        """Return ``x`` with "所在階" replaced by ``what_floor``/``height_bld``.

        ``y`` is accepted for call compatibility and ignored.
        """
        parsed = [self._parse(cell) for cell in x["所在階"].values]
        return x.drop("所在階", axis=1).assign(
            what_floor=[p[0] for p in parsed],
            height_bld=[p[1] for p in parsed],
        )

class direction_encoder:
    """Pipeline step that one-hot encodes the "方角" column.

    Missing values are replaced by "南" before both fitting and transforming.
    The encoded columns are named ``dir0``, ``dir1``, ... and replace the
    source column.
    """

    def __init__(self):
        # The result is densified in transform(), so the constructor does not
        # need the sparse/sparse_output keyword, whose name differs between
        # scikit-learn releases.
        self.encoder = OneHotEncoder(handle_unknown="ignore")

    @staticmethod
    def _filled(x):
        """Return the "方角" column as a one-column frame with NaN filled."""
        # fillna returns a new object; the caller must use the returned value.
        return x[["方角"]].fillna("南")

    def fit(self, x, y=None):
        """Learn the categories of the filled column and return ``self``."""
        self.encoder.fit(self._filled(x))
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "方角" replaced by its indicator columns.

        ``y`` is accepted for call compatibility and ignored.
        """
        encoded = self.encoder.transform(self._filled(x))
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        # Reuse x's index so concat lines rows up even when the index is not
        # 0..n-1 (for example after a shuffled train/validation split).
        tmp = pd.DataFrame(encoded, index=x.index)
        tmp.columns = ["dir" + str(i) for i in range(len(tmp.columns))]
        hoge = x.drop("方角", axis=1)
        return pd.concat([hoge, tmp], axis=1)
class extract_district:
    """Pipeline step that derives a ``district`` column from "所在地".

    The district is the text between the "東京都" prefix and the first "区"
    that follows it; the source column is removed.
    """

    # Non-greedy so the match stops at the first 区 after the prefix.
    _DISTRICT_PAT = re.compile(r"東京都(.+?)区")

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "所在地" replaced by ``district``.

        Cells that are missing or do not match the pattern yield an empty
        string.  ``y`` is accepted for call compatibility and ignored.
        """
        dist = []
        for address in x["所在地"].values:
            m = self._DISTRICT_PAT.search(address) if isinstance(address, str) else None
            dist.append(m.group(1) if m else "")
        return x.drop("所在地", axis=1).assign(district=dist)
            
class district_encoder:
    """Pipeline step that one-hot encodes the ``district`` column.

    The encoded columns are named ``dist0``, ``dist1``, ... and replace the
    source column.
    """

    def __init__(self):
        # The result is densified in transform(), so the constructor does not
        # need the sparse/sparse_output keyword, whose name differs between
        # scikit-learn releases.
        self.encoder = OneHotEncoder(handle_unknown="ignore")

    def fit(self, x, y=None):
        """Learn the categories of ``district`` and return ``self``."""
        # Double brackets keep the input 2-D, as the encoder requires.
        self.encoder.fit(x[["district"]])
        return self

    def transform(self, x, y=None):
        """Return ``x`` with ``district`` replaced by its indicator columns.

        ``y`` is accepted for call compatibility and ignored.
        """
        encoded = self.encoder.transform(x[["district"]])
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        # Reuse x's index so concat lines rows up even when the index is not
        # 0..n-1 (for example after a shuffled train/validation split).
        tmp = pd.DataFrame(encoded, index=x.index)
        tmp.columns = ["dist" + str(i) for i in range(len(tmp.columns))]
        hoge = x.drop("district", axis=1)
        return pd.concat([hoge, tmp], axis=1)
class access_extractor:
    """Pipeline step that derives ``train`` and ``walk`` from "アクセス".

    ``train`` is the first whitespace-separated token of the cell and ``walk``
    is the number in the first "徒歩<n>分" occurrence; the source column is
    removed.
    NOTE(review): presumably the first token is the railway line name and the
    number is walking minutes to the station -- confirm against the data.
    """

    _WALK_PAT = re.compile(r"徒歩([0-9]+)分")

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "アクセス" replaced by ``train`` and ``walk``.

        Missing or empty cells yield ``""`` and 0; a cell without a
        "徒歩<n>分" fragment yields ``walk`` 0.  ``y`` is ignored.
        """
        lines, minutes = [], []
        for cell in x["アクセス"].values:
            tokens = cell.split() if isinstance(cell, str) else []
            lines.append(tokens[0] if tokens else "")
            found = self._WALK_PAT.search(cell) if tokens else None
            minutes.append(int(found.group(1)) if found else 0)
        # axis=1 is required: without it drop() looks for a row label.
        return x.drop("アクセス", axis=1).assign(train=lines, walk=minutes)

class train_encoder:
    """Pipeline step that one-hot encodes the ``train`` column.

    The encoded columns are named ``train0``, ``train1``, ... and replace the
    source column.
    """

    def __init__(self):
        # The result is densified in transform(), so the constructor does not
        # need the sparse/sparse_output keyword, whose name differs between
        # scikit-learn releases.
        self.encoder = OneHotEncoder(handle_unknown="ignore")

    def fit(self, x, y=None):
        """Learn the categories of ``train`` and return ``self``."""
        # Double brackets keep the input 2-D, as the encoder requires.
        self.encoder.fit(x[["train"]])
        return self

    def transform(self, x, y=None):
        """Return ``x`` with ``train`` replaced by its indicator columns.

        ``y`` is accepted for call compatibility and ignored.
        """
        encoded = self.encoder.transform(x[["train"]])
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        # Reuse x's index so concat lines rows up even when the index is not
        # 0..n-1 (for example after a shuffled train/validation split).
        tmp = pd.DataFrame(encoded, index=x.index)
        tmp.columns = ["train" + str(i) for i in range(len(tmp.columns))]
        hoge = x.drop("train", axis=1)
        return pd.concat([hoge, tmp], axis=1)

class bath_encoder:
    """Pipeline step that multi-hot encodes the "バス・トイレ" column.

    Each cell is split on whitespace, "／" is stripped from every token, and
    tokens listed in ``self.keys`` set the matching ``bath<i>`` column to 1.
    The source column is removed.
    """

    def __init__(self):
        # Token -> position of its indicator column.
        self.keys = {'専用バス':0,'専用トイレ':1,'バス・トイレ別':2,'シャワー':3,'浴室乾燥機':4,'温水洗浄便座':5,'洗面台独立':6,'脱衣所':7,'追焚機能':8,'共同トイレ':9,'バスなし':10,'共同バス':11}

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "バス・トイレ" replaced by ``bath0``..``bath11``.

        Missing cells give an all-zero row.  ``y`` is ignored.
        """
        temp = x["バス・トイレ"].values
        setubi = [[0] * len(self.keys) for _ in range(len(temp))]
        for i, cell in enumerate(temp):
            if not isinstance(cell, str):
                continue
            for b in cell.split():
                name = b.replace("／", "")
                if name in self.keys:
                    setubi[i][self.keys[name]] = 1
        col = ["bath" + str(i) for i in range(len(self.keys))]
        # Reuse x's index so concat lines rows up even when the index is not
        # 0..n-1 (for example after a shuffled train/validation split).
        setubi = pd.DataFrame(setubi, columns=col, index=x.index)
        hoge = x.drop("バス・トイレ", axis=1)
        return pd.concat([hoge, setubi], axis=1)

class kitchin_encoder:
    """Pipeline step that multi-hot encodes the "キッチン" column.

    Each cell is split on whitespace and "／" is stripped from every token.
    A token starting with "コンロ設置可" is reduced to exactly that key.
    Tokens listed in ``self.keys`` set the matching ``kit<i>`` column to 1.
    The source column is removed.
    """

    def __init__(self):
        # Token -> position of its indicator column.
        self.keys = {'ガスコンロ': 0, 'コンロ2口': 1, 'システムキッチン': 2, '給湯': 3, '独立キッチン': 4,
                     'コンロ3口': 5, 'IHコンロ': 6, 'コンロ1口': 7, '冷蔵庫あり': 8, 'コンロ設置可': 9,
                     'カウンターキッチン': 10, 'L字キッチン': 11, '電気コンロ': 12, 'コンロ4口以上': 13}

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "キッチン" replaced by ``kit0``..``kit13``.

        Missing cells give an all-zero row.  ``y`` is ignored.
        """
        temp = x["キッチン"].values
        setubi = [[0] * len(self.keys) for _ in range(len(temp))]
        p2 = re.compile(r"コンロ設置可.*")
        for i, cell in enumerate(temp):
            if not isinstance(cell, str):
                continue
            for b in cell.split():
                f = b.replace("／", "")
                # Collapse variants that carry a trailing remark onto the bare key.
                if p2.match(f):
                    f = "コンロ設置可"
                if f in self.keys:
                    setubi[i][self.keys[f]] = 1
        col = ["kit" + str(i) for i in range(len(self.keys))]
        # Reuse x's index so concat lines rows up even when the index is not
        # 0..n-1 (for example after a shuffled train/validation split).
        setubi = pd.DataFrame(setubi, columns=col, index=x.index)
        hoge = x.drop("キッチン", axis=1)
        return pd.concat([hoge, setubi], axis=1)

class fac_encoder:
    """Pipeline step that multi-hot encodes the "室内設備" column.

    Each cell is split on whitespace, "／" is stripped from every token, and
    tokens listed in ``self.keys`` set the matching ``fac<i>`` column to 1.
    The source column is removed.
    """

    def __init__(self):
        # Token -> position of its indicator column.
        self.keys = {'エアコン付': 0, 'シューズボックス': 1, 'バルコニー': 2, 'フローリング': 3,
                     '室内洗濯機置場': 4, '敷地内ごみ置き場': 5, 'エレベーター': 6, '公営水道': 7,
                     '下水': 8, '都市ガス': 9, 'タイル張り': 10, 'ウォークインクローゼット': 11, '2面採光': 12,
                     '24時間換気システム': 13, '3面採光': 14, 'ペアガラス': 15, '専用庭': 16, '水道その他': 17,
                     '冷房': 18, 'クッションフロア': 19, '床暖房': 20, 'プロパンガス': 21, 'ロフト付き': 22,
                     '出窓': 23, 'トランクルーム': 24, 'オール電化': 25, 'ルーフバルコニー': 26, '室外洗濯機置場': 27,
                     '床下収納': 28, 'バリアフリー': 29, '防音室': 30, '二重サッシ': 31, '洗濯機置場なし': 32}

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "室内設備" replaced by ``fac0``..``fac32``.

        Missing cells give an all-zero row.  ``y`` is ignored.
        """
        temp = x["室内設備"].values
        setubi = [[0] * len(self.keys) for _ in range(len(temp))]
        for i, cell in enumerate(temp):
            if not isinstance(cell, str):
                continue
            for b in cell.split():
                name = b.replace("／", "")
                if name in self.keys:
                    setubi[i][self.keys[name]] = 1
        col = ["fac" + str(i) for i in range(len(self.keys))]
        # Reuse x's index so concat lines rows up even when the index is not
        # 0..n-1 (for example after a shuffled train/validation split).
        setubi = pd.DataFrame(setubi, columns=col, index=x.index)
        hoge = x.drop("室内設備", axis=1)
        return pd.concat([hoge, setubi], axis=1)

class info_encoder:
    """Pipeline step that multi-hot encodes the "放送・通信" column.

    Each cell is split on whitespace, "／" is stripped from every token, and
    tokens listed in ``self.keys`` set the matching ``info<i>`` column to 1.
    The source column is removed.
    """

    def __init__(self):
        # Token -> position of its indicator column.
        self.keys = {'インターネット対応': 0, 'CATV': 1, 'CSアンテナ': 2, 'BSアンテナ': 3,
                     '光ファイバー': 4, '高速インターネット': 5, 'インターネット使用料無料': 6, '有線放送': 7}

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "放送・通信" replaced by ``info0``..``info7``.

        Missing cells give an all-zero row.  ``y`` is ignored.
        """
        temp = x["放送・通信"].values
        setubi = [[0] * len(self.keys) for _ in range(len(temp))]
        for i, cell in enumerate(temp):
            if not isinstance(cell, str):
                continue
            for b in cell.split():
                name = b.replace("／", "")
                if name in self.keys:
                    setubi[i][self.keys[name]] = 1
        col = ["info" + str(i) for i in range(len(self.keys))]
        # Reuse x's index so concat lines rows up even when the index is not
        # 0..n-1 (for example after a shuffled train/validation split).
        setubi = pd.DataFrame(setubi, columns=col, index=x.index)
        hoge = x.drop("放送・通信", axis=1)
        return pd.concat([hoge, setubi], axis=1)

class parking_encoder:
    """Pipeline step that encodes the "駐車場" column as six 0/1 columns.

    For each of the three patterns in ``self.exist`` (position j), a
    non-missing cell sets ``park<j>`` to 1 when the pattern is found and
    ``park<j+3>`` to 1 when it is not.  Missing cells leave all six columns
    at 0.  The source column is removed.
    """

    def __init__(self):
        self.exist = [re.compile(r"駐輪場\t空有"),re.compile(r"駐車場\t空有"),re.compile(r"バイク置き場\t空有")]

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "駐車場" replaced by ``park0``..``park5``.

        ``y`` is accepted for call compatibility and ignored.
        """
        temp = x["駐車場"].values
        kinds = len(self.exist)
        d = [[0] * (2 * kinds) for _ in range(len(temp))]
        for i, cell in enumerate(temp):
            if not isinstance(cell, str):
                continue
            for j, pattern in enumerate(self.exist):
                if pattern.search(cell):
                    d[i][j] = 1
                else:
                    d[i][j + kinds] = 1
        col = ["park" + str(i) for i in range(2 * kinds)]
        # Reuse x's index so concat lines rows up even when the index is not
        # 0..n-1 (for example after a shuffled train/validation split).
        setubi = pd.DataFrame(d, columns=col, index=x.index)
        hoge = x.drop("駐車場", axis=1)
        return pd.concat([hoge, setubi], axis=1)

class env_encoder:
    """Pipeline step that multi-hot encodes the "周辺環境" column.

    Each cell is split on whitespace, "／" is stripped from every token, and
    tokens listed in ``self.keys`` set the matching ``env<i>`` column to 1.
    Other tokens are ignored.  The source column is removed.
    """

    def __init__(self):
        # Bracketed token -> position of its indicator column.
        self.keys = {'【小学校】': 0, '【大学】': 1, '【公園】': 2, '【飲食店】': 3,
                     '【スーパー】': 4, '【コンビニ】': 5, '【ドラッグストア】': 6, '【郵便局】': 7,
                     '【病院】': 8, '【図書館】': 9, '【銀行】': 10, '【学校】': 11, '【幼稚園・保育園】': 12,
                     '【総合病院】': 13, '【デパート】': 14, '【レンタルビデオ】': 15, '【クリーニング】': 16}

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, x, y=None):
        """Return ``x`` with "周辺環境" replaced by ``env0``..``env16``.

        Missing cells give an all-zero row.  ``y`` is ignored.
        """
        temp = x["周辺環境"].values
        setubi = [[0] * len(self.keys) for _ in range(len(temp))]
        for i, cell in enumerate(temp):
            if not isinstance(cell, str):
                continue
            for b in cell.split():
                key = b.replace("／", "")
                # Membership in self.keys already implies the bracketed form,
                # so no separate pattern check is needed.
                if key in self.keys:
                    setubi[i][self.keys[key]] = 1
        col = ["env" + str(i) for i in range(len(self.keys))]
        # Reuse x's index so concat lines rows up even when the index is not
        # 0..n-1 (for example after a shuffled train/validation split).
        setubi = pd.DataFrame(setubi, columns=col, index=x.index)
        hoge = x.drop("周辺環境", axis=1)
        return pd.concat([hoge, setubi], axis=1)
        
        
        
        
        
    
        
# Pipeline stages in execution order: every feature transformer defined above,
# followed by the XGBoost regressor. Each (name, factory) pair is instantiated
# once, in the listed order.
steps = [
    (label, factory())
    for label, factory in (
        ("drop_id", drop_id),
        ("parse_area", parse_area_size),
        ("parse_old", parse_how_old),
        ("parse_cont", parse_contract_time),
        ("parse_room", parse_rooms),
        ("str_label", structure_label_encoder),
        ("height_enc", height_encoder),
        ("dir_enc", direction_encoder),
        ("ex_dist", extract_district),
        ("label_dist", district_encoder),
        ("acc_ext", access_extractor),
        ("train_encode", train_encoder),
        ("bath_encoder", bath_encoder),
        ("kit_encoder", kitchin_encoder),
        ("fac_encoder", fac_encoder),
        ("info_encoder", info_encoder),
        ("parking_encoder", parking_encoder),
        ("env_encoder", env_encoder),
        ("xgb", xgb.XGBRegressor),
    )
]

pipe = Pipeline(steps=steps)