In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.optimizers import Adam
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestRegressor as RFR
import re

In [3]:
# Load the raw training and test tables from the working directory.
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

# Separate the target column ("賃料") from the feature columns.
temp_x = train.drop(columns=["賃料"])
temp_y = train["賃料"]

# Hold out a validation split; the fixed seed keeps the split reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(
    temp_x,
    temp_y,
    random_state=7777,
)

In [None]:
class drop_id:
    """Pipeline step that removes the ``id`` column from the feature frame."""

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so ``fit(...).transform(...)`` chains."""
        return self

    def transform(self, x):
        """Return a new DataFrame equal to ``x`` without the ``id`` column.

        ``x`` itself is left untouched. Raises ``KeyError`` when ``x`` has no
        ``id`` column.
        """
        # DataFrame.drop is a method: it must be called (not subscripted) and
        # told that "id" names a column rather than a row label.
        return x.drop(columns=["id"])
class parse_area_size:
    """Pipeline step that converts the ``面積`` column from text to floats.

    Each value is expected to be a number followed by a two-character unit
    suffix (presumably like ``"20.01m2"`` — confirm against the data); the
    last two characters are stripped and the rest is parsed as a float.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self, x):
        """Return a copy of ``x`` whose ``面積`` column holds float values.

        The input frame is not modified. Missing values stay ``NaN``.
        Raises ``ValueError`` if a stripped value is not a valid number.
        """
        hoge = x.copy()
        # Vectorised strip + cast: avoids writing through ``.values`` into the
        # caller's frame and yields a numeric column the model can consume.
        hoge["面積"] = x["面積"].str[:-2].astype(float)
        return hoge
class parse_how_old:
    """Pipeline step that splits the ``築年数`` text into ``year`` and ``month``.

    The column is searched for ``<digits>年`` and ``<digits>ヶ月``; whichever
    part is absent (or a non-string / missing value) contributes 0.
    """

    def __init__(self):
        self.year_pat = re.compile(r"[0-9]+年")
        self.month_pat = re.compile(r"[0-9]+ヶ月")

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self, x):
        """Return a new frame without ``築年数`` and with integer ``year`` / ``month``.

        The input frame is not modified.
        """
        add_year = []
        add_month = []
        for raw in x["築年数"]:
            text = raw if isinstance(raw, str) else ""
            year_hit = self.year_pat.search(text)
            month_hit = self.month_pat.search(text)
            # Strip the trailing "年" (1 char) / "ヶ月" (2 chars) before parsing;
            # a value with no such part would otherwise crash on ``None[0]``.
            add_year.append(int(year_hit[0][:-1]) if year_hit else 0)
            add_month.append(int(month_hit[0][:-2]) if month_hit else 0)
        # drop()/assign() return new frames, so the result has to be kept.
        return x.drop("築年数", axis=1).assign(year=add_year, month=add_month)
    
class parse_contract_time:
    """Pipeline step that turns the ``契約期間`` text into numeric features.

    Produces three integer columns and drops the source column:

    * ``is_teiki``   - 1 when the text contains a tab character (matched by
      ``teiki_pat``), else 0.  NOTE(review): the tab presumably marks a
      fixed-term-lease remark in the raw data — confirm.
    * ``cont_year`` / ``cont_month`` - contract length.

    Recognised forms, tried in this order:

    1. ``<Y>年<M>ヶ月間``  -> Y years, M months
    2. ``<YYYY>年<M>月まで`` -> time remaining from the reference date
    3. ``<Y>年間``         -> Y years
    4. ``<M>ヶ月間``       -> M months

    Missing (non-string) values default to 2 years 0 months; strings matching
    none of the forms yield 0 years 0 months.

    Parameters
    ----------
    base_year, base_month : int
        Reference date used for form 2. Defaults (2019, 9) are the values
        that were previously hard-coded.
    """

    def __init__(self, base_year=2019, base_month=9):
        self.base_year = base_year
        self.base_month = base_month
        self.teiki_pat = re.compile(r".*\t.*")
        self.year_pat = re.compile(r"[0-9]+年間")
        self.month_pat = re.compile(r"[0-9]+ヶ月間")
        self.due_year_pat = re.compile(r"[0-9]+年")
        self.due_month_pat = re.compile(r"[0-9]+月まで")
        self.double_pat = re.compile(r"[0-9]+年[0-9]+ヶ月間")

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def _parse_one(self, text):
        """Return ``(is_teiki, years, months)`` for a single cell value."""
        if not isinstance(text, str):
            # Missing value: use the default term and skip all regex work
            # (regex functions raise on a float NaN).
            return 0, 2, 0

        is_teiki = 1 if self.teiki_pat.match(text) else 0

        if self.double_pat.search(text):
            years = int(self.due_year_pat.search(text)[0][:-1])   # strip "年"
            months = int(self.month_pat.search(text)[0][:-3])     # strip "ヶ月間"
            return is_teiki, years, months

        due_month = self.due_month_pat.search(text)
        if due_month:
            due_year = self.due_year_pat.search(text)
            if due_year is None:
                return is_teiki, 0, 0
            # Work in whole months so borrowing across the year boundary is
            # handled by divmod (same result as "if month < 0: year -= 1;
            # month += 12").
            total = (
                (int(due_year[0][:-1]) - self.base_year) * 12
                + int(due_month[0][:-3]) - self.base_month   # strip "月まで"
            )
            years, months = divmod(total, 12)
            return is_teiki, years, months

        year_only = self.year_pat.search(text)
        if year_only:
            return is_teiki, int(year_only[0][:-2]), 0            # strip "年間"

        month_only = self.month_pat.search(text)
        if month_only:
            return is_teiki, 0, int(month_only[0][:-3])           # strip "ヶ月間"

        return is_teiki, 0, 0

    def transform(self, x):
        """Return a new frame without ``契約期間`` and with the three parsed columns.

        The input frame is not modified.
        """
        isteiki = []
        add_year = []
        add_month = []
        for raw in x["契約期間"]:
            flag, years, months = self._parse_one(raw)
            isteiki.append(flag)
            add_year.append(years)
            add_month.append(months)
        # drop()/assign() return new frames, so the result has to be kept.
        return x.drop("契約期間", axis=1).assign(
            is_teiki=isteiki,
            cont_year=add_year,
            cont_month=add_month,
        )
        
class parse_rooms:
    """Pipeline step that expands the ``間取り`` layout code into numeric columns.

    Adds ``room`` (the leading integer of the code, 0 when there is none) and
    one 0/1 flag column for each of the letters ``R``, ``L``, ``D``, ``K``,
    ``S`` indicating whether that letter occurs in the code. The source
    column is dropped.
    """

    def __init__(self):
        pass

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self, x):
        """Return a new frame with ``間取り`` replaced by ``room``, ``R``, ``L``, ``D``, ``K``, ``S``.

        The input frame is not modified. Non-string (missing) values are
        treated as an empty code: ``room`` 0 and all flags 0.
        """
        layouts = [raw if isinstance(raw, str) else "" for raw in x["間取り"]]

        room = []
        for code in layouts:
            # Take every leading digit, not just the first character, so
            # multi-digit room counts are parsed correctly.
            digits = re.match(r"[0-9]+", code)
            room.append(int(digits[0]) if digits else 0)

        head = ["R", "L", "D", "K", "S"]
        flags = {
            letter: [1 if letter in code else 0 for code in layouts]
            for letter in head
        }
        # drop() needs axis=1 to target a column, and both drop()/assign()
        # return new frames that must be kept.
        return x.drop("間取り", axis=1).assign(room=room, **flags)

class structure_label_encoder:
    """Pipeline step that one-hot encodes the ``建物構造`` column.

    The source column is replaced by columns ``str0``, ``str1``, ... — one per
    category learned in ``fit``. Categories unseen during ``fit`` encode as
    all zeros (``handle_unknown="ignore"``).
    """

    def __init__(self):
        # Dense/sparse output is not configured here because the keyword for
        # it differs between scikit-learn versions (``sparse`` vs
        # ``sparse_output``); transform() densifies the result instead.
        self.encoder = OneHotEncoder(handle_unknown="ignore")

    def fit(self, x, y=None):
        """Learn the categories of ``建物構造`` and return ``self``."""
        # The encoder expects a 2-D input, hence the double brackets.
        self.encoder.fit(x[["建物構造"]])
        return self

    def transform(self, x, y=None):
        """Return a new frame with ``建物構造`` replaced by its one-hot columns.

        ``y`` is accepted for backward compatibility and ignored. The input
        frame is not modified and its index is preserved.
        """
        encoded = self.encoder.transform(x[["建物構造"]])
        if hasattr(encoded, "toarray"):
            encoded = encoded.toarray()
        col = ["str" + str(i) for i in range(encoded.shape[1])]
        # Reuse x's index: a fresh RangeIndex would misalign rows in concat
        # whenever x has been shuffled or filtered.
        tmp = pd.DataFrame(encoded, columns=col, index=x.index)
        hoge = x.drop("建物構造", axis=1)
        return pd.concat([hoge, tmp], axis=1)
    
class height_encoder:
    """Pipeline step that parses the ``所在階`` column into two integers.

    The text is split on the full-width slash ``／``. The part before it is
    read as the unit's floor (``<digits>階``, negated when prefixed with
    ``地下``); the building height is read from ``<digits>階建`` anywhere in
    the text. Presumably values look like ``"1階／2階建"`` — confirm against
    the data. A part that is absent contributes 0.

    Adds ``what_floor`` and ``height_bld`` and drops ``所在階``, matching the
    other parse steps that replace their source column.
    """

    def __init__(self):
        # "(?!建)" keeps "<n>階建" (a building height) from being read as a floor.
        self.floor_pat = re.compile(r"(地下)?([0-9]+)階(?!建)")
        self.total_pat = re.compile(r"([0-9]+)階建")

    def fit(self, x, y=None):
        """Nothing is learned; return ``self`` so calls can be chained."""
        return self

    def transform(self, x):
        """Return a new frame with ``所在階`` replaced by ``what_floor`` / ``height_bld``.

        The input frame is not modified. Non-string (missing) values give 0
        for both columns.
        """
        where = []
        what = []
        for raw in x["所在階"]:
            text = raw if isinstance(raw, str) else ""
            floor_part = text.partition("／")[0]

            floor_hit = self.floor_pat.search(floor_part)
            if floor_hit:
                floor = int(floor_hit.group(2))
                if floor_hit.group(1):
                    floor = -floor
            else:
                floor = 0

            total_hit = self.total_pat.search(text)
            where.append(floor)
            what.append(int(total_hit.group(1)) if total_hit else 0)
        # drop()/assign() return new frames, so the result has to be kept.
        return x.drop("所在階", axis=1).assign(what_floor=where, height_bld=what)
        
             
        
# Ordered pipeline stages: each preprocessing transformer feeds the next,
# and the XGBoost regressor at the end is the final estimator.
steps = [
    ("drop_id", drop_id()),
    ("parse_area", parse_area_size()),
    ("parse_old", parse_how_old()),
    ("parse_cont", parse_contract_time()),
    ("parse_room", parse_rooms()),
    ("str_label", structure_label_encoder()),
    ("height_enc", height_encoder()),
    ("xgb", xgb.XGBRegressor()),
]

# Assemble the stages into a single estimator.
pipe = Pipeline(steps)