In [25]:
import numpy as np
import pandas as pd 
from sklearn.ensemble import GradientBoostingRegressor as GBDT

import glob

In [2]:
def merge_train_data(files:list)->pd.DataFrame:
    """Read every CSV in ``files`` and stack the rows into a single frame.

    Parameters
    ----------
    files : list
        Paths of the CSV files to load. Rows appear in the result in the
        same order as the paths are given.

    Returns
    -------
    pd.DataFrame
        The rows of all files, concatenated along axis 0. Each file keeps
        its own row labels, so index values repeat across files.

    Raises
    ------
    ValueError
        If ``files`` contains no paths.
    """
    # Read everything first and concatenate once: concatenating inside the
    # loop copies the accumulated frame again on every iteration.
    frames = [pd.read_csv(path) for path in files]
    if not frames:
        # Fail with an explicit message rather than an unbound-variable error.
        raise ValueError("merge_train_data() needs at least one CSV path")
    return pd.concat(frames, axis=0)

def data_extract(df:pd.DataFrame)->pd.DataFrame:
    """Select the modelling columns and give them English names.

    Drops the columns listed below, fills missing '改装' entries with
    '未改装', and renames the remaining Japanese feature columns. The input
    frame is left untouched; a new frame is returned.
    """
    # Columns that are definitely not needed
    unneeded = ['ID','種類','地域', '都道府県名', '市区町村名', '地区名','土地の形状', '間口',
                '延床面積（㎡）','用途','今後の利用目的','前面道路：方位', '前面道路：種類',
                '前面道路：幅員（ｍ）','取引の事情等','最寄駅：名称']
    # Columns that might turn out to be useful later
    maybe_later = ['市区町村コード','都市計画']
    english_names = {
        '最寄駅：距離（分）': 'nearest_station',
        '間取り': 'floor_type',
        '面積（㎡）': 'area',
        '建築年': 'builted_year',
        '建ぺい率（％）': 'coverrage',
        '容積率（％）': 'floor_rate',
        '取引時点': 'sold_year',
        '建物の構造': 'structure',
    }
    return (
        df.drop(unneeded, axis=1)
        .drop(maybe_later, axis=1)
        .fillna({'改装': '未改装'})
        .rename(columns=english_names)
    )

def convert_nearest_station(value):
    """Convert a nearest-station distance entry to a number of minutes.

    The four range labels below map to fixed representative values; any
    other entry is passed through ``float``. A NaN entry yields ``None``.
    """
    # NaN is the only value that does not compare equal to itself.
    if not (value == value):
        return None
    range_to_minutes = (
        ('30分?60分', 45),
        ('1H?1H30', 75),
        ('1H30?2H', 105),
        ('2H?', 120),
    )
    for label, minutes in range_to_minutes:
        if value == label:
            return minutes
    return float(value)

def convert_area(area):
    """Convert an area entry to an integer.

    The label "2000㎡以上" becomes 2000; everything else goes through ``int``.
    """
    return 2000 if area == "2000㎡以上" else int(area)
    
def convert_builted_year(year):
    """Convert a Japanese-era construction year string to a Western year.

    The first two characters select the era; the text between the era and
    the final character is the year within that era ("元" counts as 1).
    "戦前" maps to 1945. A NaN input, or an era other than 昭和 / 平成 / 令和,
    yields ``None``.
    """
    # NaN is the only value that does not compare equal to itself.
    if not (year == year):
        return None

    era = year[:2]
    if era == "戦前":
        return 1945

    # Parse the count before looking at the era so malformed text raises
    # regardless of which era prefix it carries.
    count_text = year[2:-1]
    era_year = 1 if count_text == "元" else int(count_text)

    era_offsets = {"昭和": 1925, "平成": 1988, "令和": 2018}
    offset = era_offsets.get(era)
    return None if offset is None else offset + era_year
        
def convert_sold_year(year):
    """Return the first four characters of ``year`` parsed as an integer."""
    leading_four = year[:4]
    return int(leading_four)
    
            
def data_processing(data:pd.DataFrame)->pd.DataFrame:
    """Convert the columns produced by ``data_extract`` into numeric features.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the columns ``nearest_station``, ``builted_year``,
        ``area``, ``sold_year``, ``改装``, ``floor_type`` and ``structure``.
        It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``data`` in which the four value columns have been passed
        through their ``convert_*`` helpers, ``floor_type``, ``structure``
        and ``改装`` are removed, and a 0/1 column ``改装_改装済`` is
        appended as the last column.
    """
    df = data.copy()
    df["nearest_station"] = df["nearest_station"].apply(convert_nearest_station)
    df["builted_year"] = df["builted_year"].apply(convert_builted_year)
    df["area"] = df["area"].apply(convert_area)
    df["sold_year"] = df["sold_year"].apply(convert_sold_year)

    # Build the indicator directly instead of slicing one column out of
    # pd.get_dummies: that column only exists when at least one row is
    # '改装済', and its dtype differs between pandas versions.
    # Going through a plain array keeps the assignment positional, so
    # repeated index labels cannot cause misalignment.
    renovated = df["改装"].eq("改装済").fillna(False).astype("uint8").to_numpy()

    df = df.drop(["改装", "floor_type", "structure"], axis=1)
    df["改装_改装済"] = renovated
    return df
    
    


In [17]:
# glob returns paths in file-system order, which is not guaranteed to be
# stable; sorting makes the merged training rows come out in the same order
# on every run.
files = sorted(glob.glob("./data/train/*.csv"))
if not files:
    raise FileNotFoundError("no training CSV files found under ./data/train/")

data = merge_train_data(files)
data = data_extract(data)

test = pd.read_csv("./data/test.csv")
test = data_extract(test)

In [18]:
# Run the same feature conversion on the training and the test frame.
# Both names are rebound to the processed result, so this cell must not be
# executed a second time without re-running the loading cell first.
data, test = (data_processing(frame) for frame in (data, test))



Unnamed: 0,nearest_station,area,builted_year,coverrage,floor_rate,sold_year,改装_改装済
0,1.0,75,2016.0,80.0,600.0,2021,0
1,0.0,30,1977.0,80.0,400.0,2021,1
2,3.0,70,1984.0,80.0,400.0,2021,1
3,2.0,50,1989.0,80.0,400.0,2021,0
4,2.0,45,1991.0,80.0,400.0,2021,1


In [27]:
# Name of the regression target, shared by the training frame and the
# submission file.
TARGET_COL = "取引価格（総額）_log"

# Replace every missing value with 0 (a new frame; `data` stays untouched).
train = data.fillna(0)

train_y = train[TARGET_COL]
train_X = train.drop(TARGET_COL, axis=1)

# Select the test features in exactly the training column order so the model
# never sees permuted or extra columns; a missing feature fails here with a
# KeyError instead of inside predict().
test_X = test.fillna(0)[train_X.columns]

# Fixed seed so that re-running the cell reproduces the same model.
gbdt = GBDT(random_state=0)
gbdt.fit(train_X, train_y)
pred = gbdt.predict(test_X)

submit = pd.read_csv('./data/sample_submission.csv')
submit[TARGET_COL] = pred
submit.to_csv('./submits/first_submit_2.csv', index=False)

In [21]:
# Show the first rows of the zero-filled training frame as the cell output.
train.head()

Unnamed: 0,nearest_station,area,builted_year,coverrage,floor_rate,sold_year,取引価格（総額）_log,改装_改装済
0,7.0,45,1988.0,60.0,150.0,2021,7.041393,1
1,4.0,15,1989.0,80.0,400.0,2020,6.60206,0
2,18.0,80,1988.0,60.0,150.0,2020,7.39794,1
3,5.0,60,2006.0,80.0,400.0,2020,7.278754,0
4,6.0,60,1987.0,80.0,400.0,2007,6.929419,0
