### 컬럼 뜻 정리

- 기존에 있던 것
['key', 'apartment_id', 'city', 'transaction_year_month',
       'transaction_date', 'year_of_completion', 'exclusive_use_area', 'floor',
       'latitude', 'longitude', 'address_by_law',
       'total_parking_capacity_in_site', 'total_household_count_in_sites',
       'apartment_building_count_in_sites', 'tallest_building_in_sites',
       'lowest_building_in_sites', 'heat_type', 'heat_fuel', 'room_id',
       'supply_area', 'total_household_count_of_area_type', 'room_count',
       'bathroom_count', 'front_door_structure', 'transaction_real_price']
       
1. 'logPrice' : 실거래가 로그 정규화
2. 'exAreaPrice', : 전용면적 당 실거래가 = 실거래가 / 전용면적
3. 'supAreaPrice', : 공급면적 당 실거래가 = 실거래가 / 공급면적
4. 'logExAreaPrice', : 전용면적 당 실거래가 로그 정규화
5. 'logSupAreaPrice', : 공급면적 당 실거래가 로그 정규화
6. 'transYear', : 거래 연도
7. 'transMonth', : 거래 월
8. 'transDate', : 거래 일 1,11,21
9. 'transYMD', : yyyymmdd 형태 거래 일자
10. 'transOrdered', : d=1,2,3 / m=3 / y=36으로 계산해서 yyyymmdd형태를 리스케일링
11. 'commonArea', : 공용 면적
12. 'areaRate', : 전용률 = 전용면적/공급면적
13. 'district', : 구 법정동 코드
14. 'town', : 동 법정동 코드
15. 'disTown', : 구+동 법정동 코드
16. 'cityDisTown', : 시+구+동 법정동 코드
17. 'townUnitPrice', : 시+구+동 법정동 코드 별 공급면적 제곱미터 당 실거래가 중위값
18. 'unitPrice', : 공급면적 제곱미터 당 실거래가
19. 'apartUnitPrice', : 아파트 단지별 공급면적 제곱미터 당 실거래가 중위값
20. 'sub800', : 800m 이내 지하철 갯수 (환승역은 노선수를 따로 셈, 1호선+2호선 환승역이면 한 역을 두 번 셈)
21. 'elementSchool800', : 800m 이내 초등학교 갯수
22. 'middleSchool800', : 800m 이내 중학교 갯수
23. 'highSchool800', : 800m 이내 고등학교 갯수
24. 'sub800Price' : 지하철 시+구+동 법정동 코드에 해당하는 unitPrice (800m 이내에 있는 지하철역 가격 중에서 max값)
25. 'logTownUnitPrice' : townUnitPrice 로그 정규화
26. 'logApartUnitPrice' : apartUnitPrice 로그 정규화
27. 'logSub800Price' : sub800Price 로그 정규화
28. apart : 단지별 평당가를 활용한 줄세우기

In [1]:
import pandas as pd
import numpy as np
import os, sys, warnings, re, time, gc,math
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.distributions.empirical_distribution import ECDF
from sklearn.linear_model import RidgeCV as ridge
import lightgbm as lgb

warnings.filterwarnings('ignore')
sys.path.append(os.path.abspath(os.path.dirname('../')))
from modules import eda
gc.collect()

0

In [2]:
%%time
# Load the raw CSV files; they are assumed to sit in the current working directory
# (a previously used alternative location was './datasets/origin/').
train, test, school, subway = (
    pd.read_csv(fileName)
    for fileName in ('train.csv', 'test.csv', 'Schools.csv', 'Subways.csv')
)
gc.collect()

Wall time: 5.74 s


In [3]:
# Price features (added to train only)
## log price
realPrice = train['transaction_real_price']
train['logPrice'] = np.log(realPrice)

## price per unit of exclusive-use area / supply area, and their logs.
## Column arithmetic replaces the per-row DataFrame.apply(axis=1), which makes
## one Python call per row; a zero area now yields inf/NaN instead of raising.
train['exAreaPrice'] = realPrice / train['exclusive_use_area']
train['supAreaPrice'] = realPrice / train['supply_area']
train['logExAreaPrice'] = np.log(train['exAreaPrice'])
train['logSupAreaPrice'] = np.log(train['supAreaPrice'])

# transaction Y-M-D: year and month are sliced from the yyyymm value, the day is
# the part of transaction_date before '~'
for frame in (train, test):
    yearMonth = frame['transaction_year_month'].astype(str)
    frame['transYear'] = yearMonth.str[:4].astype(int)
    frame['transMonth'] = yearMonth.str[4:].astype(int)
    frame['transDate'] = frame['transaction_date'].str.split('~').str[0].astype(int)

def transYMD(row):
    """Return the transaction date of *row* as a yyyymmdd integer.

    Reads ``row['transaction_year_month']`` and ``row['transDate']``; a
    one-character day is left-padded with '0' before the two parts are joined.
    """
    yearMonth = str(row['transaction_year_month'])
    day = str(row['transDate'])
    day = '0' + day if len(day) == 1 else day
    return int(yearMonth + day)
# yyyymmdd integer for every transaction in both data sets
for frame in (train, test):
    frame['transYMD'] = frame.apply(transYMD, axis=1)

def transOdered(row):
    """Rescale a yyyymmdd value to an ordinal with three slots per month.

    The day part maps '01' -> 1, '11' -> 2 and anything else -> 3; every month
    after January adds 3 and every year after 2006 adds 36.
    """
    text = str(row)
    yearPart, monthPart, dayPart = text[:4], text[4:6], text[6:]
    slot = {'01': 1, '11': 2}.get(dayPart, 3)
    monthOffset = (int(monthPart) - 1) * 3
    yearOffset = (int(yearPart) - 2006) * 36
    return yearOffset + monthOffset + slot

# Ordinal time index derived from the yyyymmdd key, for both data sets
for frame in (train, test):
    frame['transOrdered'] = frame['transYMD'].apply(transOdered)

# Area
# commonArea : shared area = supply area - exclusive-use area
# areaRate   : exclusive-use ratio = exclusive-use area / supply area
#
# location by law
# district / town are characters [2:5] and [5:8] of the legal-address code;
# disTown and cityDisTown are the concatenated codes.
#
# Whole-column operations replace the per-row DataFrame.apply(axis=1) calls,
# which make one Python call per row.
for frame in (train, test):
    frame['commonArea'] = frame['supply_area'] - frame['exclusive_use_area']
    frame['areaRate'] = frame['exclusive_use_area'] / frame['supply_area']

    lawCode = frame['address_by_law'].astype(str)
    frame['district'] = lawCode.str[2:5].astype(int)
    frame['town'] = lawCode.str[5:8].astype(int)
    frame['disTown'] = (frame['district'].astype(str) + frame['town'].astype(str)).astype(int)
    frame['cityDisTown'] = (frame['city'].astype(str) + frame['disTown'].astype(str)).astype(int)

# ETC
col = 'heat_fuel'
def heatFuel(x):
    """Encode a heat-fuel label: 'gas' -> 0, 'cogeneration' -> 1, anything else -> 2."""
    codes = {'gas': 0, 'cogeneration': 1}
    return codes.get(x, 2)
# Replace the heat_fuel labels by their integer codes in both data sets
for frame in (train, test):
    frame[col] = frame[col].apply(heatFuel)

col = 'heat_type'
def heaType(x):
    """Encode a heat-type label: 'individual' -> 0, 'district' -> 1, 'central' -> 2, else 3."""
    codes = {'individual': 0, 'district': 1, 'central': 2}
    return codes.get(x, 3)
# Replace the heat_type labels by their integer codes in both data sets
for frame in (train, test):
    frame[col] = frame[col].apply(heaType)

col = 'front_door_structure'
def FDS(x):
    """Encode a front-door-structure label.

    'stairway' -> 0, 'mixed' -> 1, 'corridor' -> 2, anything else -> 3.
    """
    codes = {'stairway': 0, 'mixed': 1, 'corridor': 2}
    return codes.get(x, 3)
# Replace the front_door_structure labels by their integer codes in both data sets
for frame in (train, test):
    frame[col] = frame[col].apply(FDS)

In [3]:
# Feature engineering based on the unit price (real price per unit of supply area).
# Column arithmetic replaces the per-row DataFrame.apply(axis=1).
train['unitPrice'] = train['transaction_real_price'] / train['supply_area']

# Median unit price per city+district+town code, as a plain lookup dict
groupedTowns = train[['cityDisTown', 'unitPrice']].groupby(['cityDisTown']).agg('median')
unitPriceCnvrt = dict(zip(groupedTowns['unitPrice'].index, groupedTowns['unitPrice'].values))
def townUnitPrice(x):
    """Return the value stored for key *x* in the global ``unitPriceCnvrt``; NaN when absent."""
    return unitPriceCnvrt.get(x, np.nan)
# Attach the town-level median unit price to both data sets
for frame in (train, test):
    frame['townUnitPrice'] = frame['cityDisTown'].apply(townUnitPrice)
# Median unit price per apartment complex, keyed by apartment_id
groupedUnitPrice = train[['unitPrice', 'apartment_id']].groupby(['apartment_id']).agg('median')
apartUnitPriceCnvrt = dict(
    zip(groupedUnitPrice['unitPrice'].index, groupedUnitPrice['unitPrice'].values)
)
def apartUnitPrice(x):
    """Return the value stored for key *x* in the global ``apartUnitPriceCnvrt``; NaN when absent."""
    return apartUnitPriceCnvrt.get(x, np.nan)
# Attach the per-complex median unit price to both data sets
for frame in (train, test):
    frame['apartUnitPrice'] = frame['apartment_id'].apply(apartUnitPrice)

In [24]:
# Subways

# Per station: list of lines, number of lines and a city flag
subway['lines'] = subway['subway_line'].apply(lambda text: text.split(','))
subway['noLine'] = subway['lines'].apply(len)  # a transfer station counts once per line
# city is 0 when the first listed line starts with 'B', otherwise 1
subway['city'] = subway['lines'].apply(lambda lines: 1 if lines[0][0] != 'B' else 0)
def disTown(x):
    """Return characters [2:8] of ``x['address_by_law']`` as an int, or NaN when not numeric."""
    code = str(x['address_by_law'])[2:8]
    try:
        return int(code)
    except ValueError:
        return np.nan
subway['disTown'] = subway.apply(disTown, axis=1)  # district+town code per station
def cityDisTown(x):
    """Return the city flag followed by the district+town code as one int.

    Reads ``x['disTown']`` and ``x['city']``; returns NaN when the district+town
    code cannot be converted to an int (ValueError, e.g. for NaN).
    """
    try:
        townCode = int(x['disTown'])
    except ValueError:
        return np.nan
    return int(str(x['city']) + str(townCode))
subway['cityDisTown'] = subway.apply(cityDisTown, axis=1)  # city+district+town code per station

# Distances - 1 degree of latitude : 110,000 m, 1 degree of longitude : 88,740 m
# Apartments are collected from train AND test: the results are looked up by
# apartment_id for both data sets, so a complex that occurs only in train would
# otherwise raise KeyError in those lookups.
locCols = ['apartment_id', 'latitude', 'longitude']
apartLoc = pd.concat([train[locCols], test[locCols]]).groupby(['apartment_id']).agg('mean')
apartIdx = apartLoc.index.tolist()
apartLoc = apartLoc.values
statLoc = subway[['station_id', 'latitude', 'longitude']].groupby(['station_id']).agg('mean')
statIdx = statLoc.index.tolist()
statLoc = statLoc.values
station = dict(zip(statIdx, statLoc))
apartment = dict(zip(apartIdx, apartLoc))
# apartment -> {station: distance in metres}. Each apartment is compared with all
# stations in one NumPy expression instead of a nested Python loop.
statLat, statLon = statLoc[:, 0], statLoc[:, 1]
subDist = {}
for a, (lat, lon) in zip(apartIdx, apartLoc):
    width = np.abs(statLat - lat) * 110000
    height = np.abs(statLon - lon) * 88740
    subDist[a] = dict(zip(statIdx, np.sqrt(width ** 2 + height ** 2).tolist()))
# ### Count subway lines within 800 m (the 200 m / 500 m variants named in the
# ### original heading are not computed here)
# station_id -> list of lines. set_index relabels the rows; pd.Series(series,
# index=ids) would instead re-align the old row labels against the ids and pair
# stations with the wrong line lists (or NaN).
subLines = subway.drop_duplicates('station_id').set_index('station_id')['lines']
# Plain dict of line counts so the inner loop avoids a Series lookup per pair
lineCount = {s: len(lines) for s, lines in subLines.items() if isinstance(lines, list)}
sub800 = {
    a: sum(lineCount.get(s, 0) for s, dist in stations.items() if dist <= 800)
    for a, stations in subDist.items()
}
### Insert into the data
train['sub800'] = train['apartment_id'].apply(lambda row : sub800[row])
test['sub800'] = test['apartment_id'].apply(lambda row : sub800[row])

# Subway price lookup.
# The dict is queried with each station's 'cityDisTown' code, so it is keyed by
# that code; grouping by apartment_id produced keys the station codes cannot match.
groupedTowns = train[['cityDisTown', 'townUnitPrice']].groupby(['cityDisTown']).agg('median')
unitPriceCnvrt = dict(zip(groupedTowns['townUnitPrice'].index, groupedTowns['townUnitPrice'].values))
def subwayUnitPrice(x):
    """Return ``unitPriceCnvrt[x]``; NaN when *x* is missing from the dict or unhashable."""
    try:
        price = unitPriceCnvrt[x]
    except (KeyError, TypeError):
        price = np.nan
    return price
subway['unitPrice'] = subway['cityDisTown'].apply(subwayUnitPrice)

## Price per apartment derived from nearby stations (town median unit price)

subPrice = dict(zip(subway['station_id'].tolist(), subway['unitPrice'].tolist()))


def _priceIfNear(s, dist):
    """Price of station *s* when it is within 800 m, has a line list and a non-NaN price; else 0."""
    usable = dist <= 800 and type(subLines[s]) == list and not np.isnan(subPrice[s])
    return subPrice[s] if usable else 0


# Highest usable station price per apartment; stays 0 when no station qualifies
sub800Price = {
    a: max(_priceIfNear(s, dist) for s, dist in stations.items())
    for a, stations in subDist.items()
}
for frame in (train, test):
    frame['sub800Price'] = frame['apartment_id'].apply(lambda row: sub800Price[row])

In [None]:
# Schools

def classCnvrt(x):
    """Encode a school class: 'elementary' -> 0, 'middle' -> 1, anything else -> 2."""
    return {'elementary': 0, 'middle': 1}.get(x, 2)
school['school_class'] = school['school_class'].apply(classCnvrt)  # labels -> integer codes

def opCnvrt(x):
    """Encode an operation type: 'public' -> 0, 'private' -> 2, anything else -> 1."""
    return {'public': 0, 'private': 2}.get(x, 1)
school['operation_type'] = school['operation_type'].apply(opCnvrt)  # labels -> integer codes

def highCnvrt(x):
    """Encode a high-school type; returns None for any label not listed.

    'general' -> 0, 'specialized' -> 1, 'autonomous' -> 2, 'objective' -> 3.
    """
    codes = {'general': 0, 'specialized': 1, 'autonomous': 2, 'objective': 3}
    return codes.get(x)
school['highschool_type'] = school['highschool_type'].apply(highCnvrt)

# True for every school whose gender label is not 'both'
school['onlySex'] = school['gender'].ne('both')
def genderCnvrt(x):
    """Encode a gender label: 'both' -> 2, 'male' -> 1, anything else -> 0."""
    return {'both': 2, 'male': 1}.get(x, 0)
school['gender'] = school['gender'].apply(genderCnvrt)


def _schoolTownCode(row):
    """City flag followed by characters [2:8] of the legal-address code, as one int."""
    return int(str(row['city']) + str(row['address_by_law'])[2:8])


# City flag from the latitude: 1 above 36.5 degrees, otherwise 0
school['city'] = school['latitude'].apply(lambda lat: int(lat > 36.5))
school['cityDisTown'] = school.apply(_schoolTownCode, axis=1)

# School distances (1 degree of latitude : 110,000 m, 1 degree of longitude : 88,740 m)
schooLoc = school[['school_code', 'latitude', 'longitude']].groupby(['school_code']).agg('mean')
schoolIdx = schooLoc.index.tolist()
schoolXY = schooLoc.values
schooLoc = dict(zip(schoolIdx, schoolXY))
# Apartments are collected from train AND test: the school counts are looked up
# by apartment_id for both data sets, so a complex that occurs only in train
# would otherwise raise KeyError in those lookups.
locCols = ['apartment_id', 'latitude', 'longitude']
apartLoc = pd.concat([train[locCols], test[locCols]]).groupby(['apartment_id']).agg('mean')
apartIdx = apartLoc.index.tolist()
apartLoc = apartLoc.values
apartment = dict(zip(apartIdx, apartLoc))
# apartment -> {school: distance in metres}. Each apartment is compared with all
# schools in one NumPy expression instead of a nested Python loop.
schoolLat, schoolLon = schoolXY[:, 0], schoolXY[:, 1]
schDist = {}
for a, (lat, lon) in zip(apartIdx, apartLoc):
    width = np.abs(schoolLat - lat) * 110000
    height = np.abs(schoolLon - lon) * 88740
    schDist[a] = dict(zip(schoolIdx, np.sqrt(width ** 2 + height ** 2).tolist()))
        
# school_code -> encoded school_class
schoolCode = dict(zip(school['school_code'].tolist(), school['school_class'].tolist()))
    
def _schoolsWithin800(apartId, schoolClass):
    """Count schools with class code *schoolClass* at most 800 m from apartment *apartId*."""
    return sum(
        1
        for sch, dist in schDist[apartId].items()
        if dist <= 800 and schoolCode[sch] == schoolClass
    )


def elementSchool(x):
    """Number of schools with class code 0 within 800 m of apartment *x*."""
    return _schoolsWithin800(x, 0)


def middleSchool(x):
    """Number of schools with class code 1 within 800 m of apartment *x*."""
    return _schoolsWithin800(x, 1)


def highSchool(x):
    """Number of schools with class code 2 within 800 m of apartment *x*."""
    return _schoolsWithin800(x, 2)
# School counts within 800 m, per school class, for both data sets
schoolCounters = (
    ('elementSchool800', elementSchool),
    ('middleSchool800', middleSchool),
    ('highSchool800', highSchool),
)
for frame in (train, test):
    for name, counter in schoolCounters:
        frame[name] = frame['apartment_id'].apply(counter)

## 학교 가치구하기 pass

## 여기서부터는 lgb를 활용한 null 채워넣는 실험
1. 일단 쌩으로 lgb를 돌린다
2. importance(gain)가 10 미만인 컬럼들을 제외하고 lgb를 다시 돌린다.
3. [2]의 모델로 null값 추정

In [3]:
# Null1
def coList(col=None):
    """Return the feature columns and the categorical subset used for imputation.

    Parameters
    ----------
    col : str, optional
        Column to leave out (typically the imputation target). A name that is
        not in the lists is ignored instead of raising ValueError.

    Returns
    -------
    (cols, cats) : tuple of list
        Fresh lists on every call, so callers may mutate them freely.
    """
    cols = [
     'apartment_id','year_of_completion', 'exclusive_use_area', 'floor',
       'address_by_law','total_parking_capacity_in_site', 'total_household_count_in_sites',
       'apartment_building_count_in_sites', 'tallest_building_in_sites',
       'lowest_building_in_sites', 'heat_type', 'heat_fuel', 'room_id',
       'supply_area', 'total_household_count_of_area_type', 'room_count',
       'bathroom_count', 'front_door_structure', 'transYear', 'transMonth',
       'transOrdered', 'commonArea', 'areaRate',
       'cityDisTown', 'sub800',
       'elementSchool800', 'middleSchool800', 'highSchool800',
        'logTownUnitPrice', 'logApartUnitPrice', 'logSub800Price',
    ]
    cats = ['apartment_id','address_by_law','heat_type', 'heat_fuel', 'room_id','front_door_structure',
           'transMonth', 'cityDisTown',]
    if col is not None:
        # Filtering (rather than list.remove) tolerates targets that are not features
        cols = [c for c in cols if c != col]
        cats = [c for c in cats if c != col]
    return cols, cats
    
cols, cats = coList()
# Stack train on top of test, keeping each frame's own index labels.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
data = pd.concat([train[cols], test[cols]])

In [5]:
%%time
# Null2
def trainLgb(col, model=None):
    """Fill the missing values of ``col`` in ``train`` and ``test`` with LightGBM predictions.

    A first model is trained on the rows of the global ``data`` where ``col`` is
    present. Features whose gain importance is below 10 are dropped and a second
    model is trained on the rest; that model predicts the missing rows.

    Parameters
    ----------
    col : str
        Column to impute; used as the label and left out of the features.
    model : str, optional
        When given, the final booster is saved to ``model + '.txt'``.

    Reads the module-level ``data``, ``params``, ``train``, ``test`` and
    ``coList``; ``train`` and ``test`` are modified in place.
    """
    cols, cats = coList(col)
    trainSet = data[~pd.isna(data[col])]
    label = trainSet[col].values
    lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
    bst = lgb.train(params, lgbSet)

    # Keep only the features the first model found useful (gain >= 10)
    cols = [c for c, gain in zip(cols, bst.feature_importance('gain')) if gain >= 10]
    cats = [c for c in cats if c in cols]

    lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
    bst = lgb.train(params, lgbSet)
    if model is not None : bst.save_model(model+'.txt')

    # One .loc assignment per frame replaces the element-wise chained assignment
    # `frame[col][v] = ...`, which is slow and may write to a temporary copy.
    for frame in (train, test):
        missing = pd.isna(frame[col])
        if missing.any():  # nothing to predict when the column is complete
            frame.loc[missing, col] = bst.predict(frame.loc[missing, cols].values)

def applyRound(x):
    """Truncate *x* to an int, adding 1 when the dropped part exceeds 0.5 (exactly .5 is not bumped)."""
    whole = int(x)
    return whole + 1 if x - whole > 0.5 else whole
    
# LightGBM parameters read by trainLgb: regression objective, L2 metric, 3 threads
params = dict(
    objective='regression',
    metric='l2',
    num_threads=3,
)

Wall time: 0 ns


In [6]:
# Impute room_count, then convert every value of the column to an int with applyRound
col = 'room_count'
trainLgb(col)
for frame in (train, test):
    frame[col] = frame[col].apply(applyRound)

In [7]:
# Impute bathroom_count, then convert every value of the column to an int with applyRound
col = 'bathroom_count'
trainLgb(col)
for frame in (train, test):
    frame[col] = frame[col].apply(applyRound)

In [None]:
# Impute total_parking_capacity_in_site, then convert every value to an int with applyRound
col = 'total_parking_capacity_in_site'
trainLgb(col)
for frame in (train, test):
    frame[col] = frame[col].apply(applyRound)

In [None]:
# Impute tallest_building_in_sites, then convert every value to an int with applyRound
col = 'tallest_building_in_sites'
trainLgb(col)
for frame in (train, test):
    frame[col] = frame[col].apply(applyRound)

In [None]:
# Impute front_door_structure with a 3-class LightGBM model; code 3 marks the
# rows to fill and is excluded from the training rows.
col = 'front_door_structure'
params = {
    'objective':'multiclass',
    'num_class':3,
    'num_threads':3,
}
cols, cats = coList(col)
trainSet = data[data[col]!=3]
label = trainSet[col].values
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)
# Second pass on the features whose gain importance is at least 10
cols = [c for c, gain in zip(cols, bst.feature_importance('gain')) if gain >= 10]
cats = [c for c in cats if c in cols]
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)
# Write the most probable class into the code-3 rows. One .loc assignment per
# frame replaces the element-wise chained assignment `frame[col][v] = ...`,
# which is slow and may write to a temporary copy.
for frame in (train, test):
    unknown = frame[col] == 3
    if unknown.any():
        predicted = bst.predict(frame.loc[unknown, cols].values)
        frame.loc[unknown, col] = np.argmax(predicted, axis=1)

In [None]:
# Impute heat_type with a 3-class LightGBM model; code 3 marks the rows to fill
# and is excluded from the training rows.
col = 'heat_type'
params = {
    'objective':'multiclass',
    'num_class':3,
    'num_threads':3,
}
cols, cats = coList(col)
trainSet = data[data[col]!=3]
label = trainSet[col].values
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)
# Second pass on the features whose gain importance is at least 10
cols = [c for c, gain in zip(cols, bst.feature_importance('gain')) if gain >= 10]
cats = [c for c in cats if c in cols]
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)
# Write the most probable class into the code-3 rows. One .loc assignment per
# frame replaces the element-wise chained assignment `frame[col][v] = ...`,
# which is slow and may write to a temporary copy.
for frame in (train, test):
    unknown = frame[col] == 3
    if unknown.any():
        predicted = bst.predict(frame.loc[unknown, cols].values)
        frame.loc[unknown, col] = np.argmax(predicted, axis=1)

In [None]:
# Impute heat_fuel with a binary LightGBM model; code 2 marks the rows to fill
# and is excluded from the training rows.
col = 'heat_fuel'
params = {
    'objective':'binary',
    'num_threads':3,
}
cols, cats = coList(col)
trainSet = data[data[col]!=2]
label = trainSet[col].values
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)
# Second pass on the features whose gain importance is at least 10
cols = [c for c, gain in zip(cols, bst.feature_importance('gain')) if gain >= 10]
cats = [c for c in cats if c in cols]
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)
# Write 1 where the prediction exceeds 0.5, else 0, into the code-2 rows. One
# .loc assignment per frame replaces the element-wise chained assignment
# `frame[col][v] = ...`, which is slow and may write to a temporary copy.
for frame in (train, test):
    unknown = frame[col] == 2
    if unknown.any():
        predicted = bst.predict(frame.loc[unknown, cols].values)
        frame.loc[unknown, col] = (predicted > 0.5).astype(int)

In [None]:
%%time
# First-pass model for logApartUnitPrice; apartment_id is withheld from the features
col = 'logApartUnitPrice'
params = dict(num_threads=3)
cols, cats = coList(col)
for names in (cols, cats):
    names.remove('apartment_id')
trainSet = data[data[col].notna()]
label = trainSet[col].values
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)

In [None]:
%%time
# Drop the features whose gain importance is below 10, then retrain on the rest
gains = bst.feature_importance('gain')
cols = [name for name, gain in zip(cols, gains) if not gain < 10]
cats = list(set(cols) & set(cats))
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)

In [None]:
# Write the predictions into the rows of train where `col` is missing. A single
# .loc assignment replaces the element-wise chained assignment
# `train[col][v] = ...`, which is slow and may write to a temporary copy.
missing = pd.isna(train[col])
if missing.any():  # nothing to predict when the column is complete
    train.loc[missing, col] = bst.predict(train.loc[missing, cols].values)

In [None]:
# Write the predictions into the rows of test where `col` is missing. A single
# .loc assignment replaces the element-wise chained assignment
# `test[col][v] = ...`, which is slow and may write to a temporary copy.
missing = pd.isna(test[col])
if missing.any():  # nothing to predict when the column is complete
    test.loc[missing, col] = bst.predict(test.loc[missing, cols].values)

In [None]:
%%time
# Impute townUnitPrice with a two-pass LightGBM model.
# NOTE(review): `data` must contain the `col` column when this cell runs - confirm.
col = 'townUnitPrice'
params = {
    'num_threads':3,
}
# Start from the full feature list and filter it: 'townUnitPrice' is not one of
# the listed features, so removing it with list.remove would raise ValueError.
# Left out: the target, cityDisTown, and logTownUnitPrice (the log of the target,
# which is missing exactly where the target is missing).
cols, cats = coList()
excluded = {col, 'cityDisTown', 'logTownUnitPrice'}
cols = [c for c in cols if c not in excluded]
cats = [c for c in cats if c not in excluded]
trainSet = data[~pd.isna(data[col])]
label = trainSet[col].values
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)
# Second pass on the features whose gain importance is at least 10
cols = [c for c, gain in zip(cols, bst.feature_importance('gain')) if gain >= 10]
cats = [c for c in cats if c in cols]
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)
# One .loc assignment per frame replaces the element-wise chained assignment
# `frame[col][v] = ...`, which is slow and may write to a temporary copy.
for frame in (train, test):
    missing = pd.isna(frame[col])
    if missing.any():
        frame.loc[missing, col] = bst.predict(frame.loc[missing, cols].values)

In [None]:
%%time
# Impute sub800Price with a two-pass LightGBM model.
# NOTE(review): `data` must contain the `col` column when this cell runs - confirm.
# NOTE(review): sub800Price is built with 0 (not NaN) for apartments without a
# usable station, so the NaN masks below may select no rows - confirm intent.
col = 'sub800Price'
params = {
    'num_threads':3,
}
# Start from the full feature list and filter it: 'sub800Price' is not one of
# the listed features, so removing it with list.remove would raise ValueError.
# Left out: the target and logSub800Price (the log of the target).
cols, cats = coList()
excluded = {col, 'logSub800Price'}
cols = [c for c in cols if c not in excluded]
cats = [c for c in cats if c not in excluded]
trainSet = data[~pd.isna(data[col])]
label = trainSet[col].values
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)
# Second pass on the features whose gain importance is at least 10
cols = [c for c, gain in zip(cols, bst.feature_importance('gain')) if gain >= 10]
cats = [c for c in cats if c in cols]
lgbSet = lgb.Dataset(trainSet[cols], label=label, categorical_feature=cats)
bst = lgb.train(params, lgbSet)
# One .loc assignment per frame replaces the element-wise chained assignment
# `frame[col][v] = ...`, which is slow and may write to a temporary copy.
for frame in (train, test):
    missing = pd.isna(frame[col])
    if missing.any():
        frame.loc[missing, col] = bst.predict(frame.loc[missing, cols].values)

## null값을 채운 뒤 새로운 특성을 만들어 보자

In [25]:
# Natural log of the three price features, for both data sets.
# NOTE(review): sub800Price is 0 for apartments without a usable station, and
# np.log(0) is -inf - confirm this is acceptable downstream.
logFeatures = (
    ('logTownUnitPrice', 'townUnitPrice'),
    ('logApartUnitPrice', 'apartUnitPrice'),
    ('logSub800Price', 'sub800Price'),
)
for frame in (train, test):
    for target, source in logFeatures:
        frame[target] = np.log(frame[source])

In [None]:
# Rank the apartment complexes by median unit price and use the rank as an integer category.
# pd.concat replaces DataFrame.append, which pandas 2.0 removed.
data = pd.concat([train, test])
groupedUnitPrice = data[['unitPrice', 'apartment_id']].groupby(['apartment_id']).agg('median')
apartUnitPriceCnvrt = dict(
    zip(groupedUnitPrice['unitPrice'].index, groupedUnitPrice['unitPrice'].values)
)

# Order the complexes by their median price. The previous sort key read
# uniqUnitPriceCnvrt before it was defined (NameError). sort_values also puts
# NaN medians last, where sorted() has no defined order for NaN keys.
sortedApart = groupedUnitPrice['unitPrice'].sort_values(kind='mergesort').index.tolist()
uniqUnitPriceCnvrt = {apartId: rank for rank, apartId in enumerate(sortedApart)}

train['apart'] = train['apartment_id'].apply(lambda row : uniqUnitPriceCnvrt[row])
test['apart'] = test['apartment_id'].apply(lambda row : uniqUnitPriceCnvrt[row])