# 1. error_data_preprocessing

- 대회 데이터 셋 오류로 인한 데이터 전처리
  - https://dacon.io/competitions/official/235745/talkboard/403708?page=1&dtype=recent

In [1]:
import pandas as pd
import numpy as np
import math
import time
import os
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [2]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

train.shape, test.shape

((2952, 15), (1022, 14))

In [3]:
train.columns

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
       '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수'],
      dtype='object')

In [4]:
train.columns = ['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '10분내지하철수',
       '10분내버스정류장수', '단지내주차면수', '등록차량수']

test.columns = ['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '10분내지하철수',
       '10분내버스정류장수', '단지내주차면수']

### 데이터 오류로 인한 데이터 제외
- 테스트셋에서 평가 제외되는 데이터는 'C2675'(2번 사항에 해당), 'C2335', 'C1327'(3번 사항에 해당) 3개 단지입니다.

In [5]:
train.isna().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            0
임대보증금         569
임대료           569
10분내지하철수      211
10분내버스정류장수      4
단지내주차면수         0
등록차량수           0
dtype: int64

In [6]:
train.단지코드.unique()

array(['C2483', 'C2515', 'C1407', 'C1945', 'C1470', 'C1898', 'C1244',
       'C1171', 'C2073', 'C2513', 'C1936', 'C2049', 'C2202', 'C1925',
       'C2576', 'C1312', 'C1874', 'C2650', 'C2416', 'C2013', 'C1424',
       'C2100', 'C2621', 'C2520', 'C2319', 'C1616', 'C1704', 'C2258',
       'C1032', 'C2038', 'C1859', 'C1722', 'C1850', 'C2190', 'C1476',
       'C1077', 'C1068', 'C1983', 'C2135', 'C2034', 'C1109', 'C1497',
       'C2289', 'C2597', 'C2310', 'C1672', 'C2132', 'C1439', 'C1613',
       'C2216', 'C1899', 'C1056', 'C2644', 'C1206', 'C2481', 'C1718',
       'C1655', 'C1430', 'C1775', 'C1519', 'C2221', 'C1790', 'C2109',
       'C1698', 'C1866', 'C1005', 'C1004', 'C1875', 'C2156', 'C2212',
       'C2401', 'C2571', 'C1175', 'C1833', 'C2445', 'C1885', 'C2368',
       'C2016', 'C2371', 'C2536', 'C2538', 'C1014', 'C1592', 'C1867',
       'C2326', 'C1015', 'C1620', 'C1049', 'C2000', 'C2097', 'C1668',
       'C1689', 'C1234', 'C2514', 'C1368', 'C1057', 'C2336', 'C1026',
       'C2256', 'C19

In [7]:
# 우선 train 데이터 셋 확인
train.loc[ ((train['단지코드']=='C2675') | 
           (train['단지코드']=='C2335') |
           (train['단지코드']=='C1327') ) , :]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,등록차량수


In [8]:
# 테스트 데이터 셋 확인
test.loc[ ((test['단지코드']=='C2675') | 
           (test['단지코드']=='C2335') |
           (test['단지코드']=='C1327') ) , :].head(3)

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
579,C2675,512,아파트,경기도,국민임대,36.65,130,9.0,A,18476000,154790,0.0,3.0,1016.0
580,C2675,512,아파트,경기도,국민임대,46.9,44,9.0,A,34082000,232200,0.0,3.0,1016.0
581,C2675,512,아파트,경기도,국민임대,46.9,80,9.0,A,34082000,232200,0.0,3.0,1016.0


In [9]:
# 테스트 데이터 셋에서 세개의 코드 데이터를 없애기
test = test.loc[ ~((test['단지코드']=='C2675') | 
           (test['단지코드']=='C2335') |
           (test['단지코드']=='C1327') ) , :]
test.head()

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
0,C1072,754,아파트,경기도,국민임대,39.79,116,14.0,H,22830000,189840,0.0,2.0,683.0
1,C1072,754,아파트,경기도,국민임대,46.81,30,14.0,A,36048000,249930,0.0,2.0,683.0
2,C1072,754,아파트,경기도,국민임대,46.9,112,14.0,H,36048000,249930,0.0,2.0,683.0
3,C1072,754,아파트,경기도,국민임대,46.9,120,14.0,H,36048000,249930,0.0,2.0,683.0
4,C1072,754,아파트,경기도,국민임대,51.46,60,14.0,H,43497000,296780,0.0,2.0,683.0


In [10]:
# 확인
test.loc[ ((test['단지코드']=='C2675') | 
           (test['단지코드']=='C2335') |
           (test['단지코드']=='C1327') ) , :]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수


### 오류 데이터 처리
- ※ 동일한 단지에 코드가 2개로 부여된 단지 코드 (3쌍) : ['C2085', 'C1397'], ['C2431', 'C1649'], ['C1036', 'C2675']
    - (참고 사항) 주차면수는 하나의 단지임을 전제로 산정된 것이고 총세대수는 두 개 단지의 합계입니다. 다만 등록차량대수는 ['C2085', 'C1397'] 단지의 경우 동일 수치

In [11]:
train.loc[ train['단지코드']=='C2085',  "총세대수" ] = 1339
train.loc[ train['단지코드']=='C1397',  "총세대수" ] = 1339

In [12]:
# 단지코드를 C2085,C1397 => N2085로 변경
print( train.loc[ train['단지코드']=='C2085', : ].shape  )
print( train.loc[ train['단지코드']=='C1397', : ].shape  )

(8, 15)
(6, 15)


In [13]:
# 변경 후, 처리 후, 단지코드를 N을 붙여 N2085로 변경
train.loc[ train['단지코드']=='C2085',  "단지코드" ] = 'N2085'
train.loc[ train['단지코드']=='C1397',  "단지코드" ] = 'N2085'

In [14]:
train.loc[ train['단지코드']=='N2085', : ].shape

(14, 15)

### 오류 코드 변경
- C2431, C1649의 총세대수를 1047로 변경
- C2431, C1649의 등록차량대수를 1214로 변경
- C2431, C1649의 단지코드를 N2431로 변경

In [15]:
a = train.loc[ train['단지코드']=='C2431', : ]
b = train.loc[ train['단지코드']=='C1649', : ]

print(  a.shape, b.shape )
print( a['총세대수'], b['총세대수'])
print( a['등록차량수'], b['등록차량수'])

(2, 15) (4, 15)
2372    472
2373    472
Name: 총세대수, dtype: int64 2315    575
2316    575
2317    575
2318    575
Name: 총세대수, dtype: int64
2372    359.0
2373    359.0
Name: 등록차량수, dtype: float64 2315    855.0
2316    855.0
2317    855.0
2318    855.0
Name: 등록차량수, dtype: float64


In [16]:
train.loc[ train['단지코드']=='C2431',  "총세대수" ] = 1047
train.loc[ train['단지코드']=='C1649',  "총세대수" ] = 1047

train.loc[ train['단지코드']=='C2431',  "등록차량수" ] = 1214
train.loc[ train['단지코드']=='C1649',  "등록차량수" ] = 1214

train.loc[ train['단지코드']=='C2431',  "단지코드" ] = 'N2431'
train.loc[ train['단지코드']=='C1649',  "단지코드" ] = 'N2431'

In [17]:
train.loc[ train['단지코드']=='N2431', : ].shape

(6, 15)

### 오류 코드 변경
- C1036의 총세대수를 1243로 변경
- C1036의 단지코드를 N1036로 변경

In [18]:
a = train.loc[ train['단지코드']=='C2675', : ]
b = train.loc[ train['단지코드']=='C1036', : ]
a.shape, b.shape

((0, 15), (7, 15))

In [19]:
train.loc[ train['단지코드']=='C1036',  "총세대수" ] = 1243
train.loc[ train['단지코드']=='C1036',  "단지코드" ] = 'N1036'

In [20]:
train.loc[ train['단지코드']=='N1036', : ].shape

(7, 15)

### 오류 3
3. 단지코드 등 기입 실수로 데이터 정제 과정에서 매칭 오류 발생  
 - (오류 내용) 단지코드 등 기입 실수로 총세대수가 주차면수에 비해 과하게 많거나 적은 경우가 발생하였고, 점검 결과 일부 데이터의 단지코드, 총세대수, 주차면수 등에서 오류가 검출되었습니다.
 - (발생 원인) 원천데이터 수집 과정에서 단지 코드 등이 잘못 기입되었고 이를 인지하지 못한 채 데이터 정제를 하여 오류가 발생하였습니다.
 - (관련 데이터) 아래와 같이 총 9개 단지에서 같은 문제가 확인되었습니다. 
※ 실수가 발생한 단지 코드 (9개 단지) : ['C2335', 'C1327', 'C1095', 'C2051', 'C1218', 'C1894', 'C2483', 'C1502', 'C1988']
 - C2335, C1327 단지는 테스트셋, 나머지는 트레인셋 입니다.

### 오류 처리
- train 데이터 셋에 오류 발생 코드를 ERR04로 변경 후, 데이터 셋을 두개로 분리

In [21]:
train.loc[ train['단지코드'].str.contains('ERR'), :].shape

(0, 15)

In [22]:
train.loc[ train['단지코드'].str.contains('ERR'), :]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,등록차량수


In [23]:
train_df = train.copy()
train_df_errno = train.loc[ ~train['단지코드'].str.contains('ERR'), :]
test_df = test.copy()

In [24]:
train_df_errno.reset_index(inplace=True)

In [25]:
train_df.shape, train_df_errno.shape, test_df.shape

((2952, 15), (2952, 16), (1008, 14))

### 오류 1 전용면적별 세대의 합계와 총세대수가 일치하지 않는 오류
- 차이가 14세대 이하인 48개 단지 - ['C1925', 'C1312', 'C2013', 'C1424', 'C2520', 'C2319', 'C1850', 'C1068', 'C2644', 'C2156', C2453', 'C1910', 'C2139', 'C2508', 'C1695', 'C2556', 'C2362', 'C2568', 'C2245', 'C2549', 'C1584', 'C2298', 'C2225', 'C1218', 'C1970', C1732', 'C2433', 'C1894', 'C1156', 'C2142', 'C2186', 'C2411', 'C1812', 'C1030', 'C1749', 'C1349', 'C2043', 'C1229', 'C2363', 'C1414', C2174', 'C2404', 'C1683', 'C1038', 'C2456', 'C1266', 'C1267', 'C2189']
- 차이가 94~452세대인 10개 단지(크기순) - ['C1490', 'C2497', 'C2620', 'C1344', 'C1024', 'C2470', 'C1206', 'C1740', 'C2405', 'C1804’]

In [26]:
# 데이터 병합
all_df = pd.concat([train_df_errno, test_df], join='inner', ignore_index=True)
all_df

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수
0,C2483,900,아파트,경상북도,국민임대,39.72,134,38.0,A,15667000,103680,0.0,3.0,1425.0
1,C2483,900,아파트,경상북도,국민임대,39.72,15,38.0,A,15667000,103680,0.0,3.0,1425.0
2,C2483,900,아파트,경상북도,국민임대,51.93,385,38.0,A,27304000,184330,0.0,3.0,1425.0
3,C2483,900,아파트,경상북도,국민임대,51.93,15,38.0,A,27304000,184330,0.0,3.0,1425.0
4,C2483,900,아파트,경상북도,국민임대,51.93,41,38.0,A,27304000,184330,0.0,3.0,1425.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,C1267,675,아파트,경상남도,행복주택,36.77,126,38.0,L,-,-,0.0,1.0,467.0
3956,C2189,382,아파트,전라북도,국민임대,29.19,96,45.0,H,6872000,106400,0.0,2.0,300.0
3957,C2189,382,아파트,전라북도,국민임대,29.19,20,45.0,H,6872000,106400,0.0,2.0,300.0
3958,C2189,382,아파트,전라북도,국민임대,39.45,202,45.0,H,13410000,144600,0.0,2.0,300.0


In [27]:
# 오류1 14세대 이하 처리
group1 = ['C1925', 'C1312', 'C2013', 'C1424', 'C2520', 'C2319', 'C1850', 'C1068', 'C2644', 'C2156', 
          'C2453', 'C1910', 'C2139', 'C2508', 'C1695', 'C2556', 'C2362', 'C2568', 'C2245', 'C2549', 
          'C1584', 'C2298', 'C2225', 'C1218', 'C1970', 'C1732', 'C2433', 'C1894', 'C1156', 'C2142', 
          'C2186', 'C2411', 'C1812', 'C1030', 'C1749', 'C1349', 'C2043', 'C1229', 'C2363', 'C1414', 
          'C2174', 'C2404', 'C1683', 'C1038', 'C2456', 'C1266', 'C1267', 'C2189' ]

for one in group1:
    all_df.loc[ all_df['단지코드'] == one, "단지코드_Type" ] = 1

In [28]:
# 오류1 94~452세대 10개 단지 처리
group2 = ['C1490', 'C2497', 'C2620', 'C1344', 'C1024', 'C2470', 'C1206', 'C1740', 'C2405', 'C1804']

for one in group2:
    all_df.loc[ all_df['단지코드'] == one, "단지코드_Type" ] = 2

In [29]:
# 차이가 94~452세대인 10개단지 처리하기
all_df.loc[  all_df['단지코드_Type'].isna(), "단지코드_Type"] = 3
all_df['단지코드_Type'].unique()

array([3., 1., 2.])

### 결측치 처리

In [30]:
all_df.isnull().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            2
임대보증금         749
임대료           749
10분내지하철수      249
10분내버스정류장수      4
단지내주차면수         0
단지코드_Type       0
dtype: int64

In [31]:
all_df.loc[all_df['자격유형'].isnull()]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type
3148,C2411,962,아파트,경상남도,국민임대,46.9,240,25.0,,71950000,37470,0.0,2.0,840.0,1.0
3210,C2253,1161,아파트,강원도,영구임대,26.37,745,0.0,,2249000,44770,0.0,2.0,173.0,3.0


In [32]:
all_df.loc[ 3148, "자격유형"] = 'A'
all_df.loc[ 3210, "자격유형"] = 'C'

In [33]:
all_df.isnull().sum()

단지코드            0
총세대수            0
임대건물구분          0
지역              0
공급유형            0
전용면적            0
전용면적별세대수        0
공가수             0
자격유형            0
임대보증금         749
임대료           749
10분내지하철수      249
10분내버스정류장수      4
단지내주차면수         0
단지코드_Type       0
dtype: int64

In [34]:
# 10분내 버스 정류장수 결측치 처리
grouped = all_df.groupby(['임대건물구분', '지역'])
group1 = grouped.get_group( ('아파트', '경상남도')  )
group1['10분내버스정류장수'].mean()

3.9830028328611897

In [35]:
val = group1['10분내버스정류장수'].mean()
all_df.loc[ all_df['10분내버스정류장수'].isnull(), "10분내버스정류장수"] = val
all_df.loc[all_df['10분내버스정류장수'].isnull()]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type


In [36]:
# 라벨 인코딩 - 문자를 숫자로 변경
gubun1 = {'아파트':1, '상가':2}

gubun2 = {'경상남도':1, '대전광역시':2, '경기도':3, '전라북도':4, 
          '강원도':5, '광주광역시':6, '충청남도':7, '부산광역시':8, 
          '제주특별자치도':9, '울산광역시':10, '충청북도':11, '전라남도':12, 
          '경상북도':13, '대구광역시':14, '서울특별시':15, '세종특별자치시':16}

gubun3 = {'국민임대':1, '공공임대(50년)':2, '영구임대':3, '임대상가':4, 
          '공공임대(10년)':5, '공공임대(분납)':6, '장기전세':7, '공공분양':8, 
          '행복주택':9, '공공임대(5년)':10}

gubun4 = { 'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 
           'F':6, 'G':7, 'H':8, 'I':9, 'J':10, 
           'K':11, 'L':12, 'M':13, 'N':14, 'O':15  }

all_df['임대건물구분_lbl'] = all_df['임대건물구분'].map(gubun1)
all_df['지역_lbl'] = all_df['지역'].map(gubun2)
all_df['공급유형_lbl'] = all_df['공급유형'].map(gubun3)
all_df['자격유형_lbl'] = all_df['자격유형'].map(gubun4).astype(int)

all_df

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl
0,C2483,900,아파트,경상북도,국민임대,39.72,134,38.0,A,15667000,103680,0.0,3.0,1425.0,3.0,1,13,1,1
1,C2483,900,아파트,경상북도,국민임대,39.72,15,38.0,A,15667000,103680,0.0,3.0,1425.0,3.0,1,13,1,1
2,C2483,900,아파트,경상북도,국민임대,51.93,385,38.0,A,27304000,184330,0.0,3.0,1425.0,3.0,1,13,1,1
3,C2483,900,아파트,경상북도,국민임대,51.93,15,38.0,A,27304000,184330,0.0,3.0,1425.0,3.0,1,13,1,1
4,C2483,900,아파트,경상북도,국민임대,51.93,41,38.0,A,27304000,184330,0.0,3.0,1425.0,3.0,1,13,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,C1267,675,아파트,경상남도,행복주택,36.77,126,38.0,L,-,-,0.0,1.0,467.0,1.0,1,1,9,12
3956,C2189,382,아파트,전라북도,국민임대,29.19,96,45.0,H,6872000,106400,0.0,2.0,300.0,1.0,1,4,1,8
3957,C2189,382,아파트,전라북도,국민임대,29.19,20,45.0,H,6872000,106400,0.0,2.0,300.0,1.0,1,4,1,8
3958,C2189,382,아파트,전라북도,국민임대,39.45,202,45.0,H,13410000,144600,0.0,2.0,300.0,1.0,1,4,1,8


In [37]:
# 구간분할 해주기(총세대수에 대해서)
all_df['단지코드'] = all_df['단지코드'].astype("category")
all_df['단지코드_lbl'] = all_df['단지코드'].cat.codes

all_df['qcut_총세대수'] = pd.qcut(all_df['총세대수'], 5, labels=False)
all_df['qcut_총세대수'].unique()

array([2, 1, 3, 4, 0], dtype=int64)

In [38]:
train_df_errno.shape, test_df.shape

((2952, 16), (1008, 14))

In [39]:
train_df_last = all_df.iloc[0:2896,:]
test_df_last = all_df.iloc[2896:,:]

train_df_last.shape, test_df_last.shape

((2896, 21), (1064, 21))

In [40]:
train_df_last = pd.concat([train_df_last, train_df_errno['등록차량수'] ], axis=1)
train_df_last

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수,등록차량수
0,C2483,900.0,아파트,경상북도,국민임대,39.72,134.0,38.0,A,15667000,...,3.0,1425.0,3.0,1.0,13.0,1.0,1.0,484.0,2.0,1015.0
1,C2483,900.0,아파트,경상북도,국민임대,39.72,15.0,38.0,A,15667000,...,3.0,1425.0,3.0,1.0,13.0,1.0,1.0,484.0,2.0,1015.0
2,C2483,900.0,아파트,경상북도,국민임대,51.93,385.0,38.0,A,27304000,...,3.0,1425.0,3.0,1.0,13.0,1.0,1.0,484.0,2.0,1015.0
3,C2483,900.0,아파트,경상북도,국민임대,51.93,15.0,38.0,A,27304000,...,3.0,1425.0,3.0,1.0,13.0,1.0,1.0,484.0,2.0,1015.0
4,C2483,900.0,아파트,경상북도,국민임대,51.93,41.0,38.0,A,27304000,...,3.0,1425.0,3.0,1.0,13.0,1.0,1.0,484.0,2.0,1015.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2947,,,,,,,,,,,...,,,,,,,,,,146.0
2948,,,,,,,,,,,...,,,,,,,,,,146.0
2949,,,,,,,,,,,...,,,,,,,,,,146.0
2950,,,,,,,,,,,...,,,,,,,,,,146.0


In [41]:
# 새로운 특징만들고 corr() 확인
train_df_last['log_등록차량수'] = np.log1p(train_df_last['등록차량수'])

print("등록차량수 상관계수 : ", train_df_last.corr()['등록차량수'])
print()
print("log_등록차량수 상관계수 ; ", train_df_last.corr()['log_등록차량수'])

등록차량수 상관계수 :  총세대수          0.318521
전용면적          0.118112
전용면적별세대수      0.245576
공가수           0.113449
10분내지하철수     -0.108009
10분내버스정류장수    0.094992
단지내주차면수       0.865047
단지코드_Type     0.113972
임대건물구분_lbl   -0.455012
지역_lbl        0.079022
공급유형_lbl     -0.146933
자격유형_lbl     -0.180370
단지코드_lbl     -0.058014
qcut_총세대수     0.393222
등록차량수         1.000000
log_등록차량수     0.881781
Name: 등록차량수, dtype: float64

log_등록차량수 상관계수 ;  총세대수          0.206936
전용면적          0.116937
전용면적별세대수      0.268104
공가수           0.192423
10분내지하철수     -0.125164
10분내버스정류장수    0.067445
단지내주차면수       0.806928
단지코드_Type     0.101020
임대건물구분_lbl   -0.612173
지역_lbl        0.129988
공급유형_lbl     -0.287306
자격유형_lbl     -0.260136
단지코드_lbl     -0.057132
qcut_총세대수     0.288893
등록차량수         0.881781
log_등록차량수     1.000000
Name: log_등록차량수, dtype: float64


### 데이터 오류 처리 후, csv파일을 만들기

In [42]:
train_df.to_csv("./data/train_df.csv", index=False)
train_df_last.to_csv("./data/train_df_errno1.csv", index=False)

test_df_last.to_csv("./data/test_df1.csv", index=False)

# 2. subway_missing_values_preprocessing

In [43]:
train = pd.read_csv("./Data/train_df_errno1.csv")
test = pd.read_csv("./Data/test_df1.csv")

In [44]:
all_df = pd.concat([train, test], join='inner', ignore_index=True)
all_df

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
0,C2483,900.0,아파트,경상북도,국민임대,39.72,134.0,38.0,A,15667000,...,0.0,3.0,1425.0,3.0,1.0,13.0,1.0,1.0,484.0,2.0
1,C2483,900.0,아파트,경상북도,국민임대,39.72,15.0,38.0,A,15667000,...,0.0,3.0,1425.0,3.0,1.0,13.0,1.0,1.0,484.0,2.0
2,C2483,900.0,아파트,경상북도,국민임대,51.93,385.0,38.0,A,27304000,...,0.0,3.0,1425.0,3.0,1.0,13.0,1.0,1.0,484.0,2.0
3,C2483,900.0,아파트,경상북도,국민임대,51.93,15.0,38.0,A,27304000,...,0.0,3.0,1425.0,3.0,1.0,13.0,1.0,1.0,484.0,2.0
4,C2483,900.0,아파트,경상북도,국민임대,51.93,41.0,38.0,A,27304000,...,0.0,3.0,1425.0,3.0,1.0,13.0,1.0,1.0,484.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4011,C1267,675.0,아파트,경상남도,행복주택,36.77,126.0,38.0,L,-,...,0.0,1.0,467.0,1.0,1.0,1.0,9.0,12.0,88.0,2.0
4012,C2189,382.0,아파트,전라북도,국민임대,29.19,96.0,45.0,H,6872000,...,0.0,2.0,300.0,1.0,1.0,4.0,1.0,8.0,387.0,0.0
4013,C2189,382.0,아파트,전라북도,국민임대,29.19,20.0,45.0,H,6872000,...,0.0,2.0,300.0,1.0,1.0,4.0,1.0,8.0,387.0,0.0
4014,C2189,382.0,아파트,전라북도,국민임대,39.45,202.0,45.0,H,13410000,...,0.0,2.0,300.0,1.0,1.0,4.0,1.0,8.0,387.0,0.0


In [45]:
pd.set_option('display.max_rows', 800)  # 중간 생략 없이 800개 행을 다 보여준다.

In [46]:
all_df.loc[all_df['10분내지하철수'].isnull()]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
94,C1312,518.0,아파트,충청남도,국민임대,39.72,60.0,12.0,A,17460000.0,...,,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
95,C1312,518.0,아파트,충청남도,국민임대,39.98,89.0,12.0,A,17460000.0,...,,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
96,C1312,518.0,아파트,충청남도,국민임대,41.55,225.0,12.0,A,19954000.0,...,,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
97,C1312,518.0,아파트,충청남도,국민임대,46.9,143.0,12.0,A,28687000.0,...,,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
98,C1874,619.0,아파트,충청남도,영구임대,26.37,294.0,2.0,C,3141000.0,...,,2.0,97.0,3.0,1.0,7.0,3.0,3.0,281.0,1.0
99,C1874,619.0,아파트,충청남도,영구임대,26.37,149.0,2.0,C,3141000.0,...,,2.0,97.0,3.0,1.0,7.0,3.0,3.0,281.0,1.0
100,C1874,619.0,아파트,충청남도,영구임대,31.32,149.0,2.0,C,3731000.0,...,,2.0,97.0,3.0,1.0,7.0,3.0,3.0,281.0,1.0
101,C1874,619.0,상가,충청남도,임대상가,12.62,1.0,2.0,D,,...,,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
102,C1874,619.0,상가,충청남도,임대상가,17.4,1.0,2.0,D,,...,,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
103,C1874,619.0,상가,충청남도,임대상가,17.4,1.0,2.0,D,,...,,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0


In [47]:
all_df['10분내지하철수'].unique()

array([ 0.,  1., nan,  2.,  3.])

In [48]:
all_df[all_df['10분내지하철수'].isnull()]['지역'].unique()

array(['충청남도', '대전광역시', '경상남도', nan], dtype=object)

In [49]:
all_df['임대건물구분'].unique()

array(['아파트', '상가', nan], dtype=object)

In [50]:
all_df[all_df['10분내지하철수'].isnull()]['공급유형'].unique()

array(['국민임대', '영구임대', '임대상가', '공공임대(50년)', '공공임대(10년)', '공공분양',
       '공공임대(분납)', nan], dtype=object)

In [51]:
grouped = all_df.groupby(all_df.loc[all_df['10분내지하철수'].isnull()]['지역'])
group1 = grouped.get_group('충청남도')
group2 = grouped.get_group('대전광역시')
group3 = grouped.get_group('경상남도')
group1  # 충청남도

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
94,C1312,518.0,아파트,충청남도,국민임대,39.72,60.0,12.0,A,17460000.0,...,,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
95,C1312,518.0,아파트,충청남도,국민임대,39.98,89.0,12.0,A,17460000.0,...,,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
96,C1312,518.0,아파트,충청남도,국민임대,41.55,225.0,12.0,A,19954000.0,...,,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
97,C1312,518.0,아파트,충청남도,국민임대,46.9,143.0,12.0,A,28687000.0,...,,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
98,C1874,619.0,아파트,충청남도,영구임대,26.37,294.0,2.0,C,3141000.0,...,,2.0,97.0,3.0,1.0,7.0,3.0,3.0,281.0,1.0
99,C1874,619.0,아파트,충청남도,영구임대,26.37,149.0,2.0,C,3141000.0,...,,2.0,97.0,3.0,1.0,7.0,3.0,3.0,281.0,1.0
100,C1874,619.0,아파트,충청남도,영구임대,31.32,149.0,2.0,C,3731000.0,...,,2.0,97.0,3.0,1.0,7.0,3.0,3.0,281.0,1.0
101,C1874,619.0,상가,충청남도,임대상가,12.62,1.0,2.0,D,,...,,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
102,C1874,619.0,상가,충청남도,임대상가,17.4,1.0,2.0,D,,...,,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
103,C1874,619.0,상가,충청남도,임대상가,17.4,1.0,2.0,D,,...,,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0


In [52]:
group1['공급유형'].unique()  # 10분내지하철수 값이 NaN인 충청남도 지역 임대건물 단지의 공급유형

array(['국민임대', '영구임대', '임대상가', '공공임대(50년)'], dtype=object)

In [53]:
grouped = group1.groupby(group1['공급유형'])
group11 = grouped.get_group('국민임대')
group12 = grouped.get_group('영구임대')
group13 = grouped.get_group('임대상가')
group14 = grouped.get_group('공공임대(50년)')

In [54]:
all_df[all_df['지역']=='충청남도'][all_df['공급유형']=='국민임대']['단지코드'].unique() # group11 (충청남도,국민임대)

  """Entry point for launching an IPython kernel.


array(['C1312', 'C1068', 'C1005', 'C2156', 'C1175', 'C1173', 'C1537',
       'C1929', 'C2394', 'C1316', 'C2255', 'C2237', 'C2539', 'C1472',
       'C2369'], dtype=object)

In [55]:
codes11 = ['C1312', 'C1068', 'C1005', 'C2156', 'C1175', 'C1173', 'C1537', 'C1929', 'C2394', 'C1316', 
           'C2255', 'C2237', 'C2539', 'C1472', 'C2369']  # 결측치, 미결측치 다 포함된 단지코드

for code in codes11:
    print(code, all_df[all_df['지역']=='충청남도'][all_df['공급유형']=='국민임대'][all_df['단지코드']==code]['10분내지하철수'].mean())

C1312 nan
C1068 nan
C1005 nan
C2156 nan
C1175 nan
C1173 0.0
C1537 0.0
C1929 0.0
C2394 0.0
C1316 0.0
C2255 0.0
C2237 0.0
C2539 0.0
C1472 nan
C2369 0.0


  """


In [56]:
group11['10분내지하철수'] = all_df[all_df['지역']=='충청남도'][all_df['공급유형']=='국민임대']['10분내지하철수'].mean()
group11

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
94,C1312,518.0,아파트,충청남도,국민임대,39.72,60.0,12.0,A,17460000,...,0.0,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
95,C1312,518.0,아파트,충청남도,국민임대,39.98,89.0,12.0,A,17460000,...,0.0,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
96,C1312,518.0,아파트,충청남도,국민임대,41.55,225.0,12.0,A,19954000,...,0.0,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
97,C1312,518.0,아파트,충청남도,국민임대,46.9,143.0,12.0,A,28687000,...,0.0,3.0,527.0,1.0,1.0,7.0,1.0,1.0,103.0,1.0
347,C1068,806.0,아파트,충청남도,국민임대,36.65,200.0,11.0,A,11234000,...,0.0,2.0,804.0,1.0,1.0,7.0,1.0,1.0,30.0,2.0
348,C1068,806.0,아파트,충청남도,국민임대,36.98,130.0,11.0,A,11234000,...,0.0,2.0,804.0,1.0,1.0,7.0,1.0,1.0,30.0,2.0
349,C1068,806.0,아파트,충청남도,국민임대,41.55,252.0,11.0,A,14981000,...,0.0,2.0,804.0,1.0,1.0,7.0,1.0,1.0,30.0,2.0
350,C1068,806.0,아파트,충청남도,국민임대,46.86,44.0,11.0,A,18726000,...,0.0,2.0,804.0,1.0,1.0,7.0,1.0,1.0,30.0,2.0
351,C1068,806.0,아파트,충청남도,국민임대,46.98,88.0,11.0,A,18726000,...,0.0,2.0,804.0,1.0,1.0,7.0,1.0,1.0,30.0,2.0
352,C1068,806.0,아파트,충청남도,국민임대,51.93,91.0,11.0,A,24343000,...,0.0,2.0,804.0,1.0,1.0,7.0,1.0,1.0,30.0,2.0


In [57]:
all_df.loc[all_df['지역']=='충청남도'][all_df['공급유형']=='영구임대']['단지코드'].unique() # group12(충청남도,영구임대)

  """Entry point for launching an IPython kernel.


array(['C1874', 'C1983', 'C2644', 'C1004', 'C1875', 'C2255', 'C2539',
       'C2177'], dtype=object)

In [58]:
codes12 = ['C1874', 'C1983', 'C2644', 'C1004', 'C1875', 'C2255', 'C2539', 'C2177']

for code in codes12:
    print(code, all_df[all_df['지역']=='충청남도'][all_df['공급유형']=='영구임대'][all_df['단지코드']==code]['10분내지하철수'].mean())

C1874 nan
C1983 nan
C2644 nan
C1004 nan
C1875 nan
C2255 0.0
C2539 0.0
C2177 nan


  after removing the cwd from sys.path.


In [59]:
group12['10분내지하철수'] = all_df[all_df['지역']=='충청남도'][all_df['공급유형']=='영구임대']['10분내지하철수'].mean()
group12

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
98,C1874,619.0,아파트,충청남도,영구임대,26.37,294.0,2.0,C,3141000,...,0.0,2.0,97.0,3.0,1.0,7.0,3.0,3.0,281.0,1.0
99,C1874,619.0,아파트,충청남도,영구임대,26.37,149.0,2.0,C,3141000,...,0.0,2.0,97.0,3.0,1.0,7.0,3.0,3.0,281.0,1.0
100,C1874,619.0,아파트,충청남도,영구임대,31.32,149.0,2.0,C,3731000,...,0.0,2.0,97.0,3.0,1.0,7.0,3.0,3.0,281.0,1.0
356,C1983,903.0,아파트,충청남도,영구임대,26.37,478.0,3.0,C,8083000,...,0.0,4.0,200.0,3.0,1.0,7.0,3.0,3.0,314.0,3.0
357,C1983,903.0,아파트,충청남도,영구임대,31.32,99.0,3.0,C,9670000,...,0.0,4.0,200.0,3.0,1.0,7.0,3.0,3.0,314.0,3.0
642,C2644,779.0,아파트,충청남도,영구임대,26.37,239.0,7.0,C,8684000,...,0.0,12.0,190.0,1.0,1.0,7.0,3.0,3.0,549.0,2.0
643,C2644,779.0,아파트,충청남도,영구임대,26.37,149.0,7.0,C,8684000,...,0.0,12.0,190.0,1.0,1.0,7.0,3.0,3.0,549.0,2.0
764,C1004,521.0,아파트,충청남도,영구임대,39.3,240.0,3.0,C,4682000,...,0.0,2.0,153.0,3.0,1.0,7.0,3.0,3.0,2.0,1.0
765,C1004,521.0,아파트,충청남도,영구임대,39.69,264.0,3.0,C,4728000,...,0.0,2.0,153.0,3.0,1.0,7.0,3.0,3.0,2.0,1.0
783,C1875,1003.0,아파트,충청남도,영구임대,26.37,596.0,5.0,C,3141000,...,0.0,8.0,192.0,3.0,1.0,7.0,3.0,3.0,282.0,3.0


In [60]:
all_df[all_df['지역']=='충청남도'][all_df['공급유형']=='임대상가']['단지코드'].unique()  # group13(충청남도,임대상가)

  """Entry point for launching an IPython kernel.


array(['C1874', 'C1983', 'C2644', 'C1004', 'C1875', 'C2177'], dtype=object)

In [61]:
codes13 = ['C1874', 'C1983', 'C2644', 'C1004', 'C1875', 'C2177']

for code in codes13:
    print(code, all_df[all_df['지역']=='충청남도'][all_df['공급유형']=='임대상가'][all_df['단지코드']==code]['10분내지하철수'].mean())

  after removing the cwd from sys.path.


C1874 nan
C1983 nan
C2644 nan
C1004 nan
C1875 nan
C2177 nan


In [62]:
group13['10분내지하철수'] = all_df[all_df['지역']=='충청남도']['10분내지하철수'].mean()
group13  # 단지 코드 평균이 NaN이므로 충남 지역 10분내지하철수의 평균으로 채움

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
101,C1874,619.0,상가,충청남도,임대상가,12.62,1.0,2.0,D,,...,0.0,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
102,C1874,619.0,상가,충청남도,임대상가,17.4,1.0,2.0,D,,...,0.0,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
103,C1874,619.0,상가,충청남도,임대상가,17.4,1.0,2.0,D,,...,0.0,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
104,C1874,619.0,상가,충청남도,임대상가,22.89,1.0,2.0,D,,...,0.0,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
105,C1874,619.0,상가,충청남도,임대상가,23.13,1.0,2.0,D,,...,0.0,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
106,C1874,619.0,상가,충청남도,임대상가,23.13,1.0,2.0,D,,...,0.0,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
107,C1874,619.0,상가,충청남도,임대상가,23.25,1.0,2.0,D,,...,0.0,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
108,C1874,619.0,상가,충청남도,임대상가,27.75,1.0,2.0,D,,...,0.0,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
109,C1874,619.0,상가,충청남도,임대상가,27.75,1.0,2.0,D,,...,0.0,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0
110,C1874,619.0,상가,충청남도,임대상가,27.75,1.0,2.0,D,,...,0.0,2.0,97.0,3.0,2.0,7.0,4.0,4.0,281.0,1.0


In [63]:
all_df[all_df['지역']=='충청남도'][all_df['공급유형']=='공공임대(50년)']['단지코드'].unique()  # group14(충청남도,공공임대(50년))

  """Entry point for launching an IPython kernel.


array(['C1983', 'C2216', 'C2644', 'C1318'], dtype=object)

In [64]:
codes14 = ['C1983', 'C2216', 'C2644', 'C1318']

for code in codes14:
    print(code, all_df[all_df['지역']=='충청남도'][all_df['공급유형']=='공공임대(50년)'][all_df['단지코드']==code]['10분내지하철수'].mean())

C1983 nan
C2216 nan
C2644 nan
C1318 nan


  after removing the cwd from sys.path.


In [65]:
all_df[all_df['지역']=='충청남도']['10분내지하철수'].mean()

0.0

In [66]:
group14['10분내지하철수'] = all_df[all_df['지역']=='충청남도']['10분내지하철수'].mean() 
group14  # 단지 코드 평균이 NaN이므로 충남 지역 10분내지하철수의 평균으로 채움

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
353,C1983,903.0,아파트,충청남도,공공임대(50년),36.0,40.0,3.0,A,10363000,...,0.0,4.0,200.0,3.0,1.0,7.0,2.0,1.0,314.0,3.0
354,C1983,903.0,아파트,충청남도,공공임대(50년),35.28,198.0,3.0,A,10060000,...,0.0,4.0,200.0,3.0,1.0,7.0,2.0,1.0,314.0,3.0
355,C1983,903.0,아파트,충청남도,공공임대(50년),35.28,78.0,3.0,A,10060000,...,0.0,4.0,200.0,3.0,1.0,7.0,2.0,1.0,314.0,3.0
620,C2216,390.0,아파트,충청남도,공공임대(50년),39.0,210.0,2.0,A,9401000,...,0.0,1.0,97.0,3.0,1.0,7.0,2.0,1.0,395.0,0.0
621,C2216,390.0,아파트,충청남도,공공임대(50년),39.39,180.0,2.0,A,9493000,...,0.0,1.0,97.0,3.0,1.0,7.0,2.0,1.0,395.0,0.0
640,C2644,779.0,아파트,충청남도,공공임대(50년),37.67,89.0,7.0,A,11879000,...,0.0,12.0,190.0,1.0,1.0,7.0,2.0,1.0,549.0,2.0
641,C2644,779.0,아파트,충청남도,공공임대(50년),37.67,298.0,7.0,A,11879000,...,0.0,12.0,190.0,1.0,1.0,7.0,2.0,1.0,549.0,2.0
3313,C1318,312.0,아파트,충청남도,공공임대(50년),39.51,135.0,1.0,A,14440000,...,0.0,11.0,226.0,3.0,1.0,7.0,2.0,1.0,105.0,0.0
3314,C1318,312.0,아파트,충청남도,공공임대(50년),39.72,18.0,1.0,A,14515000,...,0.0,11.0,226.0,3.0,1.0,7.0,2.0,1.0,105.0,0.0
3315,C1318,312.0,아파트,충청남도,공공임대(50년),49.99,159.0,1.0,A,18269000,...,0.0,11.0,226.0,3.0,1.0,7.0,2.0,1.0,105.0,0.0


In [67]:
group11_mean = group11['10분내지하철수'].mean()
group12_mean = group12['10분내지하철수'].mean()
group13_mean = group13['10분내지하철수'].mean()
group14_mean = group14['10분내지하철수'].mean()
all_df.loc[(all_df['지역']=='충청남도')&(all_df['공급유형']=='국민임대')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group11_mean
all_df.loc[(all_df['지역']=='충청남도')&(all_df['공급유형']=='영구임대')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group12_mean
all_df.loc[(all_df['지역']=='충청남도')&(all_df['공급유형']=='임대상가')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group13_mean
all_df.loc[(all_df['지역']=='충청남도')&(all_df['공급유형']=='공공임대(50년)')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group14_mean

In [68]:
all_df.loc[all_df['10분내지하철수'].isnull()]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
144,C1424,625.0,아파트,대전광역시,공공임대(50년),39.99,268.0,2.0,A,14247000.0,...,,7.0,517.0,1.0,1.0,2.0,2.0,1.0,137.0,1.0
145,C1424,625.0,아파트,대전광역시,공공임대(50년),49.95,119.0,2.0,A,20173000.0,...,,7.0,517.0,1.0,1.0,2.0,2.0,1.0,137.0,1.0
146,C1424,625.0,아파트,대전광역시,공공임대(50년),49.95,237.0,2.0,A,21203000.0,...,,7.0,517.0,1.0,1.0,2.0,2.0,1.0,137.0,1.0
147,C2100,880.0,아파트,대전광역시,국민임대,51.51,46.0,4.0,A,22307000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
148,C2100,880.0,아파트,대전광역시,국민임대,51.59,35.0,4.0,A,22307000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
149,C2100,880.0,아파트,대전광역시,국민임대,51.9,589.0,4.0,A,22307000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
150,C2100,880.0,아파트,대전광역시,국민임대,59.91,118.0,4.0,A,27885000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
151,C2100,880.0,아파트,대전광역시,국민임대,59.94,46.0,4.0,A,27885000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
152,C2100,880.0,아파트,대전광역시,국민임대,59.99,46.0,4.0,A,27885000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
171,C2520,970.0,아파트,대전광역시,공공임대(50년),39.69,960.0,2.0,A,13155000.0,...,,2.0,420.0,1.0,1.0,2.0,2.0,1.0,501.0,3.0


In [69]:
group2  #대전광역시

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
144,C1424,625.0,아파트,대전광역시,공공임대(50년),39.99,268.0,2.0,A,14247000.0,...,,7.0,517.0,1.0,1.0,2.0,2.0,1.0,137.0,1.0
145,C1424,625.0,아파트,대전광역시,공공임대(50년),49.95,119.0,2.0,A,20173000.0,...,,7.0,517.0,1.0,1.0,2.0,2.0,1.0,137.0,1.0
146,C1424,625.0,아파트,대전광역시,공공임대(50년),49.95,237.0,2.0,A,21203000.0,...,,7.0,517.0,1.0,1.0,2.0,2.0,1.0,137.0,1.0
147,C2100,880.0,아파트,대전광역시,국민임대,51.51,46.0,4.0,A,22307000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
148,C2100,880.0,아파트,대전광역시,국민임대,51.59,35.0,4.0,A,22307000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
149,C2100,880.0,아파트,대전광역시,국민임대,51.9,589.0,4.0,A,22307000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
150,C2100,880.0,아파트,대전광역시,국민임대,59.91,118.0,4.0,A,27885000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
151,C2100,880.0,아파트,대전광역시,국민임대,59.94,46.0,4.0,A,27885000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
152,C2100,880.0,아파트,대전광역시,국민임대,59.99,46.0,4.0,A,27885000.0,...,,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
171,C2520,970.0,아파트,대전광역시,공공임대(50년),39.69,960.0,2.0,A,13155000.0,...,,2.0,420.0,1.0,1.0,2.0,2.0,1.0,501.0,3.0


In [70]:
group2['공급유형'].unique()  # 10분내지하철수 값이 NaN인 대전광역시 지역 단지의 공급유형

array(['공공임대(50년)', '국민임대', '영구임대', '임대상가', '공공분양', '공공임대(10년)',
       '공공임대(분납)'], dtype=object)

In [71]:
grouped = group2.groupby(group2['공급유형'])
group21 = grouped.get_group('공공임대(50년)')
group22 = grouped.get_group('국민임대')
group23 = grouped.get_group('영구임대')
group24 = grouped.get_group('임대상가')
group25 = grouped.get_group('공공분양')
group26 = grouped.get_group('공공임대(10년)')
group27 = grouped.get_group('공공임대(분납)')

In [72]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(50년)']['단지코드'].unique()  # group21

  """Entry point for launching an IPython kernel.


array(['C1424', 'C2520'], dtype=object)

In [73]:
codes21 = ['C1424', 'C2520']  # 단지코드별 10분내지하철수의 평균이 NaN이므로 대전 지역 평균으로 채움

for code in codes21:
    print(code, all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(50년)'][all_df['단지코드']==code]['10분내지하철수'].mean(), end=' ')

C1424 nan C2520 nan 

  after removing the cwd from sys.path.


In [74]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(50년)']['10분내지하철수'].mean()

  """Entry point for launching an IPython kernel.


nan

In [75]:
all_df[all_df['지역']=='대전광역시']['10분내지하철수'].mean()  # 해당 공급유형의 평균도 결측이 발생하였으므로 지역 평균으로 채움

0.8409090909090909

In [76]:
group21['10분내지하철수'] = math.trunc(all_df[all_df['지역']=='대전광역시']['10분내지하철수'].mean())
group21

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
144,C1424,625.0,아파트,대전광역시,공공임대(50년),39.99,268.0,2.0,A,14247000,...,0,7.0,517.0,1.0,1.0,2.0,2.0,1.0,137.0,1.0
145,C1424,625.0,아파트,대전광역시,공공임대(50년),49.95,119.0,2.0,A,20173000,...,0,7.0,517.0,1.0,1.0,2.0,2.0,1.0,137.0,1.0
146,C1424,625.0,아파트,대전광역시,공공임대(50년),49.95,237.0,2.0,A,21203000,...,0,7.0,517.0,1.0,1.0,2.0,2.0,1.0,137.0,1.0
171,C2520,970.0,아파트,대전광역시,공공임대(50년),39.69,960.0,2.0,A,13155000,...,0,2.0,420.0,1.0,1.0,2.0,2.0,1.0,501.0,3.0


In [77]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='국민임대']['단지코드'].unique()  # group22

  """Entry point for launching an IPython kernel.


array(['C1407', 'C1171', 'C2100', 'C2319', 'C1032', 'C1663', 'C2276',
       'C1155', 'C1693', 'C1177', 'C1406', 'C2314', 'C2583', 'C1017',
       'C1083', 'C1215', 'C1189'], dtype=object)

In [78]:
codes22 = ['C1407','C1171','C2100','C2319','C1032','C1663','C2276','C1155','C1693','C1177',
 'C1406','C2314','C2583','C1017','C1083','C1215','C1189']

for code in codes22:
    print(code, all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='국민임대'][all_df['단지코드']==code]['10분내지하철수'].mean())

  """


C1407 1.0
C1171 0.0
C2100 nan
C2319 1.0
C1032 1.0
C1663 1.0
C2276 0.0
C1155 1.0
C1693 0.0
C1177 1.0
C1406 0.0
C2314 0.0
C2583 nan
C1017 0.0
C1083 nan
C1215 1.0
C1189 0.0


In [79]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='국민임대']['10분내지하철수'].mean()

  """Entry point for launching an IPython kernel.


0.5102040816326531

In [80]:
group22['10분내지하철수'] = math.trunc(all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='국민임대']['10분내지하철수'].mean())
group22

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
147,C2100,880.0,아파트,대전광역시,국민임대,51.51,46.0,4.0,A,22307000,...,0,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
148,C2100,880.0,아파트,대전광역시,국민임대,51.59,35.0,4.0,A,22307000,...,0,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
149,C2100,880.0,아파트,대전광역시,국민임대,51.9,589.0,4.0,A,22307000,...,0,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
150,C2100,880.0,아파트,대전광역시,국민임대,59.91,118.0,4.0,A,27885000,...,0,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
151,C2100,880.0,아파트,대전광역시,국민임대,59.94,46.0,4.0,A,27885000,...,0,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
152,C2100,880.0,아파트,대전광역시,국민임대,59.99,46.0,4.0,A,27885000,...,0,5.0,736.0,3.0,1.0,2.0,1.0,1.0,358.0,2.0
2285,C2583,1106.0,아파트,대전광역시,국민임대,36.85,24.0,21.0,A,11397000,...,0,2.0,752.0,3.0,1.0,2.0,1.0,1.0,529.0,3.0
2286,C2583,1106.0,아파트,대전광역시,국민임대,36.85,70.0,21.0,A,11397000,...,0,2.0,752.0,3.0,1.0,2.0,1.0,1.0,529.0,3.0
2287,C2583,1106.0,아파트,대전광역시,국민임대,36.93,604.0,21.0,A,11397000,...,0,2.0,752.0,3.0,1.0,2.0,1.0,1.0,529.0,3.0
2288,C2583,1106.0,아파트,대전광역시,국민임대,36.93,24.0,21.0,A,11397000,...,0,2.0,752.0,3.0,1.0,2.0,1.0,1.0,529.0,3.0


In [81]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='영구임대']['단지코드'].unique()  # group23

  """Entry point for launching an IPython kernel.


array(['C2621', 'C1616', 'C1704', 'C2258', 'C2038', 'C1859', 'C2314',
       'C2583', 'C1006'], dtype=object)

In [82]:
codes23 = ['C2621', 'C1616', 'C1704', 'C2258', 'C2038', 'C1859', 'C2314', 'C2583', 'C1006']

for code in codes23:
    print(code, all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='영구임대'][all_df['단지코드']==code]['10분내지하철수'].mean())

C2621 1.0
C1616 nan
C1704 nan
C2258 nan
C2038 1.0
C1859 1.0
C2314 0.0
C2583 nan
C1006 2.0


  after removing the cwd from sys.path.


In [83]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='영구임대']['10분내지하철수'].mean()

  """Entry point for launching an IPython kernel.


1.1538461538461537

In [84]:
group23['10분내지하철수'] = round(all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='영구임대']['10분내지하철수'].mean())
group23

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
176,C1616,1507.0,아파트,대전광역시,영구임대,26.37,588.0,1.0,C,5787000,...,1,2.0,407.0,3.0,1.0,2.0,3.0,3.0,185.0,4.0
177,C1616,1507.0,아파트,대전광역시,영구임대,31.32,450.0,1.0,C,6873000,...,1,2.0,407.0,3.0,1.0,2.0,3.0,3.0,185.0,4.0
178,C1616,1507.0,아파트,대전광역시,영구임대,40.32,450.0,1.0,E,54040000,...,1,2.0,407.0,3.0,1.0,2.0,3.0,5.0,185.0,4.0
198,C1704,639.0,아파트,대전광역시,영구임대,39.3,204.0,34.0,C,8625000,...,1,7.0,162.0,3.0,1.0,2.0,3.0,3.0,222.0,1.0
199,C1704,639.0,아파트,대전광역시,영구임대,39.69,420.0,34.0,C,8710000,...,1,7.0,162.0,3.0,1.0,2.0,3.0,3.0,222.0,1.0
215,C2258,965.0,아파트,대전광역시,영구임대,26.37,588.0,13.0,C,5787000,...,1,3.0,287.0,3.0,1.0,2.0,3.0,3.0,408.0,3.0
216,C2258,965.0,아파트,대전광역시,영구임대,31.32,180.0,13.0,C,6873000,...,1,3.0,287.0,3.0,1.0,2.0,3.0,3.0,408.0,3.0
217,C2258,965.0,아파트,대전광역시,영구임대,40.32,180.0,13.0,C,8848000,...,1,3.0,287.0,3.0,1.0,2.0,3.0,3.0,408.0,3.0
2291,C2583,1106.0,아파트,대전광역시,영구임대,21.97,60.0,21.0,I,7175000,...,1,2.0,752.0,3.0,1.0,2.0,3.0,9.0,529.0,3.0
2292,C2583,1106.0,아파트,대전광역시,영구임대,26.9,60.0,21.0,I,8707000,...,1,2.0,752.0,3.0,1.0,2.0,3.0,9.0,529.0,3.0


In [85]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='임대상가']['단지코드'].unique()  # group24

  """Entry point for launching an IPython kernel.


array(['C2621', 'C1616', 'C1704', 'C2258', 'C2038', 'C1859', 'C1006'],
      dtype=object)

In [86]:
codes24 = ['C2621', 'C1616', 'C1704', 'C2258', 'C2038', 'C1859', 'C1006']

for code in codes24:
    print(code, all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='임대상가'][all_df['단지코드']==code]['10분내지하철수'].mean())

  after removing the cwd from sys.path.


C2621 1.0
C1616 nan
C1704 nan
C2258 nan
C2038 1.0
C1859 1.0
C1006 2.0


In [87]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='임대상가']['10분내지하철수'].mean()

  """Entry point for launching an IPython kernel.


1.2439024390243902

In [88]:
group24['10분내지하철수'] = round(all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='임대상가']['10분내지하철수'].mean())
group24

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
179,C1616,1507.0,상가,대전광역시,임대상가,38.0,1.0,1.0,D,,...,1,2.0,407.0,3.0,2.0,2.0,4.0,4.0,185.0,4.0
180,C1616,1507.0,상가,대전광역시,임대상가,38.0,1.0,1.0,D,,...,1,2.0,407.0,3.0,2.0,2.0,4.0,4.0,185.0,4.0
181,C1616,1507.0,상가,대전광역시,임대상가,37.26,1.0,1.0,D,,...,1,2.0,407.0,3.0,2.0,2.0,4.0,4.0,185.0,4.0
182,C1616,1507.0,상가,대전광역시,임대상가,37.41,1.0,1.0,D,,...,1,2.0,407.0,3.0,2.0,2.0,4.0,4.0,185.0,4.0
183,C1616,1507.0,상가,대전광역시,임대상가,37.41,1.0,1.0,D,,...,1,2.0,407.0,3.0,2.0,2.0,4.0,4.0,185.0,4.0
184,C1616,1507.0,상가,대전광역시,임대상가,37.41,1.0,1.0,D,,...,1,2.0,407.0,3.0,2.0,2.0,4.0,4.0,185.0,4.0
185,C1616,1507.0,상가,대전광역시,임대상가,37.49,1.0,1.0,D,,...,1,2.0,407.0,3.0,2.0,2.0,4.0,4.0,185.0,4.0
186,C1616,1507.0,상가,대전광역시,임대상가,37.95,1.0,1.0,D,,...,1,2.0,407.0,3.0,2.0,2.0,4.0,4.0,185.0,4.0
187,C1616,1507.0,상가,대전광역시,임대상가,37.95,1.0,1.0,D,,...,1,2.0,407.0,3.0,2.0,2.0,4.0,4.0,185.0,4.0
188,C1616,1507.0,상가,대전광역시,임대상가,38.04,1.0,1.0,D,,...,1,2.0,407.0,3.0,2.0,2.0,4.0,4.0,185.0,4.0


In [89]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공분양']['단지코드'].unique()  # group25

  """Entry point for launching an IPython kernel.


array(['C1350'], dtype=object)

In [90]:
print('C1350', all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공분양'][all_df['단지코드']=='C1350']['10분내지하철수'].mean())

C1350 nan


  """Entry point for launching an IPython kernel.


In [91]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공분양']['10분내지하철수'].mean()

  """Entry point for launching an IPython kernel.


nan

In [92]:
all_df[all_df['지역']=='대전광역시']['10분내지하철수'].mean()

0.8409090909090909

In [93]:
group25['10분내지하철수'] = math.trunc(all_df[all_df['지역']=='대전광역시']['10분내지하철수'].mean())
group25

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
2331,C1350,1401.0,아파트,대전광역시,공공분양,74.94,317.0,2.0,D,,...,0,6.0,1636.0,3.0,1.0,2.0,8.0,4.0,119.0,4.0
2332,C1350,1401.0,아파트,대전광역시,공공분양,74.94,137.0,2.0,D,,...,0,6.0,1636.0,3.0,1.0,2.0,8.0,4.0,119.0,4.0
2333,C1350,1401.0,아파트,대전광역시,공공분양,74.94,22.0,2.0,D,,...,0,6.0,1636.0,3.0,1.0,2.0,8.0,4.0,119.0,4.0
2334,C1350,1401.0,아파트,대전광역시,공공분양,84.94,164.0,2.0,D,,...,0,6.0,1636.0,3.0,1.0,2.0,8.0,4.0,119.0,4.0
2335,C1350,1401.0,아파트,대전광역시,공공분양,84.94,19.0,2.0,D,,...,0,6.0,1636.0,3.0,1.0,2.0,8.0,4.0,119.0,4.0
2336,C1350,1401.0,아파트,대전광역시,공공분양,84.96,26.0,2.0,D,,...,0,6.0,1636.0,3.0,1.0,2.0,8.0,4.0,119.0,4.0
2337,C1350,1401.0,아파트,대전광역시,공공분양,84.97,26.0,2.0,D,,...,0,6.0,1636.0,3.0,1.0,2.0,8.0,4.0,119.0,4.0


In [94]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(10년)']['단지코드'].unique()  # group26

  """Entry point for launching an IPython kernel.


array(['C1350', 'C2430'], dtype=object)

In [95]:
codes26 = ['C1350', 'C2430']

for code in codes26:
    print(code, all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(10년)'][all_df['단지코드']==code]['10분내지하철수'].mean(), end=' ')

C1350 nan C2430 0.0 

  after removing the cwd from sys.path.


In [96]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(10년)']['10분내지하철수'].mean()

  """Entry point for launching an IPython kernel.


0.0

In [97]:
group26['10분내지하철수'] = all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(10년)']['10분내지하철수'].mean()
group26

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
2338,C1350,1401.0,아파트,대전광역시,공공임대(10년),51.99,106.0,2.0,A,28013000,...,0.0,6.0,1636.0,3.0,1.0,2.0,5.0,1.0,119.0,4.0
2339,C1350,1401.0,아파트,대전광역시,공공임대(10년),59.91,13.0,2.0,A,37474000,...,0.0,6.0,1636.0,3.0,1.0,2.0,5.0,1.0,119.0,4.0
2340,C1350,1401.0,아파트,대전광역시,공공임대(10년),59.92,223.0,2.0,A,37385000,...,0.0,6.0,1636.0,3.0,1.0,2.0,5.0,1.0,119.0,4.0


In [98]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(분납)']['단지코드'].unique()  # group27

  """Entry point for launching an IPython kernel.


array(['C1350', 'C2430'], dtype=object)

In [99]:
codes27 = ['C1350', 'C2430']

for code in codes27:
    print(code, all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(분납)'][all_df['단지코드']==code]['10분내지하철수'].mean(), end=' ')

C1350 nan C2430 0.0 

  after removing the cwd from sys.path.


In [100]:
all_df[all_df['지역']=='대전광역시'][all_df['단지코드']=='C1350']['10분내지하철수'].mean()

  """Entry point for launching an IPython kernel.


nan

In [101]:
all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(분납)']['10분내지하철수'].mean()

  """Entry point for launching an IPython kernel.


0.0

In [102]:
group27['10분내지하철수'] = all_df[all_df['지역']=='대전광역시'][all_df['공급유형']=='공공임대(분납)']['10분내지하철수'].mean()
group27

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
2341,C1350,1401.0,아파트,대전광역시,공공임대(분납),51.99,146.0,2.0,A,54612000,...,0.0,6.0,1636.0,3.0,1.0,2.0,6.0,1.0,119.0,4.0
2342,C1350,1401.0,아파트,대전광역시,공공임대(분납),59.91,32.0,2.0,A,63585000,...,0.0,6.0,1636.0,3.0,1.0,2.0,6.0,1.0,119.0,4.0
2343,C1350,1401.0,아파트,대전광역시,공공임대(분납),59.92,170.0,2.0,A,63501000,...,0.0,6.0,1636.0,3.0,1.0,2.0,6.0,1.0,119.0,4.0


In [103]:
group21_mean = group21['10분내지하철수'].mean()
group22_mean = group22['10분내지하철수'].mean()
group23_mean = group23['10분내지하철수'].mean()
group24_mean = group24['10분내지하철수'].mean()
group25_mean = group25['10분내지하철수'].mean()
group26_mean = group26['10분내지하철수'].mean()
group27_mean = group27['10분내지하철수'].mean()
all_df.loc[(all_df['지역']=='대전광역시')&(all_df['공급유형']=='공공임대(50년)')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group21_mean
all_df.loc[(all_df['지역']=='대전광역시')&(all_df['공급유형']=='국민임대')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group22_mean
all_df.loc[(all_df['지역']=='대전광역시')&(all_df['공급유형']=='영구임대')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group23_mean
all_df.loc[(all_df['지역']=='대전광역시')&(all_df['공급유형']=='임대상가')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group24_mean
all_df.loc[(all_df['지역']=='대전광역시')&(all_df['공급유형']=='공공분양')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group25_mean
all_df.loc[(all_df['지역']=='대전광역시')&(all_df['공급유형']=='공공임대(10년)')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group26_mean
all_df.loc[(all_df['지역']=='대전광역시')&(all_df['공급유형']=='공공임대(분납)')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group27_mean

In [104]:
all_df[all_df['10분내지하철수'].isnull()]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
2315,N2431,1047.0,아파트,경상남도,공공임대(10년),74.97,80.0,15.0,A,46000000.0,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2316,N2431,1047.0,아파트,경상남도,공공임대(10년),84.95,124.0,15.0,A,57000000.0,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2317,N2431,1047.0,아파트,경상남도,공공임대(10년),84.96,289.0,15.0,A,57000000.0,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2318,N2431,1047.0,아파트,경상남도,공공임대(10년),84.98,82.0,15.0,A,57000000.0,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2896,,,,,,,,,,,...,,,,,,,,,,
2897,,,,,,,,,,,...,,,,,,,,,,
2898,,,,,,,,,,,...,,,,,,,,,,
2899,,,,,,,,,,,...,,,,,,,,,,
2900,,,,,,,,,,,...,,,,,,,,,,
2901,,,,,,,,,,,...,,,,,,,,,,


In [105]:
group3 # 경상남도

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
2315,N2431,1047.0,아파트,경상남도,공공임대(10년),74.97,80.0,15.0,A,46000000,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2316,N2431,1047.0,아파트,경상남도,공공임대(10년),84.95,124.0,15.0,A,57000000,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2317,N2431,1047.0,아파트,경상남도,공공임대(10년),84.96,289.0,15.0,A,57000000,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2318,N2431,1047.0,아파트,경상남도,공공임대(10년),84.98,82.0,15.0,A,57000000,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0


In [106]:
group3['공급유형'].unique()

array(['공공임대(10년)'], dtype=object)

In [107]:
all_df[all_df['지역']=='경상남도'][all_df['공급유형']=='공공임대(10년)']

  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
2172,C1788,376.0,아파트,경상남도,공공임대(10년),51.59,116.0,28.0,A,29000000,...,0.0,3.0,380.0,3.0,1.0,1.0,5.0,1.0,250.0,0.0
2173,C1788,376.0,아파트,경상남도,공공임대(10년),59.97,260.0,28.0,A,36000000,...,0.0,3.0,380.0,3.0,1.0,1.0,5.0,1.0,250.0,0.0
2222,C2405,600.0,아파트,경상남도,공공임대(10년),75.84,48.0,22.0,A,36019000,...,0.0,8.0,728.0,2.0,1.0,1.0,5.0,1.0,460.0,1.0
2223,C2405,600.0,아파트,경상남도,공공임대(10년),75.99,54.0,22.0,A,36113000,...,0.0,8.0,728.0,2.0,1.0,1.0,5.0,1.0,460.0,1.0
2224,C2405,600.0,아파트,경상남도,공공임대(10년),84.95,182.0,22.0,A,44868000,...,0.0,8.0,728.0,2.0,1.0,1.0,5.0,1.0,460.0,1.0
2315,N2431,1047.0,아파트,경상남도,공공임대(10년),74.97,80.0,15.0,A,46000000,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2316,N2431,1047.0,아파트,경상남도,공공임대(10년),84.95,124.0,15.0,A,57000000,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2317,N2431,1047.0,아파트,경상남도,공공임대(10년),84.96,289.0,15.0,A,57000000,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2318,N2431,1047.0,아파트,경상남도,공공임대(10년),84.98,82.0,15.0,A,57000000,...,,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2543,C1941,404.0,아파트,경상남도,공공임대(10년),84.94,64.0,19.0,A,47288000,...,0.0,3.0,490.0,3.0,1.0,1.0,5.0,1.0,304.0,0.0


In [108]:
all_df[all_df['지역']=='경상남도'][all_df['단지코드']=='N2431']['10분내지하철수'].mean()

  """Entry point for launching an IPython kernel.


0.0

In [109]:
group3['10분내지하철수'] = all_df[all_df['지역']=='경상남도'][all_df['공급유형']=='공공임대(10년)']['10분내지하철수'].mean()
group3

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
2315,N2431,1047.0,아파트,경상남도,공공임대(10년),74.97,80.0,15.0,A,46000000,...,0.0,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2316,N2431,1047.0,아파트,경상남도,공공임대(10년),84.95,124.0,15.0,A,57000000,...,0.0,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2317,N2431,1047.0,아파트,경상남도,공공임대(10년),84.96,289.0,15.0,A,57000000,...,0.0,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0
2318,N2431,1047.0,아파트,경상남도,공공임대(10년),84.98,82.0,15.0,A,57000000,...,0.0,3.983003,1066.0,3.0,1.0,1.0,5.0,1.0,567.0,3.0


In [110]:
group3_mean = group3['10분내지하철수'].mean()
all_df.loc[(all_df['지역']=='경상남도')&(all_df['공급유형']=='공공임대(10년)')&(all_df['10분내지하철수'].isnull()), '10분내지하철수']=group3_mean

In [111]:
all_df[all_df['10분내지하철수'].isnull()]

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,...,10분내지하철수,10분내버스정류장수,단지내주차면수,단지코드_Type,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수
2896,,,,,,,,,,,...,,,,,,,,,,
2897,,,,,,,,,,,...,,,,,,,,,,
2898,,,,,,,,,,,...,,,,,,,,,,
2899,,,,,,,,,,,...,,,,,,,,,,
2900,,,,,,,,,,,...,,,,,,,,,,
2901,,,,,,,,,,,...,,,,,,,,,,
2902,,,,,,,,,,,...,,,,,,,,,,
2903,,,,,,,,,,,...,,,,,,,,,,
2904,,,,,,,,,,,...,,,,,,,,,,
2905,,,,,,,,,,,...,,,,,,,,,,


In [112]:
all_df['10분내지하철수'].unique()

array([ 0.,  1.,  2.,  3., nan])

In [113]:
all_df.isnull().sum()

단지코드           56
총세대수           56
임대건물구분         56
지역             56
공급유형           56
전용면적           56
전용면적별세대수       56
공가수            56
자격유형           56
임대보증금         805
임대료           805
10분내지하철수       56
10분내버스정류장수     56
단지내주차면수        56
단지코드_Type      56
임대건물구분_lbl     56
지역_lbl         56
공급유형_lbl       56
자격유형_lbl       56
단지코드_lbl       56
qcut_총세대수      56
dtype: int64

In [114]:
train_df = all_df.iloc[0:2896,:]
test_df = all_df.iloc[2896:,:]

train_df.shape, test_df.shape

((2896, 21), (1120, 21))

In [115]:
train_df = pd.concat([train_df, train[['등록차량수', 'log_등록차량수']] ], axis=1)
train_df.shape

(2952, 23)

### 지하철 결측치 처리 후, csv파일을 만들기

In [116]:
train_df.to_csv("./data/train_df_errno2.csv", index=False)
test_df.to_csv("./data/test_df2.csv", index=False)

# 3. rent_fee_missing_values_preprocessing

### 임대료 및 임대보증금 결측치 처리

In [117]:
train = pd.read_csv('./Data/train_df_errno2.csv')
test = pd.read_csv('./Data/test_df2.csv')

train.shape, test.shape

((2952, 23), (1120, 21))

In [118]:
train.isnull().sum()

단지코드           56
총세대수           56
임대건물구분         56
지역             56
공급유형           56
전용면적           56
전용면적별세대수       56
공가수            56
자격유형           56
임대보증금         625
임대료           625
10분내지하철수       56
10분내버스정류장수     56
단지내주차면수        56
단지코드_Type      56
임대건물구분_lbl     56
지역_lbl         56
공급유형_lbl       56
자격유형_lbl       56
단지코드_lbl       56
qcut_총세대수      56
등록차량수           0
log_등록차량수       0
dtype: int64

In [119]:
test.isnull().sum()

단지코드           56
총세대수           56
임대건물구분         56
지역             56
공급유형           56
전용면적           56
전용면적별세대수       56
공가수            56
자격유형           56
임대보증금         236
임대료           236
10분내지하철수       56
10분내버스정류장수     56
단지내주차면수        56
단지코드_Type      56
임대건물구분_lbl     56
지역_lbl         56
공급유형_lbl       56
자격유형_lbl       56
단지코드_lbl       56
qcut_총세대수      56
dtype: int64

In [120]:
all_df = pd.concat([train, test], join='inner', ignore_index=True)
all_df.isnull().sum()

단지코드          112
총세대수          112
임대건물구분        112
지역            112
공급유형          112
전용면적          112
전용면적별세대수      112
공가수           112
자격유형          112
임대보증금         861
임대료           861
10분내지하철수      112
10분내버스정류장수    112
단지내주차면수       112
단지코드_Type     112
임대건물구분_lbl    112
지역_lbl        112
공급유형_lbl      112
자격유형_lbl      112
단지코드_lbl      112
qcut_총세대수     112
dtype: int64

In [121]:
all_df.loc[all_df['임대보증금'].isna(), '임대보증금'] = 0
all_df.loc[all_df['임대보증금'] == '-', '임대보증금'] = 0
all_df['임대보증금'] = all_df['임대보증금'].astype(float)


all_df.loc[all_df['임대료'].isna(), '임대료'] = 0
all_df.loc[all_df['임대료'] == '-', '임대료'] = 0
all_df['임대료'] = all_df['임대료'].astype(float)

In [122]:
all_df.isna().sum()

단지코드          112
총세대수          112
임대건물구분        112
지역            112
공급유형          112
전용면적          112
전용면적별세대수      112
공가수           112
자격유형          112
임대보증금           0
임대료             0
10분내지하철수      112
10분내버스정류장수    112
단지내주차면수       112
단지코드_Type     112
임대건물구분_lbl    112
지역_lbl        112
공급유형_lbl      112
자격유형_lbl      112
단지코드_lbl      112
qcut_총세대수     112
dtype: int64

In [123]:
train_df = all_df.iloc[0:2896,:]
test_df = all_df.iloc[2896:,:]

train_df.shape, test_df.shape

((2896, 21), (1176, 21))

In [124]:
train_df = pd.concat([train_df, train[['등록차량수', 'log_등록차량수']]], axis=1)
train_df.shape

(2952, 23)

### 임대보증금 결측치 처리 후 최종 파일 출력

In [125]:
train_df.to_csv("./data/train_df_errno_final.csv", index=False)
test_df.to_csv("./data/test_df_final.csv", index=False)

# 4. model(create, submit)

In [126]:
train = pd.read_csv('./Data/train_df_errno_final.csv')
test = pd.read_csv('./Data/test_df_final.csv')

train.shape, test.shape

((2952, 23), (1176, 21))

In [127]:
all_df = pd.concat([train, test], ignore_index=True)
all_df.shape

(4128, 23)

### Feature 추가(교통편의성 / 총세대수주차면수)

In [128]:
all_df['교통편의성'] = all_df['10분내지하철수'] + all_df['10분내버스정류장수']
all_df['총세대수주차면수'] =  all_df['단지내주차면수'] / all_df['총세대수']

In [129]:
train = all_df.iloc[0:2896,:]
test = all_df.iloc[2896:,:]

test.drop(['등록차량수', 'log_등록차량수'], inplace=True, axis=1)

train.shape, test.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


((2896, 25), (1232, 23))

### cross_val_score 이용 사전 MAE 점수확인

In [130]:
sel = ['총세대수', '전용면적', '공가수', '단지내주차면수','qcut_총세대수', '자격유형_lbl', 
       '전용면적별세대수', '10분내버스정류장수', '10분내지하철수', '임대건물구분_lbl', 
       '공급유형_lbl', '지역_lbl', '단지코드_lbl','단지코드_Type','교통편의성', 
       '총세대수주차면수','임대료','임대보증금']

X = train[sel]
y = train['log_등록차량수']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [131]:
model = CatBoostRegressor(loss_function='MAE',
                          n_estimators=500, 
                          learning_rate=0.1, 
                          random_state=42)

model.fit(X_train, y_train)

print('Train :', model.score(X_train,y_train))
print('Test :', model.score(X_test,y_test))

0:	learn: 0.6494940	total: 151ms	remaining: 1m 15s
1:	learn: 0.5924292	total: 161ms	remaining: 40s
2:	learn: 0.5495952	total: 170ms	remaining: 28.1s
3:	learn: 0.5078658	total: 174ms	remaining: 21.6s
4:	learn: 0.4720927	total: 179ms	remaining: 17.8s
5:	learn: 0.4432661	total: 193ms	remaining: 15.9s
6:	learn: 0.4144279	total: 201ms	remaining: 14.2s
7:	learn: 0.3910125	total: 205ms	remaining: 12.6s
8:	learn: 0.3680969	total: 211ms	remaining: 11.5s
9:	learn: 0.3480728	total: 220ms	remaining: 10.8s
10:	learn: 0.3311284	total: 225ms	remaining: 10s
11:	learn: 0.3166579	total: 235ms	remaining: 9.56s
12:	learn: 0.3034582	total: 240ms	remaining: 9s
13:	learn: 0.2893081	total: 249ms	remaining: 8.64s
14:	learn: 0.2777542	total: 255ms	remaining: 8.26s
15:	learn: 0.2669274	total: 264ms	remaining: 7.99s
16:	learn: 0.2575592	total: 268ms	remaining: 7.62s
17:	learn: 0.2472059	total: 275ms	remaining: 7.37s
18:	learn: 0.2388910	total: 283ms	remaining: 7.16s
19:	learn: 0.2313684	total: 288ms	remaining: 6.

191:	learn: 0.0965668	total: 1.17s	remaining: 1.87s
192:	learn: 0.0963220	total: 1.17s	remaining: 1.86s
193:	learn: 0.0956034	total: 1.18s	remaining: 1.85s
194:	learn: 0.0954639	total: 1.18s	remaining: 1.84s
195:	learn: 0.0953084	total: 1.18s	remaining: 1.83s
196:	learn: 0.0947413	total: 1.19s	remaining: 1.82s
197:	learn: 0.0945868	total: 1.19s	remaining: 1.82s
198:	learn: 0.0945460	total: 1.19s	remaining: 1.81s
199:	learn: 0.0943950	total: 1.2s	remaining: 1.8s
200:	learn: 0.0941980	total: 1.2s	remaining: 1.79s
201:	learn: 0.0940191	total: 1.21s	remaining: 1.78s
202:	learn: 0.0939116	total: 1.21s	remaining: 1.77s
203:	learn: 0.0937626	total: 1.21s	remaining: 1.76s
204:	learn: 0.0936632	total: 1.22s	remaining: 1.75s
205:	learn: 0.0933886	total: 1.22s	remaining: 1.74s
206:	learn: 0.0933457	total: 1.23s	remaining: 1.73s
207:	learn: 0.0932408	total: 1.23s	remaining: 1.72s
208:	learn: 0.0929911	total: 1.23s	remaining: 1.72s
209:	learn: 0.0928201	total: 1.24s	remaining: 1.71s
210:	learn: 0.0

354:	learn: 0.0756471	total: 1.83s	remaining: 748ms
355:	learn: 0.0756035	total: 1.83s	remaining: 742ms
356:	learn: 0.0755547	total: 1.84s	remaining: 737ms
357:	learn: 0.0755202	total: 1.84s	remaining: 731ms
358:	learn: 0.0754628	total: 1.85s	remaining: 725ms
359:	learn: 0.0753981	total: 1.85s	remaining: 721ms
360:	learn: 0.0753639	total: 1.86s	remaining: 715ms
361:	learn: 0.0752413	total: 1.86s	remaining: 709ms
362:	learn: 0.0749713	total: 1.86s	remaining: 703ms
363:	learn: 0.0749622	total: 1.87s	remaining: 698ms
364:	learn: 0.0749405	total: 1.87s	remaining: 693ms
365:	learn: 0.0749179	total: 1.88s	remaining: 687ms
366:	learn: 0.0747926	total: 1.88s	remaining: 682ms
367:	learn: 0.0747166	total: 1.88s	remaining: 676ms
368:	learn: 0.0747047	total: 1.89s	remaining: 670ms
369:	learn: 0.0746665	total: 1.89s	remaining: 665ms
370:	learn: 0.0745422	total: 1.9s	remaining: 660ms
371:	learn: 0.0745250	total: 1.9s	remaining: 654ms
372:	learn: 0.0744819	total: 1.9s	remaining: 648ms
373:	learn: 0.0

In [132]:
now_time = time.time()

model = CatBoostRegressor(loss_function='MAE',
                          n_estimators=500, 
                          learning_rate=0.1, 
                          random_state=42)

model.fit(X_train, y_train)

score = cross_val_score(model, X_train, y_train,
                        cv=5, scoring="neg_mean_absolute_error") # neg_mean_squared_error
m_score = np.abs(score.mean())

print("CatBoostRegressor Score : {}".format(m_score))  # 점수

pro_time = time.time() - now_time
print('Time :', pro_time)  # 걸린 시간

0:	learn: 0.6494940	total: 3.26ms	remaining: 1.63s
1:	learn: 0.5924292	total: 7.63ms	remaining: 1.9s
2:	learn: 0.5495952	total: 11.4ms	remaining: 1.88s
3:	learn: 0.5078658	total: 15.1ms	remaining: 1.87s
4:	learn: 0.4720927	total: 19ms	remaining: 1.88s
5:	learn: 0.4432661	total: 23.5ms	remaining: 1.94s
6:	learn: 0.4144279	total: 27.5ms	remaining: 1.94s
7:	learn: 0.3910125	total: 31.3ms	remaining: 1.92s
8:	learn: 0.3680969	total: 35.1ms	remaining: 1.91s
9:	learn: 0.3480728	total: 38ms	remaining: 1.86s
10:	learn: 0.3311284	total: 41.1ms	remaining: 1.83s
11:	learn: 0.3166579	total: 44.3ms	remaining: 1.8s
12:	learn: 0.3034582	total: 47.3ms	remaining: 1.77s
13:	learn: 0.2893081	total: 50.9ms	remaining: 1.76s
14:	learn: 0.2777542	total: 54.8ms	remaining: 1.77s
15:	learn: 0.2669274	total: 58.8ms	remaining: 1.78s
16:	learn: 0.2575592	total: 61.4ms	remaining: 1.74s
17:	learn: 0.2472059	total: 64.3ms	remaining: 1.72s
18:	learn: 0.2388910	total: 67.2ms	remaining: 1.7s
19:	learn: 0.2313684	total: 7

169:	learn: 0.1001988	total: 665ms	remaining: 1.29s
170:	learn: 0.1000251	total: 669ms	remaining: 1.29s
171:	learn: 0.0998878	total: 673ms	remaining: 1.28s
172:	learn: 0.0997036	total: 677ms	remaining: 1.28s
173:	learn: 0.0995328	total: 680ms	remaining: 1.27s
174:	learn: 0.0992246	total: 684ms	remaining: 1.27s
175:	learn: 0.0990672	total: 688ms	remaining: 1.27s
176:	learn: 0.0987530	total: 691ms	remaining: 1.26s
177:	learn: 0.0986362	total: 695ms	remaining: 1.26s
178:	learn: 0.0985599	total: 698ms	remaining: 1.25s
179:	learn: 0.0985150	total: 702ms	remaining: 1.25s
180:	learn: 0.0984415	total: 705ms	remaining: 1.24s
181:	learn: 0.0983417	total: 708ms	remaining: 1.24s
182:	learn: 0.0982106	total: 712ms	remaining: 1.23s
183:	learn: 0.0980699	total: 716ms	remaining: 1.23s
184:	learn: 0.0977785	total: 719ms	remaining: 1.22s
185:	learn: 0.0976844	total: 722ms	remaining: 1.22s
186:	learn: 0.0974402	total: 726ms	remaining: 1.22s
187:	learn: 0.0973201	total: 730ms	remaining: 1.21s
188:	learn: 

348:	learn: 0.0767297	total: 1.33s	remaining: 575ms
349:	learn: 0.0767094	total: 1.33s	remaining: 571ms
350:	learn: 0.0763732	total: 1.34s	remaining: 567ms
351:	learn: 0.0761759	total: 1.34s	remaining: 564ms
352:	learn: 0.0759480	total: 1.34s	remaining: 560ms
353:	learn: 0.0757258	total: 1.35s	remaining: 556ms
354:	learn: 0.0756471	total: 1.35s	remaining: 552ms
355:	learn: 0.0756035	total: 1.35s	remaining: 548ms
356:	learn: 0.0755547	total: 1.36s	remaining: 544ms
357:	learn: 0.0755202	total: 1.36s	remaining: 540ms
358:	learn: 0.0754628	total: 1.36s	remaining: 536ms
359:	learn: 0.0753981	total: 1.37s	remaining: 533ms
360:	learn: 0.0753639	total: 1.37s	remaining: 529ms
361:	learn: 0.0752413	total: 1.38s	remaining: 525ms
362:	learn: 0.0749713	total: 1.38s	remaining: 521ms
363:	learn: 0.0749622	total: 1.38s	remaining: 517ms
364:	learn: 0.0749405	total: 1.39s	remaining: 513ms
365:	learn: 0.0749179	total: 1.39s	remaining: 509ms
366:	learn: 0.0747926	total: 1.39s	remaining: 504ms
367:	learn: 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


29:	learn: 0.1867693	total: 121ms	remaining: 1.89s
30:	learn: 0.1835731	total: 124ms	remaining: 1.88s
31:	learn: 0.1809910	total: 128ms	remaining: 1.88s
32:	learn: 0.1791033	total: 132ms	remaining: 1.86s
33:	learn: 0.1772571	total: 135ms	remaining: 1.85s
34:	learn: 0.1754104	total: 138ms	remaining: 1.83s
35:	learn: 0.1727282	total: 141ms	remaining: 1.81s
36:	learn: 0.1702813	total: 143ms	remaining: 1.79s
37:	learn: 0.1689588	total: 147ms	remaining: 1.78s
38:	learn: 0.1676415	total: 150ms	remaining: 1.77s
39:	learn: 0.1645315	total: 153ms	remaining: 1.76s
40:	learn: 0.1631222	total: 156ms	remaining: 1.75s
41:	learn: 0.1614683	total: 159ms	remaining: 1.74s
42:	learn: 0.1601159	total: 163ms	remaining: 1.74s
43:	learn: 0.1587899	total: 166ms	remaining: 1.72s
44:	learn: 0.1579620	total: 168ms	remaining: 1.7s
45:	learn: 0.1572593	total: 171ms	remaining: 1.68s
46:	learn: 0.1555826	total: 173ms	remaining: 1.67s
47:	learn: 0.1533798	total: 176ms	remaining: 1.66s
48:	learn: 0.1521925	total: 179m

231:	learn: 0.0802603	total: 774ms	remaining: 894ms
232:	learn: 0.0800413	total: 777ms	remaining: 891ms
233:	learn: 0.0797497	total: 781ms	remaining: 887ms
234:	learn: 0.0796232	total: 784ms	remaining: 884ms
235:	learn: 0.0795809	total: 787ms	remaining: 880ms
236:	learn: 0.0795163	total: 790ms	remaining: 877ms
237:	learn: 0.0794001	total: 794ms	remaining: 874ms
238:	learn: 0.0793117	total: 797ms	remaining: 870ms
239:	learn: 0.0792522	total: 800ms	remaining: 867ms
240:	learn: 0.0790770	total: 803ms	remaining: 863ms
241:	learn: 0.0788081	total: 806ms	remaining: 860ms
242:	learn: 0.0787183	total: 809ms	remaining: 856ms
243:	learn: 0.0786071	total: 813ms	remaining: 853ms
244:	learn: 0.0784702	total: 816ms	remaining: 850ms
245:	learn: 0.0784237	total: 819ms	remaining: 846ms
246:	learn: 0.0783422	total: 822ms	remaining: 842ms
247:	learn: 0.0783110	total: 826ms	remaining: 839ms
248:	learn: 0.0782873	total: 829ms	remaining: 836ms
249:	learn: 0.0781498	total: 832ms	remaining: 832ms
250:	learn: 

405:	learn: 0.0653113	total: 1.27s	remaining: 293ms
406:	learn: 0.0652910	total: 1.27s	remaining: 290ms
407:	learn: 0.0652317	total: 1.27s	remaining: 287ms
408:	learn: 0.0650631	total: 1.27s	remaining: 284ms
409:	learn: 0.0650147	total: 1.28s	remaining: 280ms
410:	learn: 0.0649960	total: 1.28s	remaining: 277ms
411:	learn: 0.0649884	total: 1.28s	remaining: 274ms
412:	learn: 0.0648411	total: 1.29s	remaining: 271ms
413:	learn: 0.0645369	total: 1.29s	remaining: 268ms
414:	learn: 0.0644865	total: 1.29s	remaining: 265ms
415:	learn: 0.0644809	total: 1.3s	remaining: 262ms
416:	learn: 0.0644623	total: 1.3s	remaining: 259ms
417:	learn: 0.0643114	total: 1.3s	remaining: 256ms
418:	learn: 0.0642453	total: 1.3s	remaining: 252ms
419:	learn: 0.0642144	total: 1.31s	remaining: 249ms
420:	learn: 0.0641743	total: 1.31s	remaining: 246ms
421:	learn: 0.0640388	total: 1.31s	remaining: 243ms
422:	learn: 0.0639944	total: 1.32s	remaining: 240ms
423:	learn: 0.0638895	total: 1.32s	remaining: 237ms
424:	learn: 0.06

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


10:	learn: 0.3350970	total: 30.2ms	remaining: 1.34s
11:	learn: 0.3220101	total: 33.2ms	remaining: 1.35s
12:	learn: 0.3074832	total: 35.7ms	remaining: 1.34s
13:	learn: 0.2925878	total: 38.4ms	remaining: 1.33s
14:	learn: 0.2801095	total: 41.1ms	remaining: 1.33s
15:	learn: 0.2665553	total: 43.7ms	remaining: 1.32s
16:	learn: 0.2556920	total: 46.6ms	remaining: 1.32s
17:	learn: 0.2470651	total: 49ms	remaining: 1.31s
18:	learn: 0.2376718	total: 51.6ms	remaining: 1.3s
19:	learn: 0.2308109	total: 54.1ms	remaining: 1.3s
20:	learn: 0.2245920	total: 56.6ms	remaining: 1.29s
21:	learn: 0.2177025	total: 59.2ms	remaining: 1.28s
22:	learn: 0.2122590	total: 62ms	remaining: 1.28s
23:	learn: 0.2067411	total: 64.7ms	remaining: 1.28s
24:	learn: 0.2028503	total: 67.1ms	remaining: 1.27s
25:	learn: 0.1985547	total: 69.6ms	remaining: 1.27s
26:	learn: 0.1934243	total: 72.2ms	remaining: 1.26s
27:	learn: 0.1898006	total: 75.1ms	remaining: 1.26s
28:	learn: 0.1869932	total: 77.5ms	remaining: 1.26s
29:	learn: 0.18359

190:	learn: 0.0865126	total: 524ms	remaining: 848ms
191:	learn: 0.0859181	total: 527ms	remaining: 845ms
192:	learn: 0.0855369	total: 530ms	remaining: 843ms
193:	learn: 0.0853977	total: 533ms	remaining: 841ms
194:	learn: 0.0852912	total: 536ms	remaining: 838ms
195:	learn: 0.0851541	total: 539ms	remaining: 836ms
196:	learn: 0.0850239	total: 541ms	remaining: 833ms
197:	learn: 0.0849013	total: 545ms	remaining: 832ms
198:	learn: 0.0847948	total: 549ms	remaining: 830ms
199:	learn: 0.0845472	total: 552ms	remaining: 828ms
200:	learn: 0.0843875	total: 556ms	remaining: 826ms
201:	learn: 0.0839539	total: 559ms	remaining: 824ms
202:	learn: 0.0835402	total: 562ms	remaining: 823ms
203:	learn: 0.0832047	total: 566ms	remaining: 821ms
204:	learn: 0.0829788	total: 569ms	remaining: 819ms
205:	learn: 0.0829112	total: 572ms	remaining: 816ms
206:	learn: 0.0828711	total: 575ms	remaining: 813ms
207:	learn: 0.0826450	total: 578ms	remaining: 811ms
208:	learn: 0.0826150	total: 581ms	remaining: 809ms
209:	learn: 

370:	learn: 0.0666968	total: 1.18s	remaining: 411ms
371:	learn: 0.0666863	total: 1.18s	remaining: 408ms
372:	learn: 0.0664965	total: 1.19s	remaining: 405ms
373:	learn: 0.0664248	total: 1.19s	remaining: 402ms
374:	learn: 0.0663946	total: 1.2s	remaining: 398ms
375:	learn: 0.0662753	total: 1.2s	remaining: 395ms
376:	learn: 0.0659955	total: 1.2s	remaining: 392ms
377:	learn: 0.0658683	total: 1.21s	remaining: 389ms
378:	learn: 0.0658347	total: 1.21s	remaining: 386ms
379:	learn: 0.0657936	total: 1.21s	remaining: 383ms
380:	learn: 0.0657256	total: 1.22s	remaining: 380ms
381:	learn: 0.0656988	total: 1.22s	remaining: 376ms
382:	learn: 0.0656472	total: 1.22s	remaining: 373ms
383:	learn: 0.0655758	total: 1.22s	remaining: 370ms
384:	learn: 0.0655393	total: 1.23s	remaining: 366ms
385:	learn: 0.0654069	total: 1.23s	remaining: 363ms
386:	learn: 0.0653273	total: 1.23s	remaining: 360ms
387:	learn: 0.0652927	total: 1.24s	remaining: 357ms
388:	learn: 0.0652236	total: 1.24s	remaining: 353ms
389:	learn: 0.0

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


17:	learn: 0.2502836	total: 58.3ms	remaining: 1.56s
18:	learn: 0.2420223	total: 60.8ms	remaining: 1.54s
19:	learn: 0.2350565	total: 63.8ms	remaining: 1.53s
20:	learn: 0.2274649	total: 67.3ms	remaining: 1.53s
21:	learn: 0.2210278	total: 69.8ms	remaining: 1.51s
22:	learn: 0.2162756	total: 72.3ms	remaining: 1.5s
23:	learn: 0.2112538	total: 74.7ms	remaining: 1.48s
24:	learn: 0.2065692	total: 77.4ms	remaining: 1.47s
25:	learn: 0.2017964	total: 80ms	remaining: 1.46s
26:	learn: 0.1979481	total: 84.2ms	remaining: 1.47s
27:	learn: 0.1937323	total: 86.8ms	remaining: 1.46s
28:	learn: 0.1912205	total: 89.3ms	remaining: 1.45s
29:	learn: 0.1886006	total: 91.8ms	remaining: 1.44s
30:	learn: 0.1860664	total: 94.9ms	remaining: 1.44s
31:	learn: 0.1829977	total: 98.7ms	remaining: 1.44s
32:	learn: 0.1793761	total: 101ms	remaining: 1.43s
33:	learn: 0.1782562	total: 103ms	remaining: 1.42s
34:	learn: 0.1753309	total: 106ms	remaining: 1.4s
35:	learn: 0.1733147	total: 108ms	remaining: 1.39s
36:	learn: 0.1713819

185:	learn: 0.0936048	total: 553ms	remaining: 934ms
186:	learn: 0.0934404	total: 556ms	remaining: 931ms
187:	learn: 0.0932115	total: 560ms	remaining: 929ms
188:	learn: 0.0931166	total: 562ms	remaining: 925ms
189:	learn: 0.0928938	total: 565ms	remaining: 921ms
190:	learn: 0.0928691	total: 567ms	remaining: 918ms
191:	learn: 0.0927689	total: 570ms	remaining: 915ms
192:	learn: 0.0926207	total: 573ms	remaining: 911ms
193:	learn: 0.0924264	total: 575ms	remaining: 907ms
194:	learn: 0.0922088	total: 578ms	remaining: 905ms
195:	learn: 0.0921232	total: 581ms	remaining: 901ms
196:	learn: 0.0919986	total: 584ms	remaining: 898ms
197:	learn: 0.0918235	total: 586ms	remaining: 894ms
198:	learn: 0.0912145	total: 589ms	remaining: 891ms
199:	learn: 0.0910934	total: 591ms	remaining: 887ms
200:	learn: 0.0909081	total: 594ms	remaining: 883ms
201:	learn: 0.0908312	total: 596ms	remaining: 880ms
202:	learn: 0.0906366	total: 599ms	remaining: 877ms
203:	learn: 0.0905305	total: 602ms	remaining: 873ms
204:	learn: 

350:	learn: 0.0722644	total: 1.04s	remaining: 444ms
351:	learn: 0.0722509	total: 1.05s	remaining: 441ms
352:	learn: 0.0722362	total: 1.08s	remaining: 451ms
353:	learn: 0.0722326	total: 1.09s	remaining: 450ms
354:	learn: 0.0722249	total: 1.1s	remaining: 448ms
355:	learn: 0.0722060	total: 1.1s	remaining: 446ms
356:	learn: 0.0721973	total: 1.11s	remaining: 443ms
357:	learn: 0.0721497	total: 1.11s	remaining: 442ms
358:	learn: 0.0720297	total: 1.12s	remaining: 441ms
359:	learn: 0.0719665	total: 1.13s	remaining: 440ms
360:	learn: 0.0719469	total: 1.14s	remaining: 437ms
361:	learn: 0.0718222	total: 1.14s	remaining: 435ms
362:	learn: 0.0717986	total: 1.14s	remaining: 432ms
363:	learn: 0.0717815	total: 1.15s	remaining: 430ms
364:	learn: 0.0717640	total: 1.15s	remaining: 427ms
365:	learn: 0.0716568	total: 1.16s	remaining: 424ms
366:	learn: 0.0715880	total: 1.16s	remaining: 422ms
367:	learn: 0.0715660	total: 1.17s	remaining: 419ms
368:	learn: 0.0715331	total: 1.17s	remaining: 416ms
369:	learn: 0.

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


1:	learn: 0.5837142	total: 7.19ms	remaining: 1.79s
2:	learn: 0.5391826	total: 11.4ms	remaining: 1.89s
3:	learn: 0.5027123	total: 15.1ms	remaining: 1.88s
4:	learn: 0.4712799	total: 18.3ms	remaining: 1.81s
5:	learn: 0.4411172	total: 22ms	remaining: 1.81s
6:	learn: 0.4113280	total: 26.6ms	remaining: 1.87s
7:	learn: 0.3852435	total: 30ms	remaining: 1.84s
8:	learn: 0.3646132	total: 34.1ms	remaining: 1.86s
9:	learn: 0.3460692	total: 38.7ms	remaining: 1.9s
10:	learn: 0.3286414	total: 42.4ms	remaining: 1.89s
11:	learn: 0.3174895	total: 47ms	remaining: 1.91s
12:	learn: 0.3030015	total: 50.1ms	remaining: 1.88s
13:	learn: 0.2907828	total: 55.5ms	remaining: 1.93s
14:	learn: 0.2783493	total: 59ms	remaining: 1.91s
15:	learn: 0.2658933	total: 62.5ms	remaining: 1.89s
16:	learn: 0.2557522	total: 66.3ms	remaining: 1.88s
17:	learn: 0.2468463	total: 69.9ms	remaining: 1.87s
18:	learn: 0.2377351	total: 72.5ms	remaining: 1.84s
19:	learn: 0.2319451	total: 75.7ms	remaining: 1.82s
20:	learn: 0.2245307	total: 78

208:	learn: 0.0886866	total: 665ms	remaining: 926ms
209:	learn: 0.0885727	total: 668ms	remaining: 922ms
210:	learn: 0.0885186	total: 670ms	remaining: 918ms
211:	learn: 0.0884008	total: 672ms	remaining: 914ms
212:	learn: 0.0883278	total: 675ms	remaining: 909ms
213:	learn: 0.0880841	total: 677ms	remaining: 905ms
214:	learn: 0.0880152	total: 680ms	remaining: 901ms
215:	learn: 0.0879829	total: 683ms	remaining: 898ms
216:	learn: 0.0878303	total: 686ms	remaining: 895ms
217:	learn: 0.0877540	total: 689ms	remaining: 892ms
218:	learn: 0.0876858	total: 692ms	remaining: 888ms
219:	learn: 0.0874270	total: 695ms	remaining: 884ms
220:	learn: 0.0873589	total: 697ms	remaining: 880ms
221:	learn: 0.0872624	total: 701ms	remaining: 878ms
222:	learn: 0.0871955	total: 704ms	remaining: 874ms
223:	learn: 0.0870530	total: 707ms	remaining: 871ms
224:	learn: 0.0869468	total: 710ms	remaining: 868ms
225:	learn: 0.0869019	total: 714ms	remaining: 865ms
226:	learn: 0.0866972	total: 716ms	remaining: 862ms
227:	learn: 

411:	learn: 0.0675561	total: 1.32s	remaining: 283ms
412:	learn: 0.0675307	total: 1.33s	remaining: 280ms
413:	learn: 0.0674658	total: 1.33s	remaining: 277ms
414:	learn: 0.0672636	total: 1.33s	remaining: 274ms
415:	learn: 0.0671999	total: 1.34s	remaining: 270ms
416:	learn: 0.0671487	total: 1.34s	remaining: 267ms
417:	learn: 0.0671354	total: 1.35s	remaining: 264ms
418:	learn: 0.0670270	total: 1.35s	remaining: 261ms
419:	learn: 0.0669829	total: 1.35s	remaining: 258ms
420:	learn: 0.0669656	total: 1.36s	remaining: 255ms
421:	learn: 0.0669450	total: 1.36s	remaining: 252ms
422:	learn: 0.0669154	total: 1.36s	remaining: 248ms
423:	learn: 0.0668780	total: 1.37s	remaining: 245ms
424:	learn: 0.0668316	total: 1.37s	remaining: 242ms
425:	learn: 0.0668211	total: 1.37s	remaining: 239ms
426:	learn: 0.0667774	total: 1.38s	remaining: 235ms
427:	learn: 0.0666844	total: 1.38s	remaining: 232ms
428:	learn: 0.0665217	total: 1.38s	remaining: 229ms
429:	learn: 0.0664372	total: 1.39s	remaining: 225ms
430:	learn: 

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)


1:	learn: 0.5870487	total: 6.48ms	remaining: 1.61s
2:	learn: 0.5460471	total: 9.24ms	remaining: 1.53s
3:	learn: 0.5069377	total: 13ms	remaining: 1.61s
4:	learn: 0.4757067	total: 16.4ms	remaining: 1.63s
5:	learn: 0.4530243	total: 19.5ms	remaining: 1.61s
6:	learn: 0.4240451	total: 22.2ms	remaining: 1.56s
7:	learn: 0.3978581	total: 25.5ms	remaining: 1.57s
8:	learn: 0.3747219	total: 29.3ms	remaining: 1.6s
9:	learn: 0.3517231	total: 32.4ms	remaining: 1.59s
10:	learn: 0.3336338	total: 35.3ms	remaining: 1.57s
11:	learn: 0.3198472	total: 37.9ms	remaining: 1.54s
12:	learn: 0.3062924	total: 41.4ms	remaining: 1.55s
13:	learn: 0.2926231	total: 45.3ms	remaining: 1.57s
14:	learn: 0.2794641	total: 47.9ms	remaining: 1.55s
15:	learn: 0.2697496	total: 50.5ms	remaining: 1.53s
16:	learn: 0.2596567	total: 53.6ms	remaining: 1.52s
17:	learn: 0.2502851	total: 57.6ms	remaining: 1.54s
18:	learn: 0.2431184	total: 60.4ms	remaining: 1.53s
19:	learn: 0.2361615	total: 63.8ms	remaining: 1.53s
20:	learn: 0.2305478	tot

197:	learn: 0.0907275	total: 661ms	remaining: 1.01s
198:	learn: 0.0905696	total: 664ms	remaining: 1s
199:	learn: 0.0904483	total: 668ms	remaining: 1s
200:	learn: 0.0901411	total: 670ms	remaining: 997ms
201:	learn: 0.0899176	total: 675ms	remaining: 995ms
202:	learn: 0.0897937	total: 678ms	remaining: 992ms
203:	learn: 0.0896001	total: 681ms	remaining: 987ms
204:	learn: 0.0894965	total: 683ms	remaining: 983ms
205:	learn: 0.0891722	total: 686ms	remaining: 979ms
206:	learn: 0.0884888	total: 689ms	remaining: 975ms
207:	learn: 0.0881661	total: 692ms	remaining: 971ms
208:	learn: 0.0880010	total: 695ms	remaining: 967ms
209:	learn: 0.0877739	total: 698ms	remaining: 963ms
210:	learn: 0.0874723	total: 700ms	remaining: 959ms
211:	learn: 0.0869466	total: 704ms	remaining: 956ms
212:	learn: 0.0866434	total: 707ms	remaining: 953ms
213:	learn: 0.0865122	total: 710ms	remaining: 949ms
214:	learn: 0.0863178	total: 713ms	remaining: 945ms
215:	learn: 0.0860674	total: 716ms	remaining: 942ms
216:	learn: 0.0857

357:	learn: 0.0650501	total: 1.15s	remaining: 457ms
358:	learn: 0.0650257	total: 1.16s	remaining: 454ms
359:	learn: 0.0649333	total: 1.16s	remaining: 451ms
360:	learn: 0.0648060	total: 1.16s	remaining: 447ms
361:	learn: 0.0647923	total: 1.16s	remaining: 444ms
362:	learn: 0.0646839	total: 1.17s	remaining: 440ms
363:	learn: 0.0646067	total: 1.17s	remaining: 437ms
364:	learn: 0.0645597	total: 1.17s	remaining: 434ms
365:	learn: 0.0645364	total: 1.18s	remaining: 431ms
366:	learn: 0.0644906	total: 1.18s	remaining: 427ms
367:	learn: 0.0644390	total: 1.18s	remaining: 424ms
368:	learn: 0.0644011	total: 1.19s	remaining: 421ms
369:	learn: 0.0643925	total: 1.19s	remaining: 418ms
370:	learn: 0.0642950	total: 1.19s	remaining: 414ms
371:	learn: 0.0642624	total: 1.19s	remaining: 411ms
372:	learn: 0.0641935	total: 1.2s	remaining: 407ms
373:	learn: 0.0639846	total: 1.2s	remaining: 404ms
374:	learn: 0.0639478	total: 1.2s	remaining: 401ms
375:	learn: 0.0639457	total: 1.21s	remaining: 398ms
376:	learn: 0.0

### 제출용 모델 생성 및 예측값 출력

In [133]:
sel = ['총세대수', '전용면적', '공가수', '단지내주차면수','qcut_총세대수', '자격유형_lbl', 
       '전용면적별세대수', '10분내버스정류장수', '10분내지하철수', '임대건물구분_lbl', 
       '공급유형_lbl', '지역_lbl', '단지코드_lbl','단지코드_Type','교통편의성', 
       '총세대수주차면수','임대료','임대보증금']

X = train[sel]
y = train['log_등록차량수']
test_X = test[sel]

use_model = 'CatBoostRegressor'

model = CatBoostRegressor( 
         loss_function='MAE',
         n_estimators=500, 
         learning_rate=0.1, 
         random_state=42)

model.fit(X, y)
pred = model.predict(test_X)

0:	learn: 0.6529681	total: 4.2ms	remaining: 2.1s
1:	learn: 0.5999176	total: 8.15ms	remaining: 2.03s
2:	learn: 0.5570551	total: 13.6ms	remaining: 2.26s
3:	learn: 0.5197002	total: 17.7ms	remaining: 2.2s
4:	learn: 0.4861916	total: 21ms	remaining: 2.08s
5:	learn: 0.4583342	total: 25.2ms	remaining: 2.08s
6:	learn: 0.4273082	total: 29ms	remaining: 2.04s
7:	learn: 0.3984806	total: 33.6ms	remaining: 2.07s
8:	learn: 0.3759088	total: 37ms	remaining: 2.02s
9:	learn: 0.3536922	total: 41.6ms	remaining: 2.04s
10:	learn: 0.3368792	total: 45.2ms	remaining: 2.01s
11:	learn: 0.3221285	total: 49.2ms	remaining: 2s
12:	learn: 0.3099161	total: 53.7ms	remaining: 2.01s
13:	learn: 0.2956594	total: 58.1ms	remaining: 2.02s
14:	learn: 0.2829457	total: 61.5ms	remaining: 1.99s
15:	learn: 0.2730411	total: 65.2ms	remaining: 1.97s
16:	learn: 0.2622000	total: 68.3ms	remaining: 1.94s
17:	learn: 0.2519463	total: 71.9ms	remaining: 1.92s
18:	learn: 0.2457931	total: 75.6ms	remaining: 1.91s
19:	learn: 0.2396410	total: 78.4ms

182:	learn: 0.0952387	total: 674ms	remaining: 1.17s
183:	learn: 0.0951295	total: 679ms	remaining: 1.17s
184:	learn: 0.0948969	total: 685ms	remaining: 1.17s
185:	learn: 0.0944173	total: 690ms	remaining: 1.16s
186:	learn: 0.0940758	total: 693ms	remaining: 1.16s
187:	learn: 0.0939453	total: 698ms	remaining: 1.16s
188:	learn: 0.0936873	total: 702ms	remaining: 1.16s
189:	learn: 0.0935364	total: 707ms	remaining: 1.15s
190:	learn: 0.0932713	total: 711ms	remaining: 1.15s
191:	learn: 0.0932147	total: 714ms	remaining: 1.15s
192:	learn: 0.0929792	total: 719ms	remaining: 1.14s
193:	learn: 0.0928691	total: 721ms	remaining: 1.14s
194:	learn: 0.0925971	total: 725ms	remaining: 1.13s
195:	learn: 0.0925545	total: 729ms	remaining: 1.13s
196:	learn: 0.0925150	total: 732ms	remaining: 1.13s
197:	learn: 0.0924381	total: 736ms	remaining: 1.12s
198:	learn: 0.0922702	total: 739ms	remaining: 1.12s
199:	learn: 0.0921353	total: 758ms	remaining: 1.14s
200:	learn: 0.0919557	total: 762ms	remaining: 1.13s
201:	learn: 

346:	learn: 0.0732900	total: 1.35s	remaining: 594ms
347:	learn: 0.0730352	total: 1.35s	remaining: 591ms
348:	learn: 0.0728526	total: 1.36s	remaining: 587ms
349:	learn: 0.0728265	total: 1.36s	remaining: 583ms
350:	learn: 0.0727656	total: 1.36s	remaining: 579ms
351:	learn: 0.0727466	total: 1.37s	remaining: 576ms
352:	learn: 0.0726975	total: 1.37s	remaining: 572ms
353:	learn: 0.0726775	total: 1.38s	remaining: 568ms
354:	learn: 0.0726335	total: 1.38s	remaining: 563ms
355:	learn: 0.0723940	total: 1.38s	remaining: 560ms
356:	learn: 0.0722989	total: 1.39s	remaining: 556ms
357:	learn: 0.0722136	total: 1.39s	remaining: 552ms
358:	learn: 0.0720430	total: 1.39s	remaining: 548ms
359:	learn: 0.0719695	total: 1.4s	remaining: 544ms
360:	learn: 0.0719512	total: 1.4s	remaining: 540ms
361:	learn: 0.0718976	total: 1.41s	remaining: 536ms
362:	learn: 0.0718271	total: 1.41s	remaining: 532ms
363:	learn: 0.0718016	total: 1.41s	remaining: 528ms
364:	learn: 0.0717753	total: 1.42s	remaining: 524ms
365:	learn: 0.

### 등록차량 log값 복구 및 단지코드별 등록차량 평균값 출력

In [134]:
test['등록차량수'] = np.expm1(pred)   # np.log1 복구
test['단지별차량수평균'] = test.groupby("단지코드")['등록차량수'].transform(np.mean)
test_new = test.drop_duplicates(['단지코드'], keep='first').reset_index() # 중복 제거
test_new

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,index,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,...,임대건물구분_lbl,지역_lbl,공급유형_lbl,자격유형_lbl,단지코드_lbl,qcut_총세대수,교통편의성,총세대수주차면수,등록차량수,단지별차량수평균
0,2896,,,,,,,,,,...,,,,,,,,,57.103876,
1,3064,C1156,1004.0,아파트,충청북도,행복주택,36.53,256.0,47.0,K,...,1.0,11.0,9.0,11.0,56.0,3.0,3.0,0.697211,499.187041,498.817825
2,3066,C2142,954.0,아파트,울산광역시,국민임대,26.96,268.0,41.0,H,...,1.0,10.0,1.0,8.0,372.0,3.0,5.0,0.809224,536.686909,523.339754
3,3070,C2153,362.0,아파트,전라남도,국민임대,27.82,88.0,20.0,H,...,1.0,12.0,1.0,8.0,376.0,0.0,3.0,0.953039,303.375818,316.044467
4,3074,C2186,924.0,아파트,대구광역시,국민임대,29.17,238.0,0.0,H,...,1.0,14.0,1.0,8.0,385.0,3.0,8.0,0.718615,953.0064,791.620582
5,3080,C1176,1486.0,아파트,충청북도,국민임대,26.58,199.0,55.0,H,...,1.0,11.0,1.0,8.0,63.0,4.0,3.0,1.004711,956.317019,940.248472
6,3088,C2446,2200.0,아파트,경기도,행복주택,16.95,12.0,32.0,N,...,1.0,3.0,9.0,14.0,473.0,4.0,5.0,0.713636,644.446149,714.905143
7,3100,C2586,90.0,아파트,제주특별자치도,행복주택,16.64,12.0,7.0,J,...,1.0,9.0,9.0,10.0,530.0,0.0,3.0,0.733333,76.152057,85.097225
8,3105,C2035,492.0,아파트,강원도,국민임대,29.53,132.0,24.0,A,...,1.0,5.0,1.0,1.0,331.0,1.0,1.0,1.058943,429.97562,467.827855
9,3110,C2020,40.0,아파트,부산광역시,행복주택,16.8,20.0,7.0,L,...,1.0,8.0,9.0,12.0,325.0,0.0,3.0,0.625,34.248367,35.539254


In [135]:
# 제출용 데이터 셋 처리 완료
add_dat = {'code':['C2675', 'C2335', 'C1327'],
           'num':['0', '0', '0']}
add_df = pd.DataFrame(add_dat)

In [136]:
sub_df = test_new[ ['단지코드', '단지별차량수평균']]
sub_df.columns = ['code', 'num']
sub_df = pd.concat([sub_df, add_df]).reset_index()
sub_df = sub_df.drop(['index'], axis=1)
sub_df

Unnamed: 0,code,num
0,,
1,C1156,498.817825
2,C2142,523.339754
3,C2153,316.044467
4,C2186,791.620582
5,C1176,940.248472
6,C2446,714.905143
7,C2586,85.097225
8,C2035,467.827855
9,C2020,35.539254


### 제출용 파일생성(년-월-일-시-분-초)

In [137]:
# 연월일 문자
day = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())

PATH = './Output/'
file_name = f'{day}_{use_model}.csv'
sub_df.to_csv(f'{PATH}{file_name}', index=False)

if file_name in os.listdir(PATH) :
    print(f'{file_name} 저장 완료')
else : 
    print(f'{file_name} 저장 실패')

2021-08-09-17-03-11_CatBoostRegressor.csv 저장 완료
