In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:.3f}'.format

# 데이터 불러오기

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
car_2020 = pd.read_csv('./data/car_2020.csv',encoding='cp949')
submission = pd.read_csv('./data/sample_submission.csv')

# 컬렴명 바꿔주기

In [4]:
train.columns = ['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '신분',
                 '임대보증금', '임대료', '지하철', '버스', '단지내주차면수', '등록차량수' ]


# 지하철 Y or N로 변경

In [5]:
train['지하철'] = train['지하철'].fillna(0)
train['지하철'] = np.where(train['지하철'] == 0, 0, 1)

# 결측치 처리

In [6]:
train.loc[train.임대보증금=='-', '임대보증금'] = np.nan
train.loc[train.임대료=='-', '임대료'] = np.nan
train['임대보증금'] = train['임대보증금'].astype(float)
train['임대료'] = train['임대료'].astype(float)
train['임대보증금'] = train['임대보증금'].fillna(0)
train['임대료'] = train['임대료'].fillna(0)

In [7]:
train['버스'] = train['버스'].fillna(train['버스'].mean())

# 임대건물구분 : 아파트->상가 / 전용면적별세대수 : x->1

In [8]:
idx = train[(train['임대건물구분']=='아파트') & (train['신분']=='D')]['전용면적별세대수'].index
train.loc[idx, '전용면적별세대수'] = 1
train.loc[idx, '임대건물구분'] = '상가'

# 지역명 숫자로 매핑

In [9]:
local_map = {}
for i, loc in enumerate(train['지역'].unique()):
    local_map[loc] = i
    
train['지역'] = train['지역'].map(local_map)

# 총세대수 scale

In [10]:
train['총세대수'] = np.log(train['총세대수'])

# 

In [11]:
train_copy = train.copy()

del train_copy['신분'], train_copy['단지코드'], train_copy['임대건물구분'], train_copy['공급유형']

# Model Fitting

In [12]:
x_train = train_copy.iloc[:, :-1]
y_train = train_copy.iloc[:,-1]

forest = RandomForestRegressor(n_jobs=-1, random_state=42)
forest.fit(x_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=42)

In [13]:
x_train.columns

Index(['총세대수', '지역', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '지하철', '버스',
       '단지내주차면수'],
      dtype='object')

# test 불러오기

In [14]:
test.columns = [ '단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '신분',
                '임대보증금', '임대료', '지하철', '버스', '단지내주차면수' ]

In [15]:
test.loc[test.임대보증금=='-', '임대보증금'] = np.nan
test.loc[test.임대료=='-', '임대료'] = np.nan
test['임대보증금'] = test['임대보증금'].fillna(0)
test['임대료'] = test['임대료'].fillna(0)
test['임대보증금'] = test['임대보증금'].astype(float)
test['임대료'] = test['임대료'].astype(float)

In [16]:
test['지하철'] = test['지하철'].fillna(0)
test['지하철'] = np.where(test['지하철'] == 0, 0, 1)

In [17]:
test.loc[test.단지코드.isin(['C2411']) & test.신분.isnull(), '신분'] = 'A'
test.loc[test.단지코드.isin(['C2253']) & test.신분.isnull(), '신분'] = 'C'

In [18]:
local_map = {}
for i, loc in enumerate(test['지역'].unique()):
    local_map[loc] = i
    
test['지역'] = test['지역'].map(local_map)

In [19]:
test = test[['단지코드', '총세대수', '지역', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '지하철', '버스', '단지내주차면수']]

In [20]:
print('단지코드의 Unique : ', len(test['단지코드'].unique()))
print('총세대수의 Unique : ', len(test['총세대수'].unique()))

단지코드의 Unique :  150
총세대수의 Unique :  144


In [24]:
# ['총세대수', '지역', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '지하철', '버스', '단지내주차면수']
test.head(10)

Unnamed: 0,단지코드,총세대수,지역,전용면적,전용면적별세대수,공가수,임대보증금,임대료,지하철,버스,단지내주차면수
0,C1072,754,0,39.79,116,14.0,22830000.0,189840.0,0,2.0,683.0
1,C1072,754,0,46.81,30,14.0,36048000.0,249930.0,0,2.0,683.0
2,C1072,754,0,46.9,112,14.0,36048000.0,249930.0,0,2.0,683.0
3,C1072,754,0,46.9,120,14.0,36048000.0,249930.0,0,2.0,683.0
4,C1072,754,0,51.46,60,14.0,43497000.0,296780.0,0,2.0,683.0
5,C1072,754,0,51.71,51,14.0,43497000.0,296780.0,0,2.0,683.0
6,C1072,754,0,51.96,198,14.0,43497000.0,296780.0,0,2.0,683.0
7,C1072,754,0,51.96,67,14.0,43497000.0,296780.0,0,2.0,683.0
8,C1128,1354,0,39.79,368,9.0,22830000.0,189840.0,0,3.0,1216.0
9,C1128,1354,0,39.79,30,9.0,22830000.0,189840.0,0,3.0,1216.0


In [27]:
fixedData_colName = ['단지코드', '총세대수', '지역', '공가수', '지하철', '버스', '단지내주차면수']
UnfixedData_colName = ['전용면적', '전용면적별세대수', '임대보증금', '임대료']
new_test = pd.DataFrame(columns=test.columns)

test_copy = test.copy()

for idx, danji_code in enumerate(test['단지코드'].unique()):
    
    temp_df = test_copyp[test_copy['단지코드'] == danji_code]
    
    # columns name 가져와서 넣기
    for fixed_colName in fixedData_colName:
        new_test.loc[inx, fixed_colName] = temp_df[fixed_colName][0]

KeyError: 'C1072'

# Predict

In [21]:
pred = forest.predict(x_test)

NameError: name 'x_test' is not defined

In [None]:
pred

In [None]:
submission['num'] = pred

In [None]:
submission

In [None]:
# submission.to_csv('baseline.csv', index=False)