In [51]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [52]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.float_format = '{:.5f}'.format

# 데이터 불러오기

In [73]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
car_2020 = pd.read_csv('data/car_2020.csv',encoding='cp949')
age = pd.read_csv('data/age_gender_info.csv')
submission = pd.read_csv('data/sample_submission.csv')

In [74]:
train[train['임대보증금']=='-']

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
2547,C1326,1934,아파트,부산광역시,국민임대,24.72,472,43.0,H,-,-,0.0,4.0,1670.0,1153.0
2548,C1326,1934,아파트,부산광역시,국민임대,24.79,104,43.0,H,-,-,0.0,4.0,1670.0,1153.0
2549,C1326,1934,아파트,부산광역시,국민임대,26.83,590,43.0,H,-,-,0.0,4.0,1670.0,1153.0
2550,C1326,1934,아파트,부산광역시,국민임대,37.7,464,43.0,H,-,-,0.0,4.0,1670.0,1153.0
2551,C1326,1934,아파트,부산광역시,국민임대,46.94,304,43.0,H,-,-,0.0,4.0,1670.0,1153.0
2680,C1786,480,아파트,강원도,행복주택,16.91,156,25.0,K,-,-,0.0,3.0,338.0,345.0
2681,C1786,480,아파트,강원도,행복주택,26.9,136,25.0,K,-,-,0.0,3.0,338.0,345.0
2682,C1786,480,아파트,강원도,행복주택,26.9,72,25.0,K,-,-,0.0,3.0,338.0,345.0
2683,C1786,480,아파트,강원도,행복주택,26.9,24,25.0,K,-,-,0.0,3.0,338.0,345.0
2906,C2186,924,아파트,대구광역시,국민임대,29.17,238,0.0,H,-,-,0.0,8.0,664.0,744.0


In [54]:
train.columns

Index(['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수',
       '자격유형', '임대보증금', '임대료', '도보 10분거리 내 지하철역 수(환승노선 수 반영)',
       '도보 10분거리 내 버스정류장 수', '단지내주차면수', '등록차량수'],
      dtype='object')

In [55]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2952 entries, 0 to 2951
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   단지코드                          2952 non-null   object 
 1   총세대수                          2952 non-null   int64  
 2   임대건물구분                        2952 non-null   object 
 3   지역                            2952 non-null   object 
 4   공급유형                          2952 non-null   object 
 5   전용면적                          2952 non-null   float64
 6   전용면적별세대수                      2952 non-null   int64  
 7   공가수                           2952 non-null   float64
 8   자격유형                          2952 non-null   object 
 9   임대보증금                         2383 non-null   object 
 10  임대료                           2383 non-null   object 
 11  도보 10분거리 내 지하철역 수(환승노선 수 반영)  2741 non-null   float64
 12  도보 10분거리 내 버스정류장 수            2948 non-null   float64
 13  단지내

# 수치형 컬럼만 가져오기

In [56]:
train = train[['총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '단지내주차면수', '등록차량수']]

In [57]:
train.columns

Index(['총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '단지내주차면수', '등록차량수'], dtype='object')

# 결측치 처리

In [58]:
train.loc[train.임대보증금=='-', '임대보증금'] = np.nan
train.loc[train.임대료=='-', '임대료'] = np.nan
train['임대보증금'] = train['임대보증금'].astype(float)
train['임대료'] = train['임대료'].astype(float)
train['임대보증금'] = train['임대보증금'].fillna(0)
train['임대료'] = train['임대료'].fillna(0)

# 임대건물구분 : 아파트->상가 / 전용면적별세대수 : x->1

In [59]:
# 점수를 대폭 줄였던 부분
# idx = train[(train['임대건물구분']=='아파트') & (train['D']=='D')]['전용면적별세대수'].index
# train.loc[idx, '전용면적별세대수'] = 1
# train.loc[idx, '임대건물구분'] = '상가'

# 전용면적 상/하한 적용

In [60]:
idx = train[train['전용면적']>100].index
train.loc[idx, '전용면적'] = 100

idx = train[train['전용면적']<15].index
train.loc[idx, '전용면적'] = 15

# Scaling

In [70]:
from sklearn.preprocessing import MinMaxScaler

train_copy = train.copy()
scaler = MinMaxScaler()
scaler.fit(train_copy)
train_copy_scaled = scaler.transform(train_copy)

# Test Data 전처리

In [62]:
test = test[['총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '단지내주차면수']]

In [63]:
test.loc[test['임대보증금']=='-', '임대보증금'] = np.nan
test.loc[test['임대료']=='-', '임대료'] = np.nan
test['임대보증금'] = test['임대보증금'].fillna(0)
test['임대료'] = test['임대료'].fillna(0)
test['임대보증금'] = test['임대보증금'].astype(float)
test['임대료'] = test['임대료'].astype(float)

In [64]:
idx = test[test['전용면적']>100].index
test.loc[idx, '전용면적'] = 100

idx = test[test['전용면적']<15].index
test.loc[idx, '전용면적'] = 15

In [65]:
train.head(2)

Unnamed: 0,총세대수,전용면적,전용면적별세대수,공가수,임대보증금,임대료,단지내주차면수,등록차량수
0,900,39.72,134,38.0,15667000.0,103680.0,1425.0,1015.0
1,900,39.72,15,38.0,15667000.0,103680.0,1425.0,1015.0


# Training / Test Data Split

In [66]:
x_train = new_train.iloc[:, :-1]
y_train = new_train.iloc[:,-1]
x_test = new_test

NameError: name 'new_train' is not defined

# Model 학습

In [None]:
forest = RandomForestRegressor(n_jobs=-1, random_state=42)
forest.fit(x_train, y_train)

# Prediction

In [None]:
pred = forest.predict(x_test)

In [None]:
submission['num'] = pred

# Save file

In [None]:
# submission.to_csv('./신분OneHot.csv', index=False)