In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
import matplotlib.pyplot as plt
pd.options.display.float_format = '{:.3f}'.format

# 데이터 불러오기

In [3]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
car_2020 = pd.read_csv('./data/car_2020.csv',encoding='cp949')
submission = pd.read_csv('./data/sample_submission.csv')

# 컬렴명 바꿔주기

In [4]:
train.columns = ['단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '신분',
                 '임대보증금', '임대료', '지하철', '버스', '단지내주차면수', '등록차량수' ]


# 지하철 Y or N로 변경

In [5]:
train['지하철'] = train['지하철'].fillna(0)
train['지하철'] = np.where(train['지하철'] == 0, 0, 1)

# 결측치 처리

In [6]:
train.loc[train.임대보증금=='-', '임대보증금'] = np.nan
train.loc[train.임대료=='-', '임대료'] = np.nan
train['임대보증금'] = train['임대보증금'].astype(float)
train['임대료'] = train['임대료'].astype(float)
train['임대보증금'] = train['임대보증금'].fillna(0)
train['임대료'] = train['임대료'].fillna(0)

In [7]:
train['버스'] = train['버스'].fillna(train['버스'].mean())

# 임대건물구분 : 아파트->상가 / 전용면적별세대수 : x->1

In [8]:
idx = train[(train['임대건물구분']=='아파트') & (train['신분']=='D')]['전용면적별세대수'].index
train.loc[idx, '전용면적별세대수'] = 1
train.loc[idx, '임대건물구분'] = '상가'

# 지역명 숫자로 매핑

In [9]:
local_map = {}
for i, loc in enumerate(train['지역'].unique()):
    local_map[loc] = i
    
train['지역'] = train['지역'].map(local_map)

# 총세대수 scale

In [10]:
train['총세대수'] = np.log(train['총세대수'])

# 

In [11]:
train_copy = train.copy()

del train_copy['신분'], train_copy['단지코드'], train_copy['임대건물구분'], train_copy['공급유형']

# Model Fitting

In [12]:
x_train = train_copy.iloc[:, :-1]
y_train = train_copy.iloc[:,-1]

forest = RandomForestRegressor(n_jobs=-1, random_state=42)
forest.fit(x_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=42)

In [13]:
x_train.columns

Index(['총세대수', '지역', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '지하철', '버스',
       '단지내주차면수'],
      dtype='object')

# test 불러오기

In [14]:
test.columns = [ '단지코드', '총세대수', '임대건물구분', '지역', '공급유형', '전용면적', '전용면적별세대수', '공가수', '신분',
                '임대보증금', '임대료', '지하철', '버스', '단지내주차면수' ]

In [15]:
test.loc[test.임대보증금=='-', '임대보증금'] = np.nan
test.loc[test.임대료=='-', '임대료'] = np.nan
test['임대보증금'] = test['임대보증금'].fillna(0)
test['임대료'] = test['임대료'].fillna(0)
test['임대보증금'] = test['임대보증금'].astype(float)
test['임대료'] = test['임대료'].astype(float)

In [16]:
test['지하철'] = test['지하철'].fillna(0)
test['지하철'] = np.where(test['지하철'] == 0, 0, 1)

In [17]:
test.loc[test.단지코드.isin(['C2411']) & test.신분.isnull(), '신분'] = 'A'
test.loc[test.단지코드.isin(['C2253']) & test.신분.isnull(), '신분'] = 'C'

In [18]:
local_map = {}
for i, loc in enumerate(test['지역'].unique()):
    local_map[loc] = i
    
test['지역'] = test['지역'].map(local_map)

In [19]:
test = test[['단지코드', '총세대수', '지역', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '지하철', '버스', '단지내주차면수']]

In [20]:
print('단지코드의 Unique : ', len(test['단지코드'].unique()))
print('총세대수의 Unique : ', len(test['총세대수'].unique()))

단지코드의 Unique :  150
총세대수의 Unique :  144


In [21]:
# ['총세대수', '지역', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '지하철', '버스', '단지내주차면수']
test.head(10)

Unnamed: 0,단지코드,총세대수,지역,전용면적,전용면적별세대수,공가수,임대보증금,임대료,지하철,버스,단지내주차면수
0,C1072,754,0,39.79,116,14.0,22830000.0,189840.0,0,2.0,683.0
1,C1072,754,0,46.81,30,14.0,36048000.0,249930.0,0,2.0,683.0
2,C1072,754,0,46.9,112,14.0,36048000.0,249930.0,0,2.0,683.0
3,C1072,754,0,46.9,120,14.0,36048000.0,249930.0,0,2.0,683.0
4,C1072,754,0,51.46,60,14.0,43497000.0,296780.0,0,2.0,683.0
5,C1072,754,0,51.71,51,14.0,43497000.0,296780.0,0,2.0,683.0
6,C1072,754,0,51.96,198,14.0,43497000.0,296780.0,0,2.0,683.0
7,C1072,754,0,51.96,67,14.0,43497000.0,296780.0,0,2.0,683.0
8,C1128,1354,0,39.79,368,9.0,22830000.0,189840.0,0,3.0,1216.0
9,C1128,1354,0,39.79,30,9.0,22830000.0,189840.0,0,3.0,1216.0


In [24]:
fixedData_colName = ['단지코드', '총세대수', '지역', '공가수', '지하철', '버스', '단지내주차면수']
UnfixedData_colName = ['전용면적', '전용면적별세대수', '임대보증금', '임대료']
new_test = pd.DataFrame(columns=test.columns)

test_copy = test.copy()

for idx, danji_code in enumerate(test['단지코드'].unique()):
    
    temp_df = test_copy[test_copy['단지코드'] == danji_code]
    temp_df.index = range(temp_df.shape[0]) # index 재설정
    
    # columns name 가져와서 넣기
    for fixed_colName in fixedData_colName:
        new_test.loc[idx, fixed_colName] = temp_df.loc[0, fixed_colName]
    
    for unfixed_colName in UnfixedData_colName:
        if unfixed_colName == '전용면적별세대수': # 합
            temp_df[unfixed_colName] = temp_df[unfixed_colName].mean()
        else:                                   # 평균 : 전용면적, 임대보증금, 임대료
            temp_df[unfixed_colName] = temp_df[unfixed_colName].mean()
        
        new_test.loc[idx, unfixed_colName] = temp_df.loc[0, unfixed_colName]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Predict

In [27]:
del new_test['단지코드']
pred = forest.predict(new_test)

In [28]:
pred

array([ 619.23, 1171.64,  373.04,  382.26,  980.15, 1873.27, 1020.15,
        154.21,  155.05,  195.57,  156.97,  140.88,  184.22,  253.65,
        129.8 ,  236.91,  351.56,  274.41,  176.08,  718.87,  237.69,
        219.85,  222.69,  399.29,  184.41,  125.5 ,  125.47,  401.5 ,
        392.86,  409.76, 1005.48,   89.7 ,  363.06,  236.17,   90.33,
        271.5 ,  183.16,  416.94,  870.94,  245.03,  146.58,  180.54,
        284.87,  391.79,  618.87, 1347.91,  182.1 ,  368.53,  249.4 ,
        184.34,  653.05,  195.55, 1156.18,  562.15,  392.64,  201.25,
        390.21,  223.1 ,  145.69,   99.58,  248.58,  180.91, 1038.23,
        293.92,   98.1 ,  274.41,  389.63, 1073.54,  408.35,  411.56,
        522.62,  221.87,  399.31,  879.76,  883.96,  195.54,  575.22,
       1026.4 ,  518.06, 1029.84,  565.71, 1241.07,  233.39,  210.05,
        201.76,  316.57,  189.91,  217.12,  241.26, 1065.  ,  643.98,
        465.75,  204.9 ,  502.47, 1254.06, 1145.48,  414.43, 1204.81,
        693.84, 1070

In [29]:
submission['num'] = pred

In [30]:
submission

Unnamed: 0,code,num
0,C1072,619.230
1,C1128,1171.640
2,C1456,373.040
3,C1840,382.260
4,C1332,980.150
...,...,...
145,C2456,177.210
146,C1266,387.340
147,C2152,56.420
148,C1267,139.130


In [31]:
score111 = pd.read_csv('./신분OneHot.csv')

In [33]:
submission['Score111'] = score111['num']

In [38]:
pd.set_option('max_rows', 150)
submission['minus'] = submission['Score111'] - submission['num'] 
submission

Unnamed: 0,code,num,Score111,minus
0,C1072,619.23,775.55,156.32
1,C1128,1171.64,1325.31,153.67
2,C1456,373.04,569.19,196.15
3,C1840,382.26,530.39,148.13
4,C1332,980.15,1292.72,312.57
5,C1563,1873.27,1775.01,-98.26
6,C1794,1020.15,914.26,-105.89
7,C1640,154.21,539.61,385.4
8,C1377,155.05,364.64,209.59
9,C2072,195.57,290.66,95.09


In [40]:
submission['minus'].sum() / 150

107.97933333333334

In [None]:
# submission.to_csv('baseline.csv', index=False)