In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, RobustScaler
pd.options.display.float_format = '{:.3f}'.format

# 데이터 불러오기

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
car_2020 = pd.read_csv('data/car_2020.csv',encoding='cp949')
age = pd.read_csv('data/age_gender_info.csv')
submission = pd.read_csv('data/sample_submission.csv')

# 필요한 컬럼만 가져오기

In [4]:
train = train[ ['단지코드', '총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '단지내주차면수', '등록차량수'] ]

# 결측치 처리

In [5]:
train.loc[train.임대보증금=='-', '임대보증금'] = np.nan
train.loc[train.임대료=='-', '임대료'] = np.nan
train['임대보증금'] = train['임대보증금'].astype(float)
train['임대료'] = train['임대료'].astype(float)
train['임대보증금'] = train['임대보증금'].fillna(0)
train['임대료'] = train['임대료'].fillna(0)

In [6]:
pd.DataFrame(train.corr()['등록차량수'].abs().sort_values(ascending=False))

Unnamed: 0,등록차량수
등록차량수,1.0
단지내주차면수,0.859
임대료,0.52
임대보증금,0.436
총세대수,0.317
전용면적별세대수,0.247
공가수,0.118
전용면적,0.115


# 전용면적을 5의 배수로 변경

In [7]:
train['전용면적'] = np.round(train['전용면적'], 0)
train['전용면적'] = train['전용면적']//5*5

# 전용면적 상/하한 적용

In [8]:
idx = train[train['전용면적']>100].index
train.loc[idx, '전용면적'] = 100

idx = train[train['전용면적']<15].index
train.loc[idx, '전용면적'] = 15

# test 불러오기

In [9]:
test = test[ ['단지코드', '총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '단지내주차면수'] ]

In [10]:
test.loc[test.임대보증금=='-', '임대보증금'] = np.nan
test.loc[test.임대료=='-', '임대료'] = np.nan
test['임대보증금'] = test['임대보증금'].fillna(0)
test['임대료'] = test['임대료'].fillna(0)
test['임대보증금'] = test['임대보증금'].astype(float)
test['임대료'] = test['임대료'].astype(float)

In [11]:
test['전용면적'] = np.round(test['전용면적'], 0)
test['전용면적'] = test['전용면적']//5*5

idx = test[test['전용면적']>100].index
test.loc[idx, '전용면적'] = 100

idx = test[test['전용면적']<15].index
test.loc[idx, '전용면적'] = 15

In [12]:
columns = ['총세대수', '전용면적', '전용면적별세대수', '공가수', '임대보증금', '임대료', '단지내주차면수']
target = '등록차량수'
area_columns = []
for area in train['전용면적'].unique():
    area_columns.append(f'면적_{area}')

In [13]:
new_train = pd.DataFrame()
new_test = pd.DataFrame()

new_train['임대보증금'] = 0
new_train['임대료'] = 0

# 전용면적 / 최빈값
# 전용면적별세대수, 임대보증금, 임대료 / 평균값

In [14]:
# Train Dataframe Set
# 미사용 열s - 임대건물구분, 공급유형, 자격유형 임대보증금, 임대료
for i, code in tqdm(enumerate(train['단지코드'].unique())):
    temp = train[train['단지코드']==code]
    temp.index = range(temp.shape[0]) # index 재설정
    for col in columns:
        new_train.loc[i, col] = temp.loc[0, col] # Fixed Data
    
    deposit_cost = []
    rental_cost = []

    for col in area_columns:
        area = float(col.split('_')[-1]) # 면적의 숫자를 float형으로 저장
        # '면적_10' 등의 열을 만들고 - 같은 전용면적들 끼리 전용면적별세대수의 합을 Value로 Cell에 입력
        new_train.loc[i, col] = temp[temp['전용면적']==area]['전용면적별세대수'].sum()
        
        try:
          deposit_cost.append(temp[temp['전용면적']==area]['임대보증금'][0])
          rental_cost.append(temp[temp['전용면적']==area]['임대료'][0])
        except KeyError as e:
          continue
    
    new_train.loc[i, '등록차량수'] = temp.loc[0, '등록차량수']
    
    new_train.loc[i, '임대보증금'] = np.mean(float(np.array(deposit_cost)))
    new_train.loc[i, '임대료'] = np.mean(float(np.array(rental_cost)))

# Test DataFrame Set
# 미사용 열s - 임대건물구분, 공급유형, 자격유형 임대보증금, 임대료, 등록차량수
for i, code in tqdm(enumerate(test['단지코드'].unique())):
    temp = test[test['단지코드']==code]
    temp.index = range(temp.shape[0])
    for col in columns:
        new_test.loc[i, col] = temp.loc[0, col]

    deposit_cost = []
    rental_cost = []
    
    for col in area_columns:
        area = float(col.split('_')[-1])
        new_test.loc[i, col] = temp[temp['전용면적']==area]['전용면적별세대수'].sum()
        
        try:
          deposit_cost.append(temp[temp['전용면적']==area]['임대보증금'][0])
          rental_cost.append(temp[temp['전용면적']==area]['임대료'][0])
        except KeyError as e:
          continue
          
    new_test.loc[i, '임대보증금'] = np.mean(float(np.array(deposit_cost)))
    new_test.loc[i, '임대료'] = np.mean(float(np.array(rental_cost)))

423it [00:07, 57.86it/s]
150it [00:02, 58.96it/s]


In [15]:
pd.set_option('max_columns', 30)
new_train.head(2)

Unnamed: 0,임대보증금,임대료,총세대수,전용면적,전용면적별세대수,공가수,단지내주차면수,면적_40.0,면적_50.0,면적_60.0,면적_30.0,면적_45.0,면적_35.0,면적_25.0,면적_70.0,면적_15.0,면적_20.0,면적_55.0,면적_100.0,면적_75.0,면적_80.0,면적_85.0,면적_65.0,등록차량수
0,15667000.0,103680.0,900.0,40.0,134.0,38.0,1425.0,149.0,665.0,86.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1015.0
1,9216000.0,82940.0,545.0,30.0,276.0,17.0,624.0,80.0,132.0,0.0,276.0,57.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,205.0


# Training / Test Data Split

In [16]:
new_train_copy = new_train.copy()
new_test_copy = new_test.copy()

new_train_copy = new_train_copy[new_train_copy['임대보증금'] != 0 ]
new_train_copy = new_train_copy[new_train_copy['임대료'] != 0 ]

# del new_train_copy['전용면적별세대수']
# del new_test_copy['전용면적별세대수']
del new_train_copy['전용면적']
del new_test_copy['전용면적']

x_train = new_train_copy.iloc[:, :-1]
y_train = new_train_copy.iloc[:,-1]
x_test = new_test_copy

# Training Data Scaling

In [17]:
x_train.columns

Index(['임대보증금', '임대료', '총세대수', '전용면적별세대수', '공가수', '단지내주차면수', '면적_40.0',
       '면적_50.0', '면적_60.0', '면적_30.0', '면적_45.0', '면적_35.0', '면적_25.0',
       '면적_70.0', '면적_15.0', '면적_20.0', '면적_55.0', '면적_100.0', '면적_75.0',
       '면적_80.0', '면적_85.0', '면적_65.0'],
      dtype='object')

In [18]:
cols = x_train.columns

# log scale을 해주는 이유
# - 너무 큰 값을 log scale해주면 minmaxscaler를 쓸 때 4분위수가 잘 나눠지기 때문임
# log를 할 때 0인 값이 있으면 ValueError가 발생함

x_train['총세대수'] = np.where(x_train['총세대수'] == 0, 0, np.log(x_train['총세대수']))
x_train['임대료'] = np.where(x_train['임대료'] == 0, 0, np.log(x_train['임대료']))
x_train['임대보증금'] = np.where(x_train['임대보증금'] == 0, 0, np.log(x_train['임대보증금']))
# x_train['면적_15.0'] = np.where(x_train['면적_15.0'] == 0, 0, np.log(x_train['면적_15.0']))
# x_train['면적_25.0'] = np.where(x_train['면적_25.0'] == 0, 0, np.log(x_train['면적_25.0']))
# x_train['면적_30.0'] = np.where(x_train['면적_30.0'] == 0, 0, np.log(x_train['면적_30.0']))
# x_train['면적_55.0'] = np.where(x_train['면적_55.0'] == 0, 0, np.log(x_train['면적_55.0']))
# x_train['면적_65.0'] = np.where(x_train['면적_65.0'] == 0, 0, np.log(x_train['면적_65.0']))
# x_train['면적_75.0'] = np.where(x_train['면적_75.0'] == 0, 0, np.log(x_train['면적_75.0']))
# x_train['면적_85.0'] = np.where(x_train['면적_85.0'] == 0, 0, np.log(x_train['면적_85.0']))

# scaler = MinMaxScaler()
# scaler = StandardScaler()
# scaler = RobustScaler()
# scaler.fit(x_train)

# scaler.transform의 DataType이 numpy.array type임 
# x_train_scaled = pd.DataFrame(data=scaler.transform(x_train), columns=cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


### 총세대수, 면적 15/25/30/55/65/75/85

In [19]:
# x_train_scaled = x_train_scaled[ ['총세대수', '공가수', '임대보증금', '임대료',	'단지내주차면수'] ]
# x_test = x_test[ ['총세대수', '공가수', '임대보증금', '임대료',	'단지내주차면수'] ]

# Create Model & Fitting

In [20]:
from sklearn.linear_model import LinearRegression
import xgboost

# xgb = xgboost.XGBRegressor()
# xgb.fit(x_train_scaled, y_train)
# forest = LinearRegression()
forest = RandomForestRegressor(n_jobs=-1, random_state=42)
forest.fit(x_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=42)

# Predict

In [21]:
pred = forest.predict(x_test)

In [22]:
submission['num'] = pred

In [23]:
pred

array([ 825.35, 1293.67,  610.31,  609.96, 1249.13, 1540.1 , 1187.79,
        532.57,  410.85,  212.34,  498.11,  406.34,  449.3 ,  210.64,
        389.69,  354.78,  473.51,  199.09,  167.58,  996.89,  235.75,
        481.73,  579.2 ,  624.39,  497.57,  139.16,  138.87,  693.54,
        638.86,  568.93, 1188.62,  139.07,  500.93,  252.47,  104.96,
        254.74,  481.74,  638.66,  972.73,  319.43,  476.03,  500.37,
        482.17,  514.35,  862.41, 1430.42,  482.19,  619.05,  395.15,
        404.78,  916.01,  211.93, 1318.38,  762.63,  608.68,  286.26,
        629.  ,  255.81,  494.94,  109.58,  391.26,  524.13,  898.89,
        444.56,   77.41,  210.43,  634.68, 1165.75,  626.99,  580.54,
        907.38,  254.41,  578.9 ,  885.13, 1094.9 ,  447.37,  782.64,
       1123.01,  894.75,  995.61,  940.71, 1311.87,  292.54,  220.03,
        330.61,  205.1 ,  232.43,  418.4 ,  203.62, 1163.35,  994.99,
        789.43,  216.9 ,  794.02, 1304.28, 1293.25,  618.21, 1253.48,
       1033.39, 1081

In [24]:
submission

Unnamed: 0,code,num
0,C1072,825.350
1,C1128,1293.670
2,C1456,610.310
3,C1840,609.960
4,C1332,1249.130
...,...,...
145,C2456,202.030
146,C1266,611.390
147,C2152,45.160
148,C1267,474.820


In [25]:
submission.to_csv('./scale.csv', index=False)