#### 모듈 로드

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/numpy warnings (deprecations,
# chained-assignment notices, invalid-value RuntimeWarnings) raised by the
# cells below — consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')

#### 트레인, 테스트, 지하철 학교 데이터 로드

In [3]:
# Read the raw train and test tables from the local data directory.
train_path, test_path = './data/train.csv', './data/test.csv'
train_raw = pd.read_csv(train_path)
test_raw = pd.read_csv(test_path)

#### 트레인 + 테스트 합치기

In [4]:
# Stack the train and test rows, order them by `key`, and use `key` as the index.
combined = pd.concat([train_raw, test_raw])
data = combined.sort_values(by='key').set_index('key')

#### 널 데이터 처리

In [6]:
# Quantitative features: replace missing values with the column median.
#
# The filled column is assigned back to `data` rather than calling
# `fillna(..., inplace=True)` on `data.<column>`: that form is chained
# assignment on an intermediate Series and does not update `data` when pandas
# copy-on-write is active.
numeric_columns = [
    'total_parking_capacity_in_site',
    'tallest_building_in_sites',
    'lowest_building_in_sites',
    'room_count',
    'bathroom_count',
]
for column in numeric_columns:
    data[column] = data[column].fillna(data[column].median())

In [7]:
# Qualitative features: mark missing values with the placeholder '-'.
#
# Assign the result back to `data` instead of using `inplace=True` on
# `data.<column>`; the chained in-place form does not modify `data` when
# pandas copy-on-write is active.
categorical_columns = ['heat_type', 'heat_fuel', 'front_door_structure']
for column in categorical_columns:
    data[column] = data[column].fillna('-')

#### Feature Engineering

* 거래기간 (날)

In [9]:
# Transaction period (days): last value minus first value of the
# '~'-separated `transaction_date` string.
def _transaction_span(date_range):
    """Return int(last part) - int(first part) of a '~'-separated string."""
    parts = date_range.split('~')
    return int(parts[-1]) - int(parts[0])


data['transaction_date_duration'] = data['transaction_date'].apply(_transaction_span)

* 거래일자

In [10]:
# Rebuild `transaction_date` as transaction_year_month * 100 + the integer
# after the last '~' of the original string, then drop the now-redundant
# year-month column.
# NOTE(review): presumably this yields a YYYYMMDD-style integer — confirm the
# format of `transaction_year_month`.
range_end = data['transaction_date'].apply(lambda v: v.rpartition('~')[2]).astype("int")
data['transaction_date'] = data['transaction_year_month'] * 100 + range_end
data.drop(columns=['transaction_year_month'], inplace=True)

* 주소정보

In [11]:
def get_addr(data):
    """Return the `addr` entry of `law_addr` for one row's legal-address code.

    `data` is a single row exposing `address_by_law`; that value is converted
    to `str` and used as the index label into the module-level `law_addr`
    table. A code missing from the table raises whatever `.loc` raises.
    """
    code = str(data.address_by_law)
    matched = law_addr.loc[code]
    return matched.addr

In [12]:
def get_gu(addr):
    """Return the district (second space-separated token) of an address.

    The district name "강서구" is returned with a city prefix so the two
    cases stay distinguishable: "서울강서구" when the first token is
    "서울특별시", otherwise "부산강서구". Every other district is returned
    unchanged.
    """
    parts = addr.split(" ")
    district = parts[1]
    if district != "강서구":
        return district
    return "서울강서구" if parts[0] == "서울특별시" else "부산강서구"

In [13]:
# Load the legal-address table (source: the legal address website).
# The file is parsed by hand as newline-separated rows of tab-separated fields.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm it matches the file's encoding.
with open('./data/law_addr.txt') as addr_file:
    raw_text = addr_file.read()

rows = [line.split('\t') for line in raw_text.split('\n')]
# Skip the first row (presumably the file's own header) and name the columns.
law_addr = pd.DataFrame(rows[1:])
law_addr.columns = ['law_addr', 'addr', 'existance']
# Index by code; dropna() discards rows with missing fields.
law_addr = law_addr.set_index('law_addr').dropna()

In [14]:
# Resolve each row's address, then reduce it to its district (gu/gun).
data['addr'] = data.apply(get_addr, axis='columns')
data['gu'] = data['addr'].map(get_gu)

In [15]:
# The full address was only needed to derive `gu`; remove it again.
data.drop(columns=['addr'], inplace=True)

#### FE - 지하철/학교 Common

In [22]:
# Reference tables used below; both are read for their `latitude` and
# `longitude` columns.
subway_path, school_path = './data/Subways.csv', './data/Schools.csv'
subway = pd.read_csv(subway_path)
school = pd.read_csv(school_path)

In [18]:
def deg2rad(data):
    """Convert degrees to radians as ``data * pi / 180``.

    Uses only ``*`` and ``/``; callers in this file pass both scalars and
    pandas Series.
    """
    scaled = np.pi * data
    return scaled / 180

In [19]:
def rad2deg(data):
    """Convert radians to degrees as ``data * 180 / pi``.

    Uses only ``*`` and ``/``; callers in this file pass pandas Series.
    """
    scaled = 180 * data
    return scaled / np.pi

In [20]:
# One row per distinct (apartment_id, latitude, longitude) combination.
# Rows are de-duplicated by turning them into tuples and collecting them in a
# set; the resulting frame gets a fresh 0..n-1 index.
loc_columns = ['apartment_id', 'latitude', 'longitude']
unique_rows = {tuple(row) for row in data[loc_columns].values}
loc = pd.DataFrame([list(row) for row in unique_rows], columns=loc_columns)

#### FE - 지하철

In [23]:
# Distance from every apartment complex in `loc` to every subway station,
# using the spherical law of cosines on (latitude, longitude) in degrees.
# `dist_subway[i]` holds the distances for row i of `loc`, one per station.
# NOTE(review): the factors 60 * 1.1515 * 1.609344 * 1000 presumably convert
# degrees of arc -> nautical miles -> statute miles -> km -> metres; confirm.

# Station-side terms do not depend on the apartment, so compute them once.
subway_lat_sin = np.sin(deg2rad(subway.latitude))
subway_lat_cos = np.cos(deg2rad(subway.latitude))

dist_subway = []

for lat1, lon1 in zip(loc.latitude, loc.longitude):
    cos_angle = (
        np.sin(deg2rad(lat1)) * subway_lat_sin
        + np.cos(deg2rad(lat1)) * subway_lat_cos * np.cos(deg2rad(lon1 - subway.longitude))
    )
    # Rounding can push the cosine marginally outside [-1, 1] (e.g. for
    # coincident points), which would make arccos return NaN; clamp it.
    cos_angle = cos_angle.clip(-1.0, 1.0)

    dist_subway.append(list((rad2deg(np.arccos(cos_angle)) * 60 * 1.1515 * 1.609344 * 1000).values))

In [24]:
# Distance from each apartment complex to each subway station:
# column `apartment_id` followed by one distance column per station.
subway_distances = pd.DataFrame(dist_subway)
df_subway = pd.concat([loc, subway_distances], axis=1).drop(columns=['latitude', 'longitude'])

In [28]:
# Per apartment complex (keyed by apartment_id):
#   dist_dic  -> array [label of the nearest station column, its distance]
#   dist_<r>  -> number of stations whose distance is below r
dist_dic = {}
dist_400 = {}
dist_800 = {}
dist_1200 = {}
dist_1600 = {}
dist_2000 = {}

# Maps each radius threshold to the dict that stores its per-apartment count.
radius_counts = {400: dist_400, 800: dist_800, 1200: dist_1200, 1600: dist_1600, 2000: dist_2000}

# Transpose once: every column is one apartment, row 0 holds its
# apartment_id and the remaining rows hold the station distances.
subway_by_apartment = df_subway.T

# Iterate over every apartment actually present instead of a fixed count.
for idx in range(subway_by_apartment.shape[1]):
    apartment_id = subway_by_apartment.iloc[0, idx]
    # Sort ascending once; after reset_index, column 0 is the station label
    # and column 1 the distance, so row 0 is the nearest station.
    ranked = subway_by_apartment.iloc[1:, idx].sort_values().to_frame().reset_index()
    dist_dic[apartment_id] = ranked.iloc[0].values
    distances = ranked.iloc[:, 1]
    for radius, counts in radius_counts.items():
        counts[apartment_id] = np.sum((distances < radius) * 1)

In [31]:
# One feature row per record of `data`, looked up by apartment_id (as float):
# [nearest-station label, nearest distance, count<400, count<800,
#  count<1200, count<1600, count<2000].
temp = np.array([
    [
        *dist_dic[key],
        dist_400[key],
        dist_800[key],
        dist_1200[key],
        dist_1600[key],
        dist_2000[key],
    ]
    for key in map(float, data.apartment_id)
])

In [44]:
# Copy the columns of `temp` into `data`, in positional order.
subway_columns = [
    'subway_idx',
    'subway_distance',
    'subway_distance_400',
    'subway_distance_800',
    'subway_distance_1200',
    'subway_distance_1600',
    'subway_distance_2000',
]
for position, column in enumerate(subway_columns):
    data[column] = temp[:, position]

#### FE - 학교

In [50]:
# Distance from every apartment complex in `loc` to every school, using the
# spherical law of cosines on (latitude, longitude) in degrees.
# `dist_school[i]` holds the distances for row i of `loc`, one per school.
# NOTE(review): the factors 60 * 1.1515 * 1.609344 * 1000 presumably convert
# degrees of arc -> nautical miles -> statute miles -> km -> metres; confirm.

# School-side terms do not depend on the apartment, so compute them once.
school_lat_sin = np.sin(deg2rad(school.latitude))
school_lat_cos = np.cos(deg2rad(school.latitude))

dist_school = []

for lat1, lon1 in zip(loc.latitude, loc.longitude):
    cos_angle = (
        np.sin(deg2rad(lat1)) * school_lat_sin
        + np.cos(deg2rad(lat1)) * school_lat_cos * np.cos(deg2rad(lon1 - school.longitude))
    )
    # Rounding can push the cosine marginally outside [-1, 1] (e.g. for
    # coincident points), which would make arccos return NaN; clamp it.
    cos_angle = cos_angle.clip(-1.0, 1.0)

    dist_school.append(list((rad2deg(np.arccos(cos_angle)) * 60 * 1.1515 * 1.609344 * 1000).values))

In [51]:
# Distance from each apartment complex to each school:
# column `apartment_id` followed by one distance column per school.
school_distances = pd.DataFrame(dist_school)
df_school = pd.concat([loc, school_distances], axis=1).drop(columns=['latitude', 'longitude'])

In [52]:
# Per apartment complex (keyed by apartment_id):
#   dist_dic  -> array [label of the nearest school column, its distance]
#   dist_<r>  -> number of schools whose distance is below r
# These names are rebound here, replacing any earlier contents.
dist_dic = {}
dist_400 = {}
dist_800 = {}
dist_1200 = {}
dist_1600 = {}
dist_2000 = {}

# Maps each radius threshold to the dict that stores its per-apartment count.
radius_counts = {400: dist_400, 800: dist_800, 1200: dist_1200, 1600: dist_1600, 2000: dist_2000}

# Transpose once: every column is one apartment, row 0 holds its
# apartment_id and the remaining rows hold the school distances.
school_by_apartment = df_school.T

# Iterate over every apartment actually present instead of a fixed count.
for idx in range(school_by_apartment.shape[1]):
    apartment_id = school_by_apartment.iloc[0, idx]
    # Sort ascending once; after reset_index, column 0 is the school label
    # and column 1 the distance, so row 0 is the nearest school.
    ranked = school_by_apartment.iloc[1:, idx].sort_values().to_frame().reset_index()
    dist_dic[apartment_id] = ranked.iloc[0].values
    distances = ranked.iloc[:, 1]
    for radius, counts in radius_counts.items():
        counts[apartment_id] = np.sum((distances < radius) * 1)

In [53]:
# One feature row per record of `data`, looked up by apartment_id (as float):
# [nearest-school label, nearest distance, count<400, count<800,
#  count<1200, count<1600, count<2000].
temp = np.array([
    [
        *dist_dic[key],
        dist_400[key],
        dist_800[key],
        dist_1200[key],
        dist_1600[key],
        dist_2000[key],
    ]
    for key in map(float, data.apartment_id)
])

In [54]:
# Copy the columns of `temp` into `data`, in positional order.
school_columns = [
    'school_idx',
    'school_distance',
    'school_distance_400',
    'school_distance_800',
    'school_distance_1200',
    'school_distance_1600',
    'school_distance_2000',
]
for position, column in enumerate(school_columns):
    data[column] = temp[:, position]

In [56]:
# Persist the engineered table; the index is written with the default settings.
data.to_csv(path_or_buf='./data/data.csv')