In [69]:
from numpy import sin, cos, arccos, pi, round

def rad2deg(radians):
    degrees = radians * 180 / pi
    return degrees

def deg2rad(degrees):
    radians = degrees * pi / 180
    return radians

def getDistanceBetweenPointsNew(latitude1, longitude1, latitude2, longitude2, unit = 'miles'):
    
    theta = longitude1 - longitude2
    
    distance = 60 * 1.1515 * rad2deg(
        arccos(
            (sin(deg2rad(latitude1)) * sin(deg2rad(latitude2))) + 
            (cos(deg2rad(latitude1)) * cos(deg2rad(latitude2)) * cos(deg2rad(theta)))
        )
    )
    
    if unit == 'miles':
        return round(distance, 2)
    if unit == 'km':
        return round(distance * 1.609344, 2)

In [70]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [214]:
# 실거래 데이터 불러오기

data_df=pd.read_csv('./data/전처리 완료 데이터/전세_실거래_데이터_완성본.csv')
data_df=data_df[data_df['bath_cnt']!='-']
data_df.reset_index(drop=True, inplace=True)
data_df=data_df.astype({'room_cnt' : 'int', 'bath_cnt' : 'int'})
data_df.columns

Index(['주소', '단지명', '전용면적(㎡)', '계약년월', '계약일', '보증금(만원)', '층', '건축년도', '도로명',
       'latitude', 'longitude', 'home_cnt', 'home_size', 'room_cnt',
       'bath_cnt', 'edu', 'bus', 'sub_dist', '1호선', '2호선', '3호선', '4호선', '5호선',
       '6호선', '7호선', '8호선', '9호선'],
      dtype='object')

In [145]:
# 지하철역 데이터 가공
subway_df=pd.read_csv('./data/전처리 완료 데이터/서울_지하철역_위치정보.csv')
subway_df.drop('Unnamed: 0', axis=1, inplace=True)

station=[]
dis_value=[]

for i in tqdm(data_df.index) :
    lat = data_df.loc[i, ['latitude','longitude']].astype('float64')['latitude'] 
    long = data_df.loc[i, ['latitude','longitude']].astype('float64')['longitude']
    distance=getDistanceBetweenPointsNew( lat, long, 
                                subway_df[['latitude','longitude']].astype('float64')['latitude'], 
                                subway_df[['latitude','longitude']].astype('float64')['longitude'], unit= 'km')
    station.append( list (set( list( subway_df.loc[distance.loc[(distance>=distance.min())&(distance<=distance.min()+0.5)].index, 'line'] ) ) ) )
    dis_value.append( distance.loc[(distance>=distance.min())&(distance<=distance.min()+0.5)].min() )
    
station_index=[]
for data in station:
    lst=[0 for i in range(9)]
    for i in data:
        lst[i-1]=1
    station_index.append(lst)   

station_data=pd.DataFrame(station_index)
station_data.columns=[str(i)+'호선' for i in range(1,10)]
station_data.to_csv('./data/전처리 완료 데이터/station_info.csv', index=False)

pd.DataFrame(dis_value, columns=['sub_dist']).to_csv('./data/전처리 완료 데이터/station_distance.csv', index=False)

  data_df=pd.read_csv('./data/전세_실거래_데이터_완성본.csv', encoding='cp949')
100%|█████████████████████████████████████████████████████████████████████████| 578955/578955 [57:10<00:00, 168.78it/s]


In [137]:
# 버스 데이터 가공(아파트 데이터는 위의 data_df 사용)

bus_df=pd.read_csv("./data/전처리 완료 데이터/서울시 버스 위치정보_전처리.csv")
bus_df.drop(['Unnamed: 0'], axis=1, inplace=True)
bus_station=[]

for i in tqdm(data_df.index) :
    lat = data_df.loc[i, ['latitude','longitude']].astype('float64')['latitude'] 
    long = data_df.loc[i, ['latitude','longitude']].astype('float64')['longitude']
    distance=getDistanceBetweenPointsNew( lat, long, 
                                bus_df[['latitude','longitude']].astype('float64')['latitude'], 
                                bus_df[['latitude','longitude']].astype('float64')['longitude'], unit= 'km')
    if distance.min()<=0.1:
        bus_station.append(1)
    else:
        bus_station.append(0)    

pd.DataFrame(bus_station, columns=['bus']).to_csv('./data/전처리 완료 데이터/bus_station.csv', index=False) 

  0%|                                                                            | 469/578955 [00:02<58:36, 164.51it/s]


KeyboardInterrupt: 

In [215]:
# 공원 데이터 가공(아파트 데이터는 위의 data_df 사용)
park_df=pd.read_csv("./data/전처리 완료 데이터/서울시 주요 공원현황_전처리.csv", encoding='cp949')
park_df.drop(['Unnamed: 0'], axis=1, inplace=True)

park=[]

for i in tqdm(data_df.index) :
    lat = data_df.loc[i, ['latitude','longitude']].astype('float64')['latitude'] 
    long = data_df.loc[i, ['latitude','longitude']].astype('float64')['longitude']
    distance=getDistanceBetweenPointsNew( lat, long, 
                                park_df[['위도','경도']].astype('float64')['위도'], 
                                park_df[['위도','경도']].astype('float64')['경도'], unit= 'km')
    if distance.min()<=0.5:
        if park_df.loc[ list(distance).index(min(list(distance))), '한강공원' ]==1:
            park.append([1,1])
        else:
            park.append([1,0])
        
    else:
        park.append([0,0])    

pd.DataFrame(park, columns=['park', 'han_park']).to_csv('./data/전처리 완료 데이터/park_info.csv', index=False) 

100%|█████████████████████████████████████████████████████████████████████████| 521282/521282 [42:20<00:00, 205.18it/s]


In [216]:
### 군집 데이터 가공(아파트 데이터는 위의 data_df 사용)

from sklearn.cluster import KMeans

# 군집 생성
df=pd.read_csv("C:/Users/jhgf/Desktop/multicampus/multicampus/조별과제 1차/프로젝트 파일/스타벅스 데이터를 이용한 아파트 가격 예측/data/coffee_data/상가업소_202112_01.csv", encoding='cp949' )
df.dropna(subset='상호명', inplace=True)
columns = ['상호명', '상권업종대분류명', '상권업종중분류명', '상권업종소분류명', 
       '시도명', '시군구명', '행정동명', '법정동명', '도로명주소', 
       '경도', '위도']
df_tmp=df[columns].copy()
df_seoul=df_tmp[df_tmp['시도명'] =='서울특별시'].copy()
df_seoul["상호명_소문자"] = df_seoul['상호명'].str.lower()
df_cafe=df_seoul[ df_seoul['상호명_소문자'].str.contains('스타벅스|starbucks') ]
df_cafe=df_cafe[ df_cafe['상권업종중분류명']=='커피점/카페']
coffee=[]
for data in df_cafe['상호명_소문자']:
    if '스타벅스' in data or 'starbucks' in data:
        coffee.append('스타벅스')
    else:
        coffee.append('이디야')
df_cafe['브랜드명']=coffee
df_star=df_cafe[df_cafe['브랜드명']=='스타벅스']
df_star.reset_index(drop=True, inplace=True)

df_loc=df_star[['위도', '경도']]

cluster = KMeans(n_clusters = 5, max_iter=500, random_state=0)
cluster_labels = cluster.fit_predict(df_loc)
df_loc['cluster']=cluster_labels
center=cluster.cluster_centers_

# 군집 중심을 기준으로 nkm 내에 있는 스타벅스 갯수 구하는 코드(가중치 역할)
num_cafe=[]
for i in range(5):
    num=0
    lat = center[i][0]
    long = center[i][1]
    distance=getDistanceBetweenPointsNew( lat, long, 
    df_loc[df_loc['cluster']==i]['위도'],
    df_loc[df_loc['cluster']==i]['경도'], unit= 'km')
    for data in distance:
        if data<=1.5:  # 여기에 반경 몇km 이내의 스타벅스 갯수를 확인할건지 입력
            num+=1
    num_cafe.append(num)
    
# 가장 가까운 군집까지의 거리 가공치 산출
center_df=pd.DataFrame(center, columns=['latitude','longitude'])    
center_df
center_weight=[]
for i in tqdm(data_df.index) :
    lat = data_df.loc[i, ['latitude','longitude']].astype('float64')['latitude'] 
    long = data_df.loc[i, ['latitude','longitude']].astype('float64')['longitude']
    distance=getDistanceBetweenPointsNew( lat, long, 
                                         center_df[['latitude','longitude']].astype('float64')['latitude'], 
                                         center_df[['latitude','longitude']].astype('float64')['longitude'], unit= 'km')
    center_weight.append( num_cafe[list(distance).index(min(list(distance)))] / min(list(distance)) )

pd.DataFrame(center_weight, columns=['center_weight']).to_csv('./data/전처리 완료 데이터/center_info.csv', index=False) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_loc['cluster']=cluster_labels
100%|█████████████████████████████████████████████████████████████████████████| 521282/521282 [33:24<00:00, 260.11it/s]
