In [4]:
import pandas as pd
import numpy as np

import json
import folium

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.impute import SimpleImputer
from matplotlib import rcParams
import plotly.express as px
%matplotlib inline

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import warnings

warnings.filterwarnings(action = 'ignore')

In [5]:
tmp_df = pd.DataFrame()

In [6]:
def merge_area_data():
    # 행정동별 데이터 로드
    df = pd.read_csv("행정동_컬럼추가_최종ver.csv", index_col=0)
    df.rename(columns={"인구수": "MZ_POP_CNT"}, inplace=True)

    # 인구 밀도 데이터 로드
    density_df = pd.read_excel('인구밀도.xlsx')
    density_df['GU_DONG'] = density_df['GU'] + density_df['DONG']
    df['GU_DONG'] = df['GU'] + df['DONG']
    density_df.drop(['GU', 'DONG', 'POP', 'DENSITY'], axis=1, inplace=True)

    # 행정동 데이터, 밀도 데이터 병합
    tmp = pd.merge(df, density_df, on='GU_DONG')
    tmp.drop(['GU_DONG'], axis=1, inplace=True)

    # 컬럼 순서 정렬
    tmp = tmp[['GU', 'DONG', 'DONG_CODE', 'AREA', 'ACADEMY_NUM', 'KINDER_NUM', 'FIRE_NUM',
       'ELE_SCH_NUM', 'MID_SCH_NUM', 'HIGH_SCH_NUM', 'CCTV_NUM', 'POLICE_NUM',
       'BIKE_NUM', 'CAR_SHR_NUM', 'SUBWAY_NUM', 'SAFE_DLVR_NUM', 'DPTM_NUM',
       'ANI_HSPT_NUM', 'PHARM_NUM', 'LEISURE_NUM', 'KIDS_NUM', 'SPORT_NUM',
       'GYM_NUM', 'GOLF_NUM', 'STARBUCKS_NUM', 'MC_NUM', 'CON_NUM',
       'NOISE_VIBRATION_NUM', 'CHILD_MED_NUM', 'CAFE_NUM', 'PARK_NUM',
       'HOSPITAL_NUM', 'BUS_CNT', 'RETAIL_NUM', 'COLIVING_NUM', 'MZ_POP_CNT','VEGAN_CNT']]
    # 불필요 컬럼 제거
    tmp = tmp.drop(['SPORT_NUM'], axis=1)
    return tmp

def assembling_features(df):
    global tmp_df
    # 피쳐합
    tmp_df = df.copy()
    # 교통
    tmp_df['교통'] = tmp_df['SUBWAY_NUM'] + 0.93 * tmp_df['BUS_CNT'] + 0.06 * tmp_df['BIKE_NUM']
    tmp_df = tmp_df.drop(['SUBWAY_NUM', 'BUS_CNT', 'BIKE_NUM'], axis=1)

    # 교육
    tmp_df['교육'] = (0.07) * tmp_df['MID_SCH_NUM'] + (0.03) * tmp_df['HIGH_SCH_NUM'] + tmp_df['ACADEMY_NUM'] * (0.7) + (
        0.9) * tmp_df['ELE_SCH_NUM']
    tmp_df = tmp_df.drop(['MID_SCH_NUM', 'HIGH_SCH_NUM', 'ACADEMY_NUM', 'ELE_SCH_NUM'], axis=1)

    # 육아
    tmp_df['육아'] = tmp_df['CHILD_MED_NUM'] + tmp_df['KINDER_NUM']
    tmp_df = tmp_df.drop(['CHILD_MED_NUM', 'KINDER_NUM'], axis=1)

    # 치안
    tmp_df['치안'] = tmp_df['POLICE_NUM'] + tmp_df['CCTV_NUM'] + tmp_df['FIRE_NUM']
    tmp_df = tmp_df.drop(['POLICE_NUM', 'CCTV_NUM', 'FIRE_NUM'], axis=1)

    # 건강
    tmp_df['건강'] = (0.94) * tmp_df['HOSPITAL_NUM'] + tmp_df['PHARM_NUM']
    tmp_df = tmp_df.drop(['HOSPITAL_NUM', 'PHARM_NUM'], axis=1)

    # 편의시설
    tmp_df['편의시설'] = 0.04 * tmp_df['DPTM_NUM'] + 0.44 * tmp_df['CON_NUM'] + 0.25 * tmp_df['CAFE_NUM'] + 0.27 * tmp_df[
        'RETAIL_NUM']
    tmp_df = tmp_df.drop(['DPTM_NUM', 'CON_NUM', 'CAFE_NUM', 'RETAIL_NUM'], axis=1)

    tmp_df.set_index('DONG_CODE', inplace=True)

    return tmp_df

def robust_scaling(df):
    robust_scaler = RobustScaler()

    robust_scaler.fit(df)

    robust_data = robust_scaler.transform(df)
    ro_df = pd.DataFrame(robust_data)
    ro_df.index = df.index
    ro_df.columns = df.columns
    return ro_df

def preprocessing_df():
    area_df = merge_area_data()
    assem_df = assembling_features(area_df)

    tmp_data = assem_df.iloc[:, 3:]
    df = tmp_data.div(assem_df['AREA'], axis=0)

    max_lim_log_list = ["교통","치안","교육","COLIVING_NUM","STARBUCKS_NUM","MC_NUM","NOISE_VIBRATION_NUM","VEGAN_CNT"]

    for f in max_lim_log_list:
        quan = df[f].quantile(0.95)
        df[f] = np.where(df[f] > quan, quan, df[f])
        df[f] = np.log1p(df[f])

    max_lim_list = ["LEISURE_NUM","GOLF_NUM","건강","편의시설"]
    for f in max_lim_list:
        quan = df[f].quantile(0.95)
        df[f] = np.where(df[f] > quan, quan, df[f])
    
    ro_df = robust_scaling(df)
    
    return ro_df


In [7]:
df = preprocessing_df()
df

Unnamed: 0_level_0,CAR_SHR_NUM,SAFE_DLVR_NUM,ANI_HSPT_NUM,LEISURE_NUM,KIDS_NUM,GYM_NUM,GOLF_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,PARK_NUM,COLIVING_NUM,MZ_POP_CNT,VEGAN_CNT,교통,교육,육아,치안,건강,편의시설
DONG_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1111051500,-0.878912,-0.178460,-0.583147,-0.203745,-0.439314,-0.906523,-0.792079,-0.565210,0.000000,1.296060,1.266253,0.000000,-1.093461,-0.310295,-1.251373,-1.689682,-0.803218,-0.558933,-0.989314,-0.944897
1111053000,0.136329,-0.552582,-0.565065,0.049270,-0.439314,1.341133,-0.001489,1.642479,0.000000,1.504113,4.871286,0.594987,-0.907741,0.788295,0.060759,-0.968695,-0.415289,-0.168852,0.603788,0.548133
1111054000,-0.801018,-0.552582,-0.987506,-0.435990,-0.439314,-0.597282,-0.792079,0.020247,0.000000,1.458929,1.319948,0.000000,-1.242268,0.317623,-0.736808,-2.957494,-0.674763,-0.908603,-0.994077,-0.641856
1111055000,-0.378039,-0.552582,-0.300808,-0.435990,-0.439314,-0.771202,-0.506491,-0.747557,0.000000,0.656637,0.772822,0.000000,-1.114231,-1.150577,-1.673661,-1.263754,-0.821676,-0.734128,-1.083540,-0.925136
1111056000,-0.986377,-0.552582,-0.987506,-0.435990,-0.439314,-0.819945,-0.609361,-0.587830,0.000000,-0.461063,-0.272328,0.000000,-1.231116,-1.340511,-1.639508,-2.462022,-0.852172,-2.309248,-1.084751,-1.150089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174065000,1.761738,0.404127,1.080560,1.048760,0.895012,1.385863,1.465626,0.890140,0.000000,0.521795,-0.272328,0.000000,0.957008,0.183004,0.308243,0.765966,-0.009298,0.526017,1.361103,1.593932
1174066000,1.606915,0.350228,0.964050,0.965113,0.819839,1.256714,1.338431,0.832997,0.000000,0.485124,-0.272328,0.000000,0.560270,0.130425,-0.046442,0.705841,-0.058585,0.474366,1.220490,1.434644
1174068500,0.286368,-0.109498,0.928076,0.389176,0.178657,0.367494,0.552288,0.229103,0.000000,0.473358,0.820986,0.000000,-0.071260,-1.134137,-0.052715,0.074774,-0.492045,0.202254,0.529415,0.269312
1174069000,-0.385974,-0.552582,-0.140329,-0.435990,-0.439314,-0.238741,0.264906,-0.747557,1.578649,0.577682,2.306467,0.000000,-1.321984,-1.069485,-1.348040,-0.042176,-0.884147,-0.166469,-0.507202,-0.415119


In [8]:
def first_clustering(df):
    global tmp_df
    basic_pca = PCA(n_components=2, random_state=0)
    basic_pca_transformed = basic_pca.fit_transform(df)

    # density_data = minmax_norm(density_data)
    first_kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=300, random_state=0)
    first_kmeans.fit(basic_pca_transformed)

    basic_df = tmp_df.copy()
    basic_df['km_cluster'] = first_kmeans.labels_

    basic_df['pca_x'] = basic_pca_transformed[:, 0]
    basic_df['pca_y'] = basic_pca_transformed[:, 1]

    return basic_df, first_kmeans, basic_pca

In [17]:
basic_df, first_kmeans, first_pca = first_clustering(df)
basic_df[basic_df['DONG'] =='방화1동']

Unnamed: 0_level_0,GU,DONG,AREA,CAR_SHR_NUM,SAFE_DLVR_NUM,ANI_HSPT_NUM,LEISURE_NUM,KIDS_NUM,GYM_NUM,GOLF_NUM,...,VEGAN_CNT,교통,교육,육아,치안,건강,편의시설,km_cluster,pca_x,pca_y
DONG_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1150063000,강서구,방화1동,1.48,8.75,0.75,3.5,3.5,2.25,20.75,5.5,...,9.5,26.76,69.415,108.75,63.25,85.565,60.12,1,2.246811,-0.129886


In [295]:
df

Unnamed: 0_level_0,CAR_SHR_NUM,SAFE_DLVR_NUM,ANI_HSPT_NUM,LEISURE_NUM,KIDS_NUM,GYM_NUM,GOLF_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,PARK_NUM,COLIVING_NUM,MZ_POP_CNT,VEGAN_CNT,교통,교육,육아,치안,건강,편의시설
DONG_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1111051500,-0.878912,-0.178460,-0.583147,-0.203745,-0.439314,-0.906523,-0.792079,-0.565210,0.000000,1.296060,1.266253,0.000000,-1.093461,-0.310295,-1.251373,-1.689682,-0.803218,-0.558933,-0.989314,-0.944897
1111053000,0.136329,-0.552582,-0.565065,0.049270,-0.439314,1.341133,-0.001489,1.642479,0.000000,1.504113,4.871286,0.594987,-0.907741,0.788295,0.060759,-0.968695,-0.415289,-0.168852,0.603788,0.548133
1111054000,-0.801018,-0.552582,-0.987506,-0.435990,-0.439314,-0.597282,-0.792079,0.020247,0.000000,1.458929,1.319948,0.000000,-1.242268,0.317623,-0.736808,-2.957494,-0.674763,-0.908603,-0.994077,-0.641856
1111055000,-0.378039,-0.552582,-0.300808,-0.435990,-0.439314,-0.771202,-0.506491,-0.747557,0.000000,0.656637,0.772822,0.000000,-1.114231,-1.150577,-1.673661,-1.263754,-0.821676,-0.734128,-1.083540,-0.925136
1111056000,-0.986377,-0.552582,-0.987506,-0.435990,-0.439314,-0.819945,-0.609361,-0.587830,0.000000,-0.461063,-0.272328,0.000000,-1.231116,-1.340511,-1.639508,-2.462022,-0.852172,-2.309248,-1.084751,-1.150089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174065000,1.761738,0.404127,1.080560,1.048760,0.895012,1.385863,1.465626,0.890140,0.000000,0.521795,-0.272328,0.000000,0.957008,0.183004,0.308243,0.765966,-0.009298,0.526017,1.361103,1.593932
1174066000,1.606915,0.350228,0.964050,0.965113,0.819839,1.256714,1.338431,0.832997,0.000000,0.485124,-0.272328,0.000000,0.560270,0.130425,-0.046442,0.705841,-0.058585,0.474366,1.220490,1.434644
1174068500,0.286368,-0.109498,0.928076,0.389176,0.178657,0.367494,0.552288,0.229103,0.000000,0.473358,0.820986,0.000000,-0.071260,-1.134137,-0.052715,0.074774,-0.492045,0.202254,0.529415,0.269312
1174069000,-0.385974,-0.552582,-0.140329,-0.435990,-0.439314,-0.238741,0.264906,-0.747557,1.578649,0.577682,2.306467,0.000000,-1.321984,-1.069485,-1.348040,-0.042176,-0.884147,-0.166469,-0.507202,-0.415119


In [296]:
def second_clustering(basic_df, df, user_first):
    cluster_num = [3,3,2,0]
    second_cluster = basic_df[basic_df['km_cluster'] == user_first]
    cluster_data = df.loc[second_cluster.index.values]
    second_pca = PCA(n_components=2)
    second_pca_transformed = second_pca.fit_transform(cluster_data)
    second_kmeans = KMeans(n_clusters=cluster_num[user_first], init='k-means++', max_iter=400, random_state=0)
    second_kmeans.fit(second_pca_transformed)

    cluster_tmp = second_cluster.copy()
    cluster_tmp['km_cluster'] = second_kmeans.labels_
    return second_kmeans, second_pca, cluster_tmp



In [297]:
cluster_temp

Unnamed: 0_level_0,GU,DONG,AREA,CAR_SHR_NUM,SAFE_DLVR_NUM,ANI_HSPT_NUM,LEISURE_NUM,KIDS_NUM,GYM_NUM,GOLF_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,PARK_NUM,COLIVING_NUM,MZ_POP_CNT,VEGAN_CNT,교통,교육,육아,치안,건강,편의시설,km_cluster,pca_x,pca_y
DONG_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1111051500,종로구,청운효자동,2.57,1.000000,1.000000,2.000000,1.000000,0.000000,0.000000,0.000000,0.333333,0.0,18816.000000,1.666667,0.0,3210,5.000000,13.693333,14.630000,7.333333,59.666667,15.300000,11.676667,1,-3.367641,1.354040
1111054000,종로구,삼청동,1.49,1.000000,0.000000,0.000000,0.000000,0.000000,3.000000,0.000000,1.000000,0.0,14112.000000,1.000000,0.0,684,6.000000,11.650000,1.500000,11.000000,23.000000,8.580000,13.880000,1,-3.411508,1.714575
1111055000,종로구,부암동,2.27,5.000000,0.000000,3.000000,0.000000,0.000000,2.000000,2.000000,0.000000,0.0,6048.000000,1.000000,0.0,2585,1.000000,8.670000,20.500000,5.000000,43.000000,4.760000,11.020000,1,-3.459018,0.320674
1111056000,종로구,평창동,8.87,0.000000,0.000000,0.000000,0.000000,0.000000,5.000000,5.000000,1.000000,0.0,4032.000000,0.000000,0.0,4597,2.000000,34.830000,19.630000,10.000000,22.000000,18.160000,11.640000,1,-4.798851,-0.890202
1111057000,종로구,무악동,0.36,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,2016.000000,0.000000,0.0,1822,1.000000,4.830000,15.800000,13.000000,13.000000,9.700000,3.380000,0,-1.699955,0.189483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174056000,강동구,고덕2동,2.01,1.000000,0.000000,2.000000,0.000000,0.500000,3.000000,0.500000,0.500000,0.0,2163.500000,0.500000,0.0,6015,1.500000,11.180000,42.250000,43.500000,46.000000,28.590000,12.180000,0,-2.885983,-0.151919
1174058000,강동구,암사2동,1.18,1.333333,0.666667,1.666667,0.333333,1.333333,5.333333,2.666667,0.333333,0.0,1442.333333,0.000000,0.0,4706,0.666667,10.783333,47.823333,13.666667,45.000000,35.533333,16.053333,0,-1.481691,-0.733892
1174059000,강동구,암사3동,2.51,1.333333,0.666667,1.666667,0.333333,1.333333,5.333333,2.666667,0.333333,0.0,1442.333333,0.000000,0.0,4265,0.666667,13.573333,47.823333,13.666667,45.000000,35.533333,16.053333,1,-3.110396,-0.882713
1174069000,강동구,둔촌1동,0.92,2.000000,0.000000,1.500000,0.000000,0.000000,4.000000,3.000000,0.000000,0.5,2163.500000,1.000000,0.0,33,0.500000,4.550000,28.535000,0.000000,33.500000,23.630000,11.855000,0,-1.985705,1.027566


In [298]:
user = [5,1,2,3,4,3,0,0,0,0,0,1,0,0,0,0,0,1,1,1]

In [299]:
df

Unnamed: 0_level_0,CAR_SHR_NUM,SAFE_DLVR_NUM,ANI_HSPT_NUM,LEISURE_NUM,KIDS_NUM,GYM_NUM,GOLF_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,PARK_NUM,COLIVING_NUM,MZ_POP_CNT,VEGAN_CNT,교통,교육,육아,치안,건강,편의시설
DONG_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1111051500,-0.878912,-0.178460,-0.583147,-0.203745,-0.439314,-0.906523,-0.792079,-0.565210,0.000000,1.296060,1.266253,0.000000,-1.093461,-0.310295,-1.251373,-1.689682,-0.803218,-0.558933,-0.989314,-0.944897
1111053000,0.136329,-0.552582,-0.565065,0.049270,-0.439314,1.341133,-0.001489,1.642479,0.000000,1.504113,4.871286,0.594987,-0.907741,0.788295,0.060759,-0.968695,-0.415289,-0.168852,0.603788,0.548133
1111054000,-0.801018,-0.552582,-0.987506,-0.435990,-0.439314,-0.597282,-0.792079,0.020247,0.000000,1.458929,1.319948,0.000000,-1.242268,0.317623,-0.736808,-2.957494,-0.674763,-0.908603,-0.994077,-0.641856
1111055000,-0.378039,-0.552582,-0.300808,-0.435990,-0.439314,-0.771202,-0.506491,-0.747557,0.000000,0.656637,0.772822,0.000000,-1.114231,-1.150577,-1.673661,-1.263754,-0.821676,-0.734128,-1.083540,-0.925136
1111056000,-0.986377,-0.552582,-0.987506,-0.435990,-0.439314,-0.819945,-0.609361,-0.587830,0.000000,-0.461063,-0.272328,0.000000,-1.231116,-1.340511,-1.639508,-2.462022,-0.852172,-2.309248,-1.084751,-1.150089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174065000,1.761738,0.404127,1.080560,1.048760,0.895012,1.385863,1.465626,0.890140,0.000000,0.521795,-0.272328,0.000000,0.957008,0.183004,0.308243,0.765966,-0.009298,0.526017,1.361103,1.593932
1174066000,1.606915,0.350228,0.964050,0.965113,0.819839,1.256714,1.338431,0.832997,0.000000,0.485124,-0.272328,0.000000,0.560270,0.130425,-0.046442,0.705841,-0.058585,0.474366,1.220490,1.434644
1174068500,0.286368,-0.109498,0.928076,0.389176,0.178657,0.367494,0.552288,0.229103,0.000000,0.473358,0.820986,0.000000,-0.071260,-1.134137,-0.052715,0.074774,-0.492045,0.202254,0.529415,0.269312
1174069000,-0.385974,-0.552582,-0.140329,-0.435990,-0.439314,-0.238741,0.264906,-0.747557,1.578649,0.577682,2.306467,0.000000,-1.321984,-1.069485,-1.348040,-0.042176,-0.884147,-0.166469,-0.507202,-0.415119


In [300]:
df = df[['교통', '치안', '건강', '편의시설', '교육',
       '육아', 'MZ_POP_CNT', 'COLIVING_NUM', 'VEGAN_CNT', 'KIDS_NUM',
       'PARK_NUM', 'STARBUCKS_NUM', 'MC_NUM', 'NOISE_VIBRATION_NUM',
       'SAFE_DLVR_NUM', 'LEISURE_NUM', 'GYM_NUM', 'GOLF_NUM', 'CAR_SHR_NUM',
       'ANI_HSPT_NUM']]

In [301]:

first_category = []
for column in df.columns[:6]:
    category = []
    for i in range(0,81,20):
        x = (df[column].quantile(i/100) + df[column].quantile((i+20)/100)) / 2
        category.append(x)
    first_category.append(category)
first_category

[[-1.718632469492249,
  -0.4328277510802305,
  0.01293159388975669,
  0.4168685247772416,
  0.9022638805351546],
 [-1.7229387136628331,
  -0.42488272102363883,
  -0.006312193445273076,
  0.35008449943306413,
  0.7983171066067042],
 [-0.8417683470291442,
  -0.37704924253823296,
  -0.014259655183953224,
  0.3930973062071218,
  1.2059042900124968],
 [-0.8801177057065239,
  -0.38568275201816216,
  -0.0140423642940012,
  0.4061777156208397,
  1.2848641366880977],
 [-1.8329860934110165,
  -0.43954650945152773,
  0.01693954870810549,
  0.4080998068896891,
  0.9182636735162564],
 [-0.7127479284094058,
  -0.35127073041327217,
  0.004573438389642914,
  0.4485786612821247,
  2.2995670540552]]

In [302]:
second_category = []
for column in df.columns[6:]:
    cate = [df[column].min(),df[column].mean()]
    second_category.append(cate)
second_category

[[-1.3219843906227973, 0.010441220695307075],
 [0.0, 0.07838032367571732],
 [-1.5793775824269287, -0.035376257037694134],
 [-0.4393139595941593, 0.22908494992992318],
 [-0.27232849651909924, 0.3626095530204497],
 [-0.7475570029864934, 0.12545860655212407],
 [0.0, 0.5120167479282537],
 [-2.3401102500224344, 0.0329610234844542],
 [-0.5525821848018447, 0.1285896258951119],
 [-0.4359896885978832, 0.22537533325459047],
 [-0.9065231157491423, 0.1896438851276178],
 [-0.7920786898582686, 0.21993304973497718],
 [-0.98637720643027, 0.13279862285568245],
 [-0.987505816182802, 0.13119128469046512]]

In [303]:
user

[5, 1, 2, 3, 4, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1]

In [304]:
user_data = [0]*len(user)
select = [0]*len(user) # 유저의 카테고리 선택여부 저장

for i in range(len(user[:6])): # 첫번째 카테고리에 구간별 중앙값 부여
    if(user[i] != 0):
        user_data[i] = first_category[i][user[i]-1]
        select[i] = 1
for j in range(len(user[6:])): # 두번째 카테고리에 평균을 중앙값으로 부여
    if(user[j+6] != 0):
        user_data[j+6] = second_category[j][1]
        select[j+6] = 1
    else:
        user_data[j+6] = second_category[j][0]
user_df = pd.DataFrame(user_data,index=df.columns,columns=['user']).T
user_df # 유저 데이터에 대한 기준값 부여 데이터프레임

Unnamed: 0,교통,치안,건강,편의시설,교육,육아,MZ_POP_CNT,COLIVING_NUM,VEGAN_CNT,KIDS_NUM,PARK_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,SAFE_DLVR_NUM,LEISURE_NUM,GYM_NUM,GOLF_NUM,CAR_SHR_NUM,ANI_HSPT_NUM
user,0.902264,-1.722939,-0.377049,-0.014042,0.4081,0.004573,-1.321984,0.0,-1.579378,-0.439314,-0.272328,0.125459,0.0,-2.34011,-0.552582,-0.43599,-0.906523,0.219933,0.132799,0.131191


In [305]:
weight_df = pd.read_excel('1107_가중치.xlsx')
weight_df.rename(columns = {'Unnamed: 0':'분류'},inplace=True)
weight_df.fillna(0,inplace=True)
weight_df.set_index('분류',inplace=True)
weight_df

Unnamed: 0_level_0,교통,방범/치안,건강,편의시설,교육,육아,거주 MZ세대 수,코리빙,비건,키즈카페,공원,스타벅스(스세권),맥도날드,소음,안심택배,"실내레저시설(볼링장,수영장)",헬스장,골프연습장,나눔카거점리스트,반려동물
분류,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
교통,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
방범/치안,0.0,1.0,0.0,0.0,0.0,0.04,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004
건강,0.0,0.02,1.0,0.0,0.02,0.07,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.004,0.0,0.0
편의시설,0.07,0.0,0.0,1.0,0.01,0.18,0,0.0,0.0,0.0,0.04,0.03,0.02,0.0,0.0,0.0,0.0,0.08,0.0,0.0
교육,0.0,0.01,0.02,0.0,1.0,0.035,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009,0.0,0.01,0.0,0.0
육아,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,0.0,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
거주 MZ세대 수,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
코리빙,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
비건,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
키즈카페,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [306]:
df

Unnamed: 0_level_0,교통,치안,건강,편의시설,교육,육아,MZ_POP_CNT,COLIVING_NUM,VEGAN_CNT,KIDS_NUM,PARK_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,SAFE_DLVR_NUM,LEISURE_NUM,GYM_NUM,GOLF_NUM,CAR_SHR_NUM,ANI_HSPT_NUM
DONG_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1111051500,-1.251373,-0.558933,-0.989314,-0.944897,-1.689682,-0.803218,-1.093461,0.000000,-0.310295,-0.439314,1.266253,-0.565210,0.000000,1.296060,-0.178460,-0.203745,-0.906523,-0.792079,-0.878912,-0.583147
1111053000,0.060759,-0.168852,0.603788,0.548133,-0.968695,-0.415289,-0.907741,0.594987,0.788295,-0.439314,4.871286,1.642479,0.000000,1.504113,-0.552582,0.049270,1.341133,-0.001489,0.136329,-0.565065
1111054000,-0.736808,-0.908603,-0.994077,-0.641856,-2.957494,-0.674763,-1.242268,0.000000,0.317623,-0.439314,1.319948,0.020247,0.000000,1.458929,-0.552582,-0.435990,-0.597282,-0.792079,-0.801018,-0.987506
1111055000,-1.673661,-0.734128,-1.083540,-0.925136,-1.263754,-0.821676,-1.114231,0.000000,-1.150577,-0.439314,0.772822,-0.747557,0.000000,0.656637,-0.552582,-0.435990,-0.771202,-0.506491,-0.378039,-0.300808
1111056000,-1.639508,-2.309248,-1.084751,-1.150089,-2.462022,-0.852172,-1.231116,0.000000,-1.340511,-0.439314,-0.272328,-0.587830,0.000000,-0.461063,-0.552582,-0.435990,-0.819945,-0.609361,-0.986377,-0.987506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174065000,0.308243,0.526017,1.361103,1.593932,0.765966,-0.009298,0.957008,0.000000,0.183004,0.895012,-0.272328,0.890140,0.000000,0.521795,0.404127,1.048760,1.385863,1.465626,1.761738,1.080560
1174066000,-0.046442,0.474366,1.220490,1.434644,0.705841,-0.058585,0.560270,0.000000,0.130425,0.819839,-0.272328,0.832997,0.000000,0.485124,0.350228,0.965113,1.256714,1.338431,1.606915,0.964050
1174068500,-0.052715,0.202254,0.529415,0.269312,0.074774,-0.492045,-0.071260,0.000000,-1.134137,0.178657,0.820986,0.229103,0.000000,0.473358,-0.109498,0.389176,0.367494,0.552288,0.286368,0.928076
1174069000,-1.348040,-0.166469,-0.507202,-0.415119,-0.042176,-0.884147,-1.321984,0.000000,-1.069485,-0.439314,2.306467,-0.747557,1.578649,0.577682,-0.552582,-0.435990,-0.238741,0.264906,-0.385974,-0.140329


In [307]:
def weighting(user_df, weight_df, select, user_name):
    values = user_df.loc[user_name].values
    weight = weight_df[weight_df.columns].values
    w = [1] * len(weight)
    for i in range(len(weight)):
        if(select[i] == 1):
            for k in range(len(weight[i])):
                w[i] += weight[i][k]

    weighted_user_data = []
    for i in range(len(values)):
        weighted_data = values[i] * w[i]
        weighted_user_data.append(weighted_data)
    weighted_user_df = pd.DataFrame(weighted_user_data,index=density_data.columns,columns=['user']).T
    return weighted_user_df

In [308]:
weighted_user_df = weighting(user_df, weight_df, select, 'user')
weighted_user_df

Unnamed: 0,CAR_SHR_NUM,SAFE_DLVR_NUM,ANI_HSPT_NUM,LEISURE_NUM,KIDS_NUM,GYM_NUM,GOLF_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,PARK_NUM,COLIVING_NUM,MZ_POP_CNT,VEGAN_CNT,교통,교육,육아,치안,건강,편의시설
user,1.804528,-3.521687,-0.800853,-0.034123,0.85048,0.012806,-1.321984,0.0,-1.579378,-0.439314,-0.272328,0.282282,0.0,-2.34011,-0.552582,-0.43599,-0.906523,0.567427,0.265597,0.269729


In [309]:
user_scaled = [weighted_user_df.loc['user'].values]

In [310]:
# 유저 스케일 데이터 입력 시 해당 클러스터 출력 함수
def user_clustering(basic_df, df, user_scaled,first_pca,first_kmeans):
    user_pca = first_pca.transform(user_scaled)
    user_first = first_kmeans.predict(user_pca)[0]
    
    second_kmeans, second_pca,second_cluster = second_clustering(basic_df,df,user_first)
    user_pca_2 = second_pca.transform(user_scaled)
    user_second = second_kmeans.predict(user_pca_2)[0]
    result_cluster = second_cluster[second_cluster['km_cluster']==user_second]
    return user_second, result_cluster

In [311]:
user_group, user_include_df = user_clustering(basic_df, df , user_scaled, first_pca, first_kmeans)
user_include_df

Unnamed: 0_level_0,GU,DONG,AREA,CAR_SHR_NUM,SAFE_DLVR_NUM,ANI_HSPT_NUM,LEISURE_NUM,KIDS_NUM,GYM_NUM,GOLF_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,PARK_NUM,COLIVING_NUM,MZ_POP_CNT,VEGAN_CNT,교통,교육,육아,치안,건강,편의시설,km_cluster,pca_x,pca_y
DONG_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1111057000,종로구,무악동,0.36,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,2016.000000,0.0,0.0,1822,1.000000,4.830000,15.800000,13.000000,13.0,9.700000,3.380000,0,-1.699955,0.189483
1111060000,종로구,가회동,0.54,0.000000,0.000000,0.000000,1.000000,0.000000,1.000000,0.000000,2.000000,0.0,8064.000000,0.0,0.0,1038,6.000000,8.610000,3.330000,3.000000,33.0,8.520000,16.770000,0,-1.463824,1.008411
1117051000,용산구,후암동,0.86,3.000000,0.000000,2.000000,0.000000,0.000000,1.000000,1.000000,0.000000,0.0,1625.000000,0.0,1.0,5338,2.000000,10.350000,23.200000,15.000000,26.0,23.920000,10.540000,0,-1.681596,-0.075667
1117057000,용산구,원효로2동,0.71,2.000000,0.000000,2.000000,0.000000,0.000000,1.000000,3.000000,0.000000,0.0,8125.000000,0.0,0.0,4038,0.000000,9.720000,20.100000,11.000000,24.0,8.700000,11.040000,0,-1.610925,-0.056581
1117062500,용산구,한강로동,2.90,13.000000,0.000000,2.000000,4.000000,4.000000,8.000000,4.000000,6.000000,0.0,8125.000000,0.0,0.0,8937,6.000000,29.700000,11.330000,53.000000,41.0,86.980000,54.320000,0,-1.555222,-0.164857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174055000,강동구,고덕1동,1.73,1.000000,0.000000,2.000000,0.000000,0.500000,3.000000,0.500000,0.500000,0.0,2163.500000,0.5,0.0,5111,1.500000,13.970000,42.250000,43.500000,46.0,28.590000,12.180000,0,-2.562638,0.087224
1174056000,강동구,고덕2동,2.01,1.000000,0.000000,2.000000,0.000000,0.500000,3.000000,0.500000,0.500000,0.0,2163.500000,0.5,0.0,6015,1.500000,11.180000,42.250000,43.500000,46.0,28.590000,12.180000,0,-2.885983,-0.151919
1174058000,강동구,암사2동,1.18,1.333333,0.666667,1.666667,0.333333,1.333333,5.333333,2.666667,0.333333,0.0,1442.333333,0.0,0.0,4706,0.666667,10.783333,47.823333,13.666667,45.0,35.533333,16.053333,0,-1.481691,-0.733892
1174069000,강동구,둔촌1동,0.92,2.000000,0.000000,1.500000,0.000000,0.000000,4.000000,3.000000,0.000000,0.5,2163.500000,1.0,0.0,33,0.500000,4.550000,28.535000,0.000000,33.5,23.630000,11.855000,0,-1.985705,1.027566


In [312]:
def similarity(user_df, df, user_name, num): # 유저 데이터, 유사도 측정을 위한 데이터, 유저 이름, 원하는 순위
    con_data = pd.concat([user_df.loc[[user_name]],df])
    rc_sim = cosine_similarity(con_data,con_data)
    sim_matrix = pd.DataFrame(rc_sim,columns=con_data.index).loc[[0]].T
    rank = sim_matrix[0].sort_values(ascending=False) # 유사도 순서로 정렬
    ranking = rank[1:num+1].index.tolist() # 1~n 위 리스트
    return ranking

In [313]:
similarity(user_df, df.loc[user_include_df.index.values], "user",3)

[1147064000, 1147061100, 1165062000]

In [314]:
user_include_df.loc[similarity(user_df, df.loc[user_include_df.index.values], "user",3)]

Unnamed: 0_level_0,GU,DONG,AREA,CAR_SHR_NUM,SAFE_DLVR_NUM,ANI_HSPT_NUM,LEISURE_NUM,KIDS_NUM,GYM_NUM,GOLF_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,PARK_NUM,COLIVING_NUM,MZ_POP_CNT,VEGAN_CNT,교통,교육,육아,치안,건강,편의시설,km_cluster,pca_x,pca_y
DONG_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
1147064000,양천구,신정3동,2.72,4.333333,0.333333,2.666667,0.833333,0.666667,8.166667,4.5,0.666667,0.166667,203.833333,0.166667,0.0,12375,4.666667,31.296667,105.103333,50.666667,26.5,48.51,17.635,0,-2.30946,-1.06176
1147061100,양천구,신월7동,1.19,2.285714,0.714286,1.714286,1.0,0.285714,1.857143,1.714286,0.285714,0.285714,174.714286,0.142857,0.0,5098,1.428571,14.198571,25.77,32.142857,26.857143,23.66,11.491429,0,-2.01373,-0.45974
1165062000,서초구,방배3동,2.4,2.833333,0.333333,2.833333,0.5,0.166667,9.333333,5.666667,1.5,0.166667,697.833333,0.0,0.0,4735,5.666667,24.813333,50.45,39.5,22.333333,43.26,19.01,0,-2.346043,-0.911946


In [315]:
user = [5,1,2,3,4,3,0,0,0,0,0,1,0,0,0,0,0,1,1,1]