In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import RobustScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

import warnings

# Prefix prepended to the data file names that merge_area_data reads
# (an empty string means the names are used as-is, relative to the working directory).
path_str = ""
# Module-level frame: assembling_features overwrites it and first_clustering copies from it.
tmp_df = pd.DataFrame()

def merge_area_data():
    """Load the per-dong table and the density sheet and join them.

    Reads ``행정동_컬럼추가_최종ver.csv`` and ``인구밀도.xlsx`` (both prefixed
    with the module-level ``path_str``), joins them on the concatenation of
    the ``GU`` and ``DONG`` columns, and returns the result with a fixed
    column order. ``SPORT_NUM`` is selected and then removed, so it must be
    present in the merged data but is absent from the returned frame.

    Returns:
        pandas.DataFrame: one row per matched GU+DONG key (inner join).
    """
    # Per-dong facility counts; the population column gets its English name.
    dong_frame = pd.read_csv(path_str + "행정동_컬럼추가_최종ver.csv", index_col=0)
    dong_frame = dong_frame.rename(columns={"인구수": "MZ_POP_CNT"})
    dong_frame['GU_DONG'] = dong_frame['GU'] + dong_frame['DONG']

    # Density sheet: only the join key and the columns not dropped here survive.
    density_frame = pd.read_excel(path_str + '인구밀도.xlsx')
    density_frame['GU_DONG'] = density_frame['GU'] + density_frame['DONG']
    density_frame = density_frame.drop(columns=['GU', 'DONG', 'POP', 'DENSITY'])

    # Join on the combined key, then discard the key itself.
    merged = dong_frame.merge(density_frame, on='GU_DONG')
    merged = merged.drop(columns=['GU_DONG'])

    # Fixed column order expected by the downstream steps.
    ordered_columns = [
        'GU', 'DONG', 'DONG_CODE', 'AREA',
        'ACADEMY_NUM', 'KINDER_NUM', 'FIRE_NUM',
        'ELE_SCH_NUM', 'MID_SCH_NUM', 'HIGH_SCH_NUM',
        'CCTV_NUM', 'POLICE_NUM',
        'BIKE_NUM', 'CAR_SHR_NUM', 'SUBWAY_NUM', 'SAFE_DLVR_NUM', 'DPTM_NUM',
        'ANI_HSPT_NUM', 'PHARM_NUM', 'LEISURE_NUM', 'KIDS_NUM', 'SPORT_NUM',
        'GYM_NUM', 'GOLF_NUM', 'STARBUCKS_NUM', 'MC_NUM', 'CON_NUM',
        'NOISE_VIBRATION_NUM', 'CHILD_MED_NUM', 'CAFE_NUM', 'PARK_NUM',
        'HOSPITAL_NUM', 'BUS_CNT', 'RETAIL_NUM', 'COLIVING_NUM',
        'MZ_POP_CNT', 'VEGAN_CNT',
    ]
    merged = merged[ordered_columns]

    # SPORT_NUM is not used downstream.
    return merged.drop(columns=['SPORT_NUM'])


def assembling_features(df):
    """Collapse groups of raw count columns into weighted composite features.

    Works on a copy of ``df``. For every composite the weighted sum of its
    source columns is appended as a new column and the source columns are
    removed. Finally ``DONG_CODE`` becomes the index.

    Side effect: the module-level ``tmp_df`` is rebound to the working frame
    (and ends up being the very object that is returned).

    Args:
        df: frame holding ``DONG_CODE`` and every source column listed below.

    Returns:
        pandas.DataFrame: the remaining original columns followed by
        교통, 교육, 육아, 치안, 건강, 편의시설 (in that order), indexed by DONG_CODE.
    """
    global tmp_df

    # (composite name, ((source column, weight), ...)); a weight of None means
    # the column is added unweighted. Terms are summed left to right.
    recipes = (
        ('교통', (('SUBWAY_NUM', None), ('BUS_CNT', 0.93), ('BIKE_NUM', 0.06))),
        ('교육', (('MID_SCH_NUM', 0.07), ('HIGH_SCH_NUM', 0.03),
                ('ACADEMY_NUM', 0.7), ('ELE_SCH_NUM', 0.9))),
        ('육아', (('CHILD_MED_NUM', None), ('KINDER_NUM', None))),
        ('치안', (('POLICE_NUM', None), ('CCTV_NUM', None), ('FIRE_NUM', None))),
        ('건강', (('HOSPITAL_NUM', 0.94), ('PHARM_NUM', None))),
        ('편의시설', (('DPTM_NUM', 0.04), ('CON_NUM', 0.44),
                  ('CAFE_NUM', 0.25), ('RETAIL_NUM', 0.27))),
    )

    tmp_df = df.copy()
    for composite, terms in recipes:
        total = None
        for column, weight in terms:
            term = tmp_df[column] if weight is None else weight * tmp_df[column]
            total = term if total is None else total + term
        tmp_df[composite] = total
        # The raw columns are replaced by their composite.
        tmp_df = tmp_df.drop(columns=[column for column, _ in terms])

    tmp_df.set_index('DONG_CODE', inplace=True)
    return tmp_df


def robust_scaling(df):
    """Scale every column of ``df`` with a default-configured RobustScaler.

    Args:
        df: numeric frame to scale.

    Returns:
        pandas.DataFrame: scaled values carrying the same index and column
        labels as ``df``.
    """
    scaled_values = RobustScaler().fit_transform(df)
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)


def preprocessing_df():
    """Build the scaled per-area feature matrix used for clustering.

    Steps: load and merge the raw data, assemble the composite features,
    divide every feature by the dong's ``AREA``, cap selected features at
    their 95th percentile (some of them are additionally log1p-transformed),
    robust-scale everything and return the columns in a fixed order.

    Returns:
        pandas.DataFrame: 20 scaled feature columns indexed by DONG_CODE.
    """
    assembled = assembling_features(merge_area_data())

    # The first three columns (GU, DONG, AREA) are skipped by position;
    # everything after them is turned into a per-area value.
    per_area = assembled.iloc[:, 3:].div(assembled['AREA'], axis=0)

    def cap_at_p95(values):
        # Values above the column's 95th percentile are replaced by it.
        ceiling = values.quantile(0.95)
        return values.mask(values > ceiling, ceiling)

    # Capped and then log1p-transformed.
    for name in ("교통", "치안", "교육", "COLIVING_NUM", "STARBUCKS_NUM",
                 "MC_NUM", "NOISE_VIBRATION_NUM", "VEGAN_CNT"):
        per_area[name] = np.log1p(cap_at_p95(per_area[name]))

    # Capped only.
    for name in ("LEISURE_NUM", "GOLF_NUM", "건강", "편의시설"):
        per_area[name] = cap_at_p95(per_area[name])

    final_order = [
        '교통', '치안', '건강', '편의시설', '교육',
        '육아', 'MZ_POP_CNT', 'COLIVING_NUM', 'VEGAN_CNT', 'KIDS_NUM',
        'PARK_NUM', 'STARBUCKS_NUM', 'MC_NUM', 'NOISE_VIBRATION_NUM',
        'SAFE_DLVR_NUM', 'LEISURE_NUM', 'GYM_NUM', 'GOLF_NUM', 'CAR_SHR_NUM',
        'ANI_HSPT_NUM',
    ]
    return robust_scaling(per_area)[final_order]

def first_clustering(df):
    """Fit the first-stage 2-component PCA and 4-cluster KMeans on ``df``.

    The cluster labels and the two PCA coordinates are attached to a copy of
    the module-level ``tmp_df``. They are assigned by position, so this
    relies on ``tmp_df`` having its rows in the same order as ``df``.

    Args:
        df: scaled feature matrix.

    Returns:
        tuple: (copy of ``tmp_df`` with ``km_cluster``, ``pca_x`` and
        ``pca_y`` columns, fitted KMeans, fitted PCA).
    """
    global tmp_df

    projector = PCA(n_components=2, random_state=0)
    coords = projector.fit_transform(df)

    clusterer = KMeans(n_clusters=4, init='k-means++', max_iter=300, random_state=0)
    clusterer.fit(coords)

    labelled = tmp_df.copy()
    labelled['km_cluster'] = clusterer.labels_
    for axis_no, axis_name in enumerate(('pca_x', 'pca_y')):
        labelled[axis_name] = coords[:, axis_no]

    return labelled, clusterer, projector

def second_clustering(basic_df, df,  user_first):
    """Sub-cluster the members of one first-stage cluster.

    Selects the rows of ``basic_df`` whose ``km_cluster`` equals
    ``user_first``, fits a fresh 2-component PCA on the matching rows of
    ``df`` and runs KMeans on the projected points.

    Args:
        basic_df: frame carrying the first-stage ``km_cluster`` labels.
        df: scaled feature matrix sharing its index with ``basic_df``.
        user_first: first-stage cluster label (used as an index into the
            per-cluster table of sub-cluster counts).

    Returns:
        tuple: (fitted KMeans, fitted PCA, copy of the selected ``basic_df``
        rows whose ``km_cluster`` column now holds the second-stage labels).
    """
    # Number of sub-clusters for each first-stage label.
    cluster_num = [3, 3, 2, 0]
    # KMeans requires n_clusters >= 1, so a configured 0 is treated as
    # "do not split": every member lands in one sub-cluster instead of the
    # call failing.
    n_sub_clusters = max(1, cluster_num[user_first])

    second_cluster = basic_df[basic_df['km_cluster'] == user_first]
    cluster_data = df.loc[second_cluster.index.values]

    # random_state is pinned like in the first stage; it only has an effect
    # when PCA picks a randomized solver.
    second_pca = PCA(n_components=2, random_state=0)
    second_pca_transformed = second_pca.fit_transform(cluster_data)

    second_kmeans = KMeans(n_clusters=n_sub_clusters, init='k-means++', max_iter=400, random_state=0)
    second_kmeans.fit(second_pca_transformed)

    cluster_tmp = second_cluster.copy()
    cluster_tmp['km_cluster'] = second_kmeans.labels_
    return second_kmeans, second_pca, cluster_tmp

def create_category(df):
    """Derive the reference values that user answers are mapped onto.

    For each of the first six columns, five values are produced: the
    midpoints between consecutive quantiles at 0, 20, 40, 60, 80 and 100 %.
    For every remaining column the 25 % and 75 % quantiles are produced.

    Args:
        df: feature matrix; the column order decides which rule applies.

    Returns:
        tuple: (list of five-element lists for columns 0-5,
        list of ``[q25, q75]`` pairs for columns 6 onwards).
    """
    band_edges = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0)

    first_category = []
    for name in df.columns[:6]:
        series = df[name]
        cuts = [series.quantile(q) for q in band_edges]
        # Midpoint of each 20 %-wide quantile band.
        first_category.append([(low + high) / 2 for low, high in zip(cuts, cuts[1:])])

    second_category = [
        [df[name].quantile(0.25), df[name].quantile(0.75)]
        for name in df.columns[6:]
    ]

    return first_category, second_category

def user_scaling(first_category, second_category, user , df):
    """Translate raw user answers into feature values.

    The first six answers are band numbers: 0 leaves the feature at 0 and is
    recorded as not selected, any other value ``k`` picks
    ``first_category[i][k - 1]``. Each remaining answer is a flag: non-zero
    picks the second element of the matching ``second_category`` pair and is
    recorded as selected, zero picks the first element.

    Args:
        first_category: per-feature value lists for the first six features.
        second_category: per-feature value pairs for the remaining features.
        user: sequence of answers, one per column of ``df``.
        df: frame whose columns label the result.

    Returns:
        tuple: (single-row frame labelled ``'user'`` with ``df``'s columns,
        list of 0/1 flags telling which features the user selected).
    """
    user_data = []
    select = []  # 1 where the user actively chose the feature

    for position, band in enumerate(user[:6]):
        if band != 0:
            user_data.append(first_category[position][band - 1])
            select.append(1)
        else:
            user_data.append(0)
            select.append(0)

    for position, flag in enumerate(user[6:]):
        picked = 1 if flag != 0 else 0
        user_data.append(second_category[position][picked])
        select.append(picked)

    user_df = pd.DataFrame(user_data, index=df.columns, columns=['user']).T
    return user_df, select


def weighting(user_df, df, select, user_name):
    """Multiply the user's feature values by weights read from the weight sheet.

    ``1107_가중치.xlsx`` is loaded, missing cells become 0 and its first
    column (renamed to ``분류``) becomes the index. Feature ``i`` gets the
    multiplier 1, plus the sum of row ``i`` of the sheet when
    ``select[i] == 1``.

    Args:
        user_df: frame holding the user's values in the row ``user_name``.
        df: frame whose columns label the result.
        select: 0/1 flags, one per sheet row, marking selected features.
        user_name: row label to read from ``user_df``.

    Returns:
        pandas.DataFrame: single-row frame labelled ``'user'`` containing the
        weighted values.
    """
    weight_table = (
        pd.read_excel('1107_가중치.xlsx')
        .rename(columns={'Unnamed: 0': '분류'})
        .fillna(0)
        .set_index('분류')
    )

    user_values = user_df.loc[user_name].values

    multipliers = []
    for row_no, weight_row in enumerate(weight_table.values):
        factor = 1
        if select[row_no] == 1:
            # Accumulate entry by entry, starting from the base multiplier 1.
            for entry in weight_row:
                factor += entry
        multipliers.append(factor)

    weighted_values = [
        value * multipliers[position]
        for position, value in enumerate(user_values)
    ]
    return pd.DataFrame(weighted_values, index=df.columns, columns=['user']).T


# Given the user's scaled feature vector, report the cluster the user falls into.
def user_clustering(basic_df, df, user_scaled, first_pca, first_kmeans):
    """Assign the user to a first-stage and then a second-stage cluster.

    Args:
        basic_df: frame carrying the first-stage ``km_cluster`` labels.
        df: scaled feature matrix.
        user_scaled: the user's feature vector wrapped as a 2-D array-like.
        first_pca: PCA fitted in the first stage.
        first_kmeans: KMeans fitted in the first stage.

    Returns:
        tuple: (second-stage cluster label of the user, rows of the
        second-stage frame that carry that same label).
    """
    first_label = first_kmeans.predict(first_pca.transform(user_scaled))[0]

    sub_kmeans, sub_pca, members = second_clustering(basic_df, df, first_label)

    user_second = sub_kmeans.predict(sub_pca.transform(user_scaled))[0]
    same_group = members[members['km_cluster'] == user_second]
    return user_second, same_group

def similarity(user_df, df, user_name, num):
    """Return the labels of the ``num`` rows of ``df`` most similar to the user.

    Similarity is the cosine similarity between the user's row and each row
    of ``df``. Only the user-versus-rows similarities are computed, and the
    ranking contains rows of ``df`` only, so the user itself can never show
    up in (or push a real row out of) the result, even when a row ties with
    the user's self-similarity.

    Args:
        user_df: frame holding the user's feature values.
        df: candidate rows to rank.
        user_name: label of the user's row in ``user_df``.
        num: how many top-ranked labels to return.

    Returns:
        list: up to ``num`` index labels of ``df``, most similar first.
    """
    # Align by column name so a different column order in user_df cannot
    # pair up the wrong features.
    user_row = user_df.loc[[user_name]].reindex(columns=df.columns)

    if len(df) == 0:
        # Nothing to rank against.
        return []

    scores = cosine_similarity(user_row, df)[0]
    ranked = pd.Series(scores, index=df.index).sort_values(ascending=False)
    return ranked.iloc[:num].index.tolist()

Feature column order (20 entries, the same order as the entries of the `user` list below):

'교통', '치안', '건강', '편의시설', '교육', '육아', 'MZ_POP_CNT', 'COLIVING_NUM',
       'VEGAN_CNT', 'KIDS_NUM', 'PARK_NUM', 'STARBUCKS_NUM', 'MC_NUM',
       'NOISE_VIBRATION_NUM', 'SAFE_DLVR_NUM', 'LEISURE_NUM', 'GYM_NUM',
       'GOLF_NUM', 'CAR_SHR_NUM', 'ANI_HSPT_NUM'

In [9]:
# Example answer vector, one entry per feature column.
# Entries 0-5: 0 = not selected, otherwise a 1-based band number (1-5) looked up in first_category.
# Entries 6+: non-zero = selected (75 % quantile is used), 0 = not selected (25 % quantile is used).
user = [5,3,3,5,2,1,0,0,0,0,0,0,0,1,1,1,1,1,1,1]

In [5]:
# Build the scaled per-area feature matrix and display it.
df = preprocessing_df()
df




Unnamed: 0_level_0,교통,치안,건강,편의시설,교육,육아,MZ_POP_CNT,COLIVING_NUM,VEGAN_CNT,KIDS_NUM,PARK_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,SAFE_DLVR_NUM,LEISURE_NUM,GYM_NUM,GOLF_NUM,CAR_SHR_NUM,ANI_HSPT_NUM
DONG_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1111051500,-1.251373,-0.558933,-0.989314,-0.944897,-1.689682,-0.803218,-1.093461,0.000000,-0.310295,-0.439314,1.266253,-0.565210,0.000000,1.296060,-0.178460,-0.203745,-0.906523,-0.792079,-0.878912,-0.583147
1111053000,0.060759,-0.168852,0.603788,0.548133,-0.968695,-0.415289,-0.907741,0.594987,0.788295,-0.439314,4.871286,1.642479,0.000000,1.504113,-0.552582,0.049270,1.341133,-0.001489,0.136329,-0.565065
1111054000,-0.736808,-0.908603,-0.994077,-0.641856,-2.957494,-0.674763,-1.242268,0.000000,0.317623,-0.439314,1.319948,0.020247,0.000000,1.458929,-0.552582,-0.435990,-0.597282,-0.792079,-0.801018,-0.987506
1111055000,-1.673661,-0.734128,-1.083540,-0.925136,-1.263754,-0.821676,-1.114231,0.000000,-1.150577,-0.439314,0.772822,-0.747557,0.000000,0.656637,-0.552582,-0.435990,-0.771202,-0.506491,-0.378039,-0.300808
1111056000,-1.639508,-2.309248,-1.084751,-1.150089,-2.462022,-0.852172,-1.231116,0.000000,-1.340511,-0.439314,-0.272328,-0.587830,0.000000,-0.461063,-0.552582,-0.435990,-0.819945,-0.609361,-0.986377,-0.987506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174065000,0.308243,0.526017,1.361103,1.593932,0.765966,-0.009298,0.957008,0.000000,0.183004,0.895012,-0.272328,0.890140,0.000000,0.521795,0.404127,1.048760,1.385863,1.465626,1.761738,1.080560
1174066000,-0.046442,0.474366,1.220490,1.434644,0.705841,-0.058585,0.560270,0.000000,0.130425,0.819839,-0.272328,0.832997,0.000000,0.485124,0.350228,0.965113,1.256714,1.338431,1.606915,0.964050
1174068500,-0.052715,0.202254,0.529415,0.269312,0.074774,-0.492045,-0.071260,0.000000,-1.134137,0.178657,0.820986,0.229103,0.000000,0.473358,-0.109498,0.389176,0.367494,0.552288,0.286368,0.928076
1174069000,-1.348040,-0.166469,-0.507202,-0.415119,-0.042176,-0.884147,-1.321984,0.000000,-1.069485,-0.439314,2.306467,-0.747557,1.578649,0.577682,-0.552582,-0.435990,-0.238741,0.264906,-0.385974,-0.140329


In [27]:
# Rebuild the scaled feature matrix (this also refreshes the module-level tmp_df
# that first_clustering copies from).
df = preprocessing_df()

# Fit the first-stage PCA + KMeans.
basic_df, first_kmeans, first_pca = first_clustering(df)
# Reference values used by user_scaling to turn user answers into feature values.
first_category, second_category = create_category(df)



In [10]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaled copy of df, keeping df's index and column labels.
# NOTE(review): minmax_df is only displayed in the cells visible here — confirm it is still needed.
minmax_scaler = MinMaxScaler()

minmax_scaler.fit(df)

minmax_data = minmax_scaler.transform(df)
minmax_df = pd.DataFrame(minmax_data)
minmax_df.index = df.index
minmax_df.columns = df.columns

In [11]:
# Show the column labels (and therefore the feature order) of the scaled frame.
minmax_df.columns

Index(['교통', '치안', '건강', '편의시설', '교육', '육아', 'MZ_POP_CNT', 'COLIVING_NUM',
       'VEGAN_CNT', 'KIDS_NUM', 'PARK_NUM', 'STARBUCKS_NUM', 'MC_NUM',
       'NOISE_VIBRATION_NUM', 'SAFE_DLVR_NUM', 'LEISURE_NUM', 'GYM_NUM',
       'GOLF_NUM', 'CAR_SHR_NUM', 'ANI_HSPT_NUM'],
      dtype='object')

In [28]:
# 1) Map the user's answers onto feature values and record which features were selected.
user_df,select = user_scaling(first_category, second_category, user,df)
# 2) Apply the weights from the weight sheet to the user's values.
weighted_user_df = weighting(user_df, df, select, 'user')
# 3) Wrap the weighted vector in a list so it is 2-D for PCA.transform / KMeans.predict.
user_scaled = [weighted_user_df.loc['user'].values]
# 4) Find the user's second-stage cluster and the dongs that belong to it.
user_group, user_include_df = user_clustering(basic_df, df , user_scaled, first_pca, first_kmeans)
# 5) Rank the dongs of that cluster by cosine similarity and keep the top 3.
# NOTE(review): the ranking uses the unweighted user_df while clustering used the weighted vector — confirm this is intended.
result_dong_list = similarity(user_df, df.loc[user_include_df.index.values], "user",3)
# 6) Print the DONG names of the recommended dongs.
print(user_include_df.loc[result_dong_list]['DONG'].values)



['가산동' '삼성1동' '소공동']




In [29]:
# Display the scaled feature matrix again.
df

Unnamed: 0_level_0,교통,치안,건강,편의시설,교육,육아,MZ_POP_CNT,COLIVING_NUM,VEGAN_CNT,KIDS_NUM,PARK_NUM,STARBUCKS_NUM,MC_NUM,NOISE_VIBRATION_NUM,SAFE_DLVR_NUM,LEISURE_NUM,GYM_NUM,GOLF_NUM,CAR_SHR_NUM,ANI_HSPT_NUM
DONG_CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1111051500,-1.251373,-0.558933,-0.989314,-0.944897,-1.689682,-0.803218,-1.093461,0.000000,-0.310295,-0.439314,1.266253,-0.565210,0.000000,1.296060,-0.178460,-0.203745,-0.906523,-0.792079,-0.878912,-0.583147
1111053000,0.060759,-0.168852,0.603788,0.548133,-0.968695,-0.415289,-0.907741,0.594987,0.788295,-0.439314,4.871286,1.642479,0.000000,1.504113,-0.552582,0.049270,1.341133,-0.001489,0.136329,-0.565065
1111054000,-0.736808,-0.908603,-0.994077,-0.641856,-2.957494,-0.674763,-1.242268,0.000000,0.317623,-0.439314,1.319948,0.020247,0.000000,1.458929,-0.552582,-0.435990,-0.597282,-0.792079,-0.801018,-0.987506
1111055000,-1.673661,-0.734128,-1.083540,-0.925136,-1.263754,-0.821676,-1.114231,0.000000,-1.150577,-0.439314,0.772822,-0.747557,0.000000,0.656637,-0.552582,-0.435990,-0.771202,-0.506491,-0.378039,-0.300808
1111056000,-1.639508,-2.309248,-1.084751,-1.150089,-2.462022,-0.852172,-1.231116,0.000000,-1.340511,-0.439314,-0.272328,-0.587830,0.000000,-0.461063,-0.552582,-0.435990,-0.819945,-0.609361,-0.986377,-0.987506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1174065000,0.308243,0.526017,1.361103,1.593932,0.765966,-0.009298,0.957008,0.000000,0.183004,0.895012,-0.272328,0.890140,0.000000,0.521795,0.404127,1.048760,1.385863,1.465626,1.761738,1.080560
1174066000,-0.046442,0.474366,1.220490,1.434644,0.705841,-0.058585,0.560270,0.000000,0.130425,0.819839,-0.272328,0.832997,0.000000,0.485124,0.350228,0.965113,1.256714,1.338431,1.606915,0.964050
1174068500,-0.052715,0.202254,0.529415,0.269312,0.074774,-0.492045,-0.071260,0.000000,-1.134137,0.178657,0.820986,0.229103,0.000000,0.473358,-0.109498,0.389176,0.367494,0.552288,0.286368,0.928076
1174069000,-1.348040,-0.166469,-0.507202,-0.415119,-0.042176,-0.884147,-1.321984,0.000000,-1.069485,-0.439314,2.306467,-0.747557,1.578649,0.577682,-0.552582,-0.435990,-0.238741,0.264906,-0.385974,-0.140329
