# 유사도 메트릭스를 활용한 추천

In [1]:
import pandas as pd
import xlrd
import re
import numpy as np
import math

대출금액을 신용대출과 담보대출금액으로 나눔

In [2]:
def add_sinyong_amt_column(file):
    """Load an Excel sheet and add the '신용금액' (credit amount) column.

    The new column is computed as '총합금액' (total amount) minus
    '담보금액' (collateral amount), splitting the loan amount into its
    credit and collateral parts.

    Args:
        file: Path (or file-like object) passed straight to ``pd.read_excel``.

    Returns:
        The loaded DataFrame with the extra '신용금액' column.
    """
    loans = pd.read_excel(file)
    total_amount = loans['총합금액']
    collateral_amount = loans['담보금액']
    loans['신용금액'] = total_amount - collateral_amount
    return loans

In [3]:
# Load the preprocessed loan data and derive the credit-amount column.
preprocessed_data = add_sinyong_amt_column(
    'preprocessed_data.xlsx',
)

In [4]:
def feature_select(dataframe,important_features):
    """Keep only the important feature columns and expose the index as a column.

    Feature glossary (from the original notes):
      * RTI (Repayment to Income): monthly repayment / monthly salary
      * LTI (Loan to Income): total debt / annual income

    Args:
        dataframe: Source DataFrame.
        important_features: Column labels to keep, in the desired order.

    Returns:
        A new DataFrame restricted to ``important_features`` whose previous
        index has been moved into a regular column by ``reset_index`` and
        replaced with a fresh 0..n-1 index.
    """
    # Important-feature selection.
    selected = pd.DataFrame(dataframe, columns=important_features)
    return selected.reset_index()

뽑을 feature 들 정하기

In [5]:
# Feature columns to extract for the similarity computation.
ipt_ftrs = [
    '연금리',
    '상품',
    '만기',
    '대출금액',
    '8percent예측불량률',
    '신용금액',
    '담보금액',
    '월소득',
    'RTI',
    '월가처분소득',
    'LTI',
    '근무기간',
]

In [6]:
# All bonds, restricted to the features listed in ipt_ftrs.
df = feature_select(preprocessed_data, ipt_ftrs)

In [7]:
# Bonds that can currently be invested in, restricted to the ipt_ftrs features.
df_now_funding = feature_select(
    preprocessed_data[preprocessed_data['투자현황'] == 'None'],
    ipt_ftrs,
)

In [8]:
# Reverse lookup: find a dictionary key from its value.
def dictionary_value_to_key(val,words_basket):
    """Return the first key of ``words_basket`` whose value equals ``val``.

    Keys are examined in the dictionary's iteration order. ``None`` is
    returned when no value matches.
    """
    matching_keys = (key for key, value in words_basket.items() if val == value)
    return next(matching_keys, None)

In [9]:
# Count-vectorize the '상품' (product) column, similar to one-hot encoding.
# df = frame to encode, original = frame used to build the vocabulary.
def df_CountVectorizer(df,original):
    """Replace the '상품' column of ``df`` with CountVectorizer token columns.

    The vocabulary is always learned from ``original['상품']`` so that every
    frame encoded against the same ``original`` gets the same ``one_hot_*``
    columns, but the counts are computed from the rows of ``df`` itself.

    Args:
        df: Frame to encode. Must contain the columns 'index' and '상품'.
        original: Frame whose '상품' column defines the vocabulary.

    Returns:
        A copy of ``df`` without the 'index' and '상품' columns, extended with
        one ``one_hot_<token>`` column per vocabulary token and indexed by the
        values of ``df['index']``.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    # Fit on the full data set: the vocabulary (and thus the column layout)
    # must not depend on which subset is being encoded.
    vectorizer.fit(original['상품'])
    words_basket = vectorizer.vocabulary_
    # Encode the rows of `df`, not of `original`. Encoding `original` and
    # joining by position attached the product vector of an unrelated row
    # whenever `df` was a subset of `original`.
    df_corpus = pd.DataFrame(
        vectorizer.transform(df['상품']).toarray(),
        index=df.index,
    )
    # Column j of the count matrix belongs to the token whose vocabulary
    # value is j; name the column after that token.
    df_corpus.rename(columns=lambda x: str('one_hot_'+dictionary_value_to_key(x,words_basket)), inplace=True)
    df_join_corpus = df.join(df_corpus)
    del df_join_corpus['index']
    del df_join_corpus['상품']
    # Restore the labels that were stored in the 'index' column.
    df_join_corpus.index = df['index']
    return df_join_corpus

In [10]:
# Encoded frames: all bonds (here) / investable bonds (next cell).
df_after_cnt_vec = df_CountVectorizer(df, df)

In [11]:
# Encode the investable bonds with the vocabulary built from all bonds.
df_now_funding_after_cnt_vec = df_CountVectorizer(
    df_now_funding,
    df,
)

In [12]:
# Persist the encoded frame. The sheet name is passed by keyword because
# newer pandas releases deprecate positional arguments after the file path.
df_after_cnt_vec.to_excel('df_after_cnt_vec.xlsx', sheet_name='df_after_cnt_vec')

In [13]:
# Build the pairwise matrix between all bonds and the investable bonds.
from scipy.spatial import distance
def make_sim_matrix(distance_func=distance.cosine):
    """Compute pairwise distances between all bonds and investable bonds.

    Reads the module-level frames ``df_after_cnt_vec`` (rows of the result)
    and ``df_now_funding_after_cnt_vec`` (columns of the result).

    Args:
        distance_func: Callable taking two vectors and returning a number.
            Defaults to ``scipy.spatial.distance.cosine``, so smaller values
            mean "more similar".

    Returns:
        DataFrame indexed by the labels of ``df_after_cnt_vec`` with one
        column per label of ``df_now_funding_after_cnt_vec``; each cell is
        the distance rounded to 5 decimals.
    """
    bonds1 = df_after_cnt_vec.index
    bonds2 = df_now_funding_after_cnt_vec.index
    # Convert both frames to plain row lists once instead of doing a .loc
    # lookup for every (row, column) pair.
    all_vectors = df_after_cnt_vec.values.tolist()
    funding_vectors = df_now_funding_after_cnt_vec.values.tolist()
    distances = [
        [round(distance_func(u1_vec, u2_vec), 5) for u2_vec in funding_vectors]
        for u1_vec in all_vectors
    ]

    sim_matrix = pd.DataFrame(distances, index=bonds1, columns=bonds2)
    # A bond compared with itself has distance 0; overwrite that cell with 1
    # so the bond is not recommended for itself.
    # NOTE(review): 1 is only the largest possible value if distances stay
    # within [0, 1] - confirm for the feature set in use.
    for label in sim_matrix.columns:
        # Use a single .at access: chained `sim_matrix[i][i] = 1` is not
        # guaranteed to write into the frame. Labels missing from the rows
        # are skipped instead of raising KeyError.
        if label in sim_matrix.index and sim_matrix.at[label, label] == 0:
            sim_matrix.at[label, label] = 1
    return sim_matrix

In [14]:
# Distance matrix using the default cosine distance.
sim_matrix = make_sim_matrix()

In [15]:
# Display the matrix as the notebook cell output.
sim_matrix

index,1089,1092,1100,1128
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
56,0.85781,0.49327,0.50051,0.16526
58,0.26074,0.30294,0.25222,0.49658
59,0.56837,0.04426,0.04125,0.14079
60,0.61764,0.03133,0.03664,0.11860
62,0.60963,0.02311,0.03194,0.13217
63,0.52772,0.33521,0.31255,0.48540
64,0.76145,0.24361,0.23972,0.01558
66,0.49173,0.12230,0.09984,0.22671
67,0.18098,0.53596,0.46616,0.69566
69,0.55637,0.01430,0.02525,0.17240


In [16]:
# For a bond (e.g. one that was repaid early), show the currently investable
# bonds that are most similar to it (excluding the bond itself).
BASE_URL='https://8percent.kr/investment/detail/'
def df_reccomendation(sim_matrix,n):
    """Return the ``n`` closest investable bonds for every row of ``sim_matrix``.

    Args:
        sim_matrix: DataFrame of distances; rows are bonds, columns are
            candidate bonds whose labels are bond numbers. Smaller values
            rank first.
        n: Number of recommendations per bond.

    Returns:
        DataFrame with a 'url_number' column (the row label) followed by
        columns ``0 .. n-1`` holding the detail-page URLs of the closest
        candidates, best match first.

    Raises:
        IndexError: If ``n`` exceeds the number of candidate columns while
            ``sim_matrix`` has at least one row.
    """
    if len(sim_matrix.index) and n > len(sim_matrix.columns):
        raise IndexError(
            'requested %d recommendations but only %d candidates exist'
            % (n, len(sim_matrix.columns))
        )

    records = []
    for idx, row in sim_matrix.iterrows():
        # Series.sort() no longer exists in pandas; sort_values() returns a
        # sorted copy. Sort once per row (stable, so ties keep column order)
        # and take the first n labels.
        nearest = row.sort_values(kind='mergesort').index[:n]
        records.append(
            [idx] + [BASE_URL + str(int(label)) for label in nearest]
        )
    return pd.DataFrame(records, columns=['url_number'] + list(range(n)))

In [17]:
# Top-3 recommendations for every bond; show the first row.
result = df_reccomendation(sim_matrix, 3)
result.head(1)



Unnamed: 0,url_number,0,1,2
0,56,https://8percent.kr/investment/detail/1128,https://8percent.kr/investment/detail/1092,https://8percent.kr/investment/detail/1100


In [18]:
# Look up the recommendations for bond number 1011.
result[result['url_number'] == 1011]

Unnamed: 0,url_number,0,1,2
815,1011,https://8percent.kr/investment/detail/1100,https://8percent.kr/investment/detail/1092,https://8percent.kr/investment/detail/1128
