In [175]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [176]:
df = pd.read_csv('wondoofin.csv')
cat_df = pd.read_csv('category.csv')

In [177]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 559 entries, 0 to 558
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 559 non-null    int64 
 1   이름                 559 non-null    object
 2   로스터리               559 non-null    object
 3   타입                 559 non-null    object
 4   로스팅 포인트            559 non-null    object
 5   지속가능성              113 non-null    object
 6   컵 노트               559 non-null    object
 7   바디감                559 non-null    int64 
 8   신맛                 559 non-null    int64 
 9   단맛                 559 non-null    int64 
 10  쓴맛                 559 non-null    int64 
 11  커피 소개              559 non-null    object
 12  나라                 543 non-null    object
 13  로스터리 소개            559 non-null    object
 14  로스터리 주소            550 non-null    object
 15  이름.1               559 non-null    object
 16  식품의 유형             554 non-null    object
 1

In [178]:
df.columns

Index(['id', '이름', '로스터리', '타입', '로스팅 포인트', '지속가능성', '컵 노트', '바디감', '신맛', '단맛',
       '쓴맛', '커피 소개', '나라', '로스터리 소개', '로스터리 주소', '이름.1', '식품의 유형',
       '제조원 및 소재지', '유통기한', '제조일자', '내용량', '보관 방법', '원재료 및 함량',
       '제품문의 관련 주소 및 전화번호', '카페인', '로스터리ID'],
      dtype='object')

In [179]:
df.shape

(559, 26)

***
null 값 처리
- 지속가능성 : 113
- 나라 : 543
***
- 로스터리 주소 : 550
- 제조원 및 소재지 : 557/559
- 유통기한 : 557
- 보관 방법 : 545
- 식품의 유형 : 554
***

In [180]:
# 리스트 처리
df['컵 노트'] = df['컵 노트'].apply(literal_eval)
df['컵 노트'][0]

['토피', '맥아', '흑설탕']

***

In [181]:
#################################
########## 지속 가능성 ##########
#################################
for i in range(len(df)):
    if df['지속가능성'][i] is np.nan:
        df.at[i, '지속가능성'] = 0
        
df['지속가능성']

0         0
1         0
2       유기농
3      공정무역
4         0
       ... 
554       0
555       0
556       0
557       0
558       0
Name: 지속가능성, Length: 559, dtype: object

***
나라
***

In [182]:
df['나라'] = df['나라'].str.split(', ')
df['나라'][0]

['브라질', '과테말라']

In [183]:
for i in range(len(df)):
    if df['나라'][i] is np.nan:
        df.at[i, '나라'] = 0
        
df['나라']

0           [브라질, 과테말라]
1                 [세라도]
2                 [아리차]
3             [브라질, 인도]
4              [리무 볼레소]
             ...       
554                   0
555       [에티오피아, 콜롬비아]
556         [브라질, 콜롬비아]
557               [파젠다]
558    [과테말라, 콜롬비아, 인도]
Name: 나라, Length: 559, dtype: object

***
추천에 필요한 값들
1. 카페인 유무 : 카페인 칼럼
2. 블렌드/ 싱글오리진
3. 향 카테고리
4. 맛의 강도 : 쓴맛, 단맛, 바디감, 산미
***

In [184]:
features = pd.get_dummies(df, columns=['타입'], dtype=int)
features.head(2)

Unnamed: 0,id,이름,로스터리,로스팅 포인트,지속가능성,컵 노트,바디감,신맛,단맛,쓴맛,...,제조일자,내용량,보관 방법,원재료 및 함량,제품문의 관련 주소 및 전화번호,카페인,로스터리ID,타입_디카페인,타입_블렌드,타입_싱글오리진
0,43,데일리스윗,언더프레셔,미디엄다크,0,"[토피, 맥아, 흑설탕]",4,2,4,3,...,상단 표기일,140g / 1kg,"직사광선을 피하고 온도, 습도가 낮으며 통풍이 잘 되는 곳에 보관하여 주십시오.","커피원두 100% (브라질산 60%, 과테말라산 40%)",핸디엄 고객센터 1599-2681,1,57,0,1,0
1,3896,브라질 세라도 디카페인,운조커피,미디엄다크,0,[null],4,2,4,3,...,주문 확인 후 제조,500g 1kg,"직사광선을 피하고 온도, 습도가 낮으며 통풍이 잘 되는 곳에 밀폐 보관",커피원두 100%,운조커피 010-4535-8819,0,67,1,0,0


In [185]:
features.columns

Index(['id', '이름', '로스터리', '로스팅 포인트', '지속가능성', '컵 노트', '바디감', '신맛', '단맛', '쓴맛',
       '커피 소개', '나라', '로스터리 소개', '로스터리 주소', '이름.1', '식품의 유형', '제조원 및 소재지',
       '유통기한', '제조일자', '내용량', '보관 방법', '원재료 및 함량', '제품문의 관련 주소 및 전화번호', '카페인',
       '로스터리ID', '타입_디카페인', '타입_블렌드', '타입_싱글오리진'],
      dtype='object')

In [186]:
features = features[['id', '나라', '로스팅 포인트', '지속가능성', '바디감', '신맛', '단맛', '쓴맛', '타입_디카페인', '타입_블렌드', '타입_싱글오리진']]
features.head(2)

Unnamed: 0,id,나라,로스팅 포인트,지속가능성,바디감,신맛,단맛,쓴맛,타입_디카페인,타입_블렌드,타입_싱글오리진
0,43,"[브라질, 과테말라]",미디엄다크,0,4,2,4,3,0,1,0
1,3896,[세라도],미디엄다크,0,4,2,4,3,1,0,0


In [187]:
features['지속가능성'].unique()

array([0, '유기농', '공정무역', '직접무역'], dtype=object)

In [188]:
features['로스팅 포인트'].unique()

array(['미디엄다크', '라이트미디엄', '다크', '미디엄', '라이트'], dtype=object)

In [189]:
features = pd.get_dummies(features, columns=['지속가능성', '로스팅 포인트'], dtype=int)
features.head(2)

Unnamed: 0,id,나라,바디감,신맛,단맛,쓴맛,타입_디카페인,타입_블렌드,타입_싱글오리진,지속가능성_0,지속가능성_공정무역,지속가능성_유기농,지속가능성_직접무역,로스팅 포인트_다크,로스팅 포인트_라이트,로스팅 포인트_라이트미디엄,로스팅 포인트_미디엄,로스팅 포인트_미디엄다크
0,43,"[브라질, 과테말라]",4,2,4,3,0,1,0,1,0,0,0,0,0,0,0,1
1,3896,[세라도],4,2,4,3,1,0,0,1,0,0,0,0,0,0,0,1


In [190]:
notes = pd.read_csv('category.csv')
notes = notes.loc[:, notes.columns!='컵 노트']
notes.head(2)

Unnamed: 0,id,꽃,과일,허브,달콤함,고소함,향료_풍미,초콜릿
0,43,0,0,0,1,0,1,0
1,3896,0,0,0,0,0,0,0


In [191]:
features = pd.merge(features, notes, on="id")
features.head(2)

Unnamed: 0,id,나라,바디감,신맛,단맛,쓴맛,타입_디카페인,타입_블렌드,타입_싱글오리진,지속가능성_0,...,로스팅 포인트_라이트미디엄,로스팅 포인트_미디엄,로스팅 포인트_미디엄다크,꽃,과일,허브,달콤함,고소함,향료_풍미,초콜릿
0,43,"[브라질, 과테말라]",4,2,4,3,0,1,0,1,...,0,0,1,0,0,0,1,0,1,0
1,3896,[세라도],4,2,4,3,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [192]:
features = features.loc[:, features.columns!='나라']
features = features.set_index('id')
features.head(2)

Unnamed: 0_level_0,바디감,신맛,단맛,쓴맛,타입_디카페인,타입_블렌드,타입_싱글오리진,지속가능성_0,지속가능성_공정무역,지속가능성_유기농,...,로스팅 포인트_라이트미디엄,로스팅 포인트_미디엄,로스팅 포인트_미디엄다크,꽃,과일,허브,달콤함,고소함,향료_풍미,초콜릿
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43,4,2,4,3,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
3896,4,2,4,3,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0


In [193]:
features.to_csv('features.csv')

Unnamed: 0_level_0,바디감,신맛,단맛,쓴맛,타입_디카페인,타입_블렌드,타입_싱글오리진,지속가능성_0,지속가능성_공정무역,지속가능성_유기농,...,로스팅 포인트_라이트미디엄,로스팅 포인트_미디엄,로스팅 포인트_미디엄다크,꽃,과일,허브,달콤함,고소함,향료_풍미,초콜릿
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
43,4,2,4,3,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
3896,4,2,4,3,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
3458,3,4,5,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
22,4,1,3,4,0,1,0,0,1,0,...,0,0,0,0,0,0,1,1,0,1
180,5,3,5,1,0,0,1,1,0,0,...,0,0,1,0,1,0,0,0,0,1


***
contetns based rec
***

In [None]:
import pandas as pd
# import numpy as np
# from ast import literal_eval

In [27]:
features = pd.read_csv('features.csv')#, index_col='id')
features.head()

Unnamed: 0,id,바디감,신맛,단맛,쓴맛,타입_디카페인,타입_블렌드,타입_싱글오리진,지속가능성_0,지속가능성_공정무역,...,로스팅 포인트_라이트미디엄,로스팅 포인트_미디엄,로스팅 포인트_미디엄다크,꽃,과일,허브,달콤함,고소함,향료_풍미,초콜릿
0,43,4,2,4,3,0,1,0,1,0,...,0,0,1,0,0,0,1,0,1,0
1,3896,4,2,4,3,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,3458,3,4,5,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
3,22,4,1,3,4,0,1,0,0,1,...,0,0,0,0,0,0,1,1,0,1
4,180,5,3,5,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,1


In [28]:
features.columns

Index(['id', '바디감', '신맛', '단맛', '쓴맛', '타입_디카페인', '타입_블렌드', '타입_싱글오리진',
       '지속가능성_0', '지속가능성_공정무역', '지속가능성_유기농', '지속가능성_직접무역', '로스팅 포인트_다크',
       '로스팅 포인트_라이트', '로스팅 포인트_라이트미디엄', '로스팅 포인트_미디엄', '로스팅 포인트_미디엄다크', '꽃',
       '과일', '허브', '달콤함', '고소함', '향료_풍미', '초콜릿'],
      dtype='object')

In [31]:
features = features.loc[:, features.columns!='id']

In [32]:
embeddings = features.values
embeddings.shape

(559, 23)

In [33]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 1.3.2.


In [34]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity_matrix = cosine_similarity(embeddings, embeddings)
cosine_similarity_matrix.shape

(559, 559)

In [35]:
cosine_similarity_matrix[:2,:2]

array([[1.        , 0.95938348],
       [0.95938348, 1.        ]])

In [36]:
df_copy = df.copy()
#df_copy = df_copy.set_index('id')

In [37]:
def most_similar(coffee_id, top_n=10):
    idx = df[df['id'] == coffee_id].index[0]
    df_copy['cosine_similarity'] = cosine_similarity_matrix[idx]
    return df_copy.sort_values(by='cosine_similarity', ascending=False)[:top_n]

In [38]:
most_similar(12, top_n=5)

Unnamed: 0,id,이름,로스터리,타입,로스팅 포인트,지속가능성,컵 노트,바디감,신맛,단맛,...,유통기한,제조일자,내용량,보관 방법,원재료 및 함량,제품문의 관련 주소 및 전화번호,카페인,로스터리ID,price,cosine_similarity
159,168,블렌드 A,리버벨,블렌드,미디엄다크,0,"['밀크 초콜릿', '견과류']",4,2,4,...,제조일부터 1년 이내,별도 표기,150g / 1kg,상온 보관,원두커피 100%,코케 고객센터 070-4647-1868,1,26,13000,1.0
196,12,플레이어,스티머스 커피팩토리,블렌드,미디엄다크,0,"['초콜릿', '견과류']",4,2,4,...,제조일로부터 2달 이내,상품 뒷면 참조,200g / 1kg,없음,COLOMBIA 40% + INDIA 30% + Ethiopia 20% + Braz...,코케 고객센터 070-4647-1868,1,46,13000,1.0
129,1342,F 블렌드,리플로우 커피 로스터스,블렌드,미디엄다크,0,"['견과류', '밀크 초콜릿']",4,2,4,...,제조일로부터 1년,별도표기,200g / 1kg,서늘한 곳에 보관,"커피 원두(브라질 60%, 콜롬비아 40%)",코케 고객센터 070-4647-1868,1,29,9200,0.992781
543,191,리들리,식물학 커피로스터스,블렌드,미디엄다크,0,"['구운 견과류', '아몬드 초콜릿', '몰트']",4,2,4,...,제조일로부터 1년,제조일 별도 표기(제품 뒷면),250g / 1kg,직사광선을 피하고 밀봉하여 건조하고 서늘한 곳에 보관,커피원두 100%,코케 고객센터 070-4647-1868,1,48,15000,0.990148
522,11,엘 카미노,피어 커피로스터스,블렌드,미디엄다크,0,"['견과류', '초콜릿']",3,1,4,...,제조일로부터 1년,상품후면표기,1000g,직사광선을 피하고 서늘한 곳에 보관하십시오,커피원두 100%,코케 고객센터 070-4647-1868,1,104,14000,0.98387


***
other similarity calculation methods
* Euclidean distance
***

In [5]:
from sklearn.metrics.pairwise import euclidean_distances
features = pd.read_csv('features.csv')
features = features.loc[:, features.columns!='id']
features.head(2)

Unnamed: 0,바디감,신맛,단맛,쓴맛,타입_디카페인,타입_블렌드,타입_싱글오리진,지속가능성_0,지속가능성_공정무역,지속가능성_유기농,...,로스팅 포인트_라이트미디엄,로스팅 포인트_미디엄,로스팅 포인트_미디엄다크,꽃,과일,허브,달콤함,고소함,향료_풍미,초콜릿
0,4,2,4,3,0,1,0,1,0,0,...,0,0,1,0,0,0,1,0,1,0
1,4,2,4,3,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0


In [7]:
embeddings = features.values
euclidean_distance_matrix = euclidean_distances(embeddings, embeddings)
print('embeddings : ', embeddings.shape)
print('matrix : ', euclidean_distance_matrix.shape)

embeddings :  (559, 23)
matrix :  (559, 559)


In [11]:
df = pd.read_csv('wondoofin.csv')

In [12]:
df_copy = df.copy()

In [15]:
def calculate_euclidean_distances(coffee_id, top_n=10):
    idx = df[df['id'] == coffee_id].index[0]

    df_copy['euclidean_distance'] = euclidean_distance_matrix[idx]
    #result_df = df_copy.sort_values(by='euclidean_distance', ascending=True)[:top_n]
    return df_copy.sort_values(by='euclidean_distance', ascending=True)[:top_n] #result_df['id'].tolist()

In [17]:
calculate_euclidean_distances(12, top_n=5)

Unnamed: 0,id,이름,로스터리,타입,로스팅 포인트,지속가능성,컵 노트,바디감,신맛,단맛,...,유통기한,제조일자,내용량,보관 방법,원재료 및 함량,제품문의 관련 주소 및 전화번호,카페인,로스터리ID,price,euclidean_distance
159,168,블렌드 A,리버벨,블렌드,미디엄다크,0,"['밀크 초콜릿', '견과류']",4,2,4,...,제조일부터 1년 이내,별도 표기,150g / 1kg,상온 보관,원두커피 100%,코케 고객센터 070-4647-1868,1,26,13000,0.0
196,12,플레이어,스티머스 커피팩토리,블렌드,미디엄다크,0,"['초콜릿', '견과류']",4,2,4,...,제조일로부터 2달 이내,상품 뒷면 참조,200g / 1kg,없음,COLOMBIA 40% + INDIA 30% + Ethiopia 20% + Braz...,코케 고객센터 070-4647-1868,1,46,13000,0.0
543,191,리들리,식물학 커피로스터스,블렌드,미디엄다크,0,"['구운 견과류', '아몬드 초콜릿', '몰트']",4,2,4,...,제조일로부터 1년,제조일 별도 표기(제품 뒷면),250g / 1kg,직사광선을 피하고 밀봉하여 건조하고 서늘한 곳에 보관,커피원두 100%,코케 고객센터 070-4647-1868,1,48,15000,1.0
129,1342,F 블렌드,리플로우 커피 로스터스,블렌드,미디엄다크,0,"['견과류', '밀크 초콜릿']",4,2,4,...,제조일로부터 1년,별도표기,200g / 1kg,서늘한 곳에 보관,"커피 원두(브라질 60%, 콜롬비아 40%)",코케 고객센터 070-4647-1868,1,29,9200,1.0
331,217,77도 블렌드,55도 커피로스터스,블렌드,미디엄다크,0,"['견과류', '초콜릿', '황설탕']",4,1,4,...,제조일로부터24개월,전면 별도 표기,200g / 1kg,"직사광선을 피해 온도, 습도가 낮으며 통풍이 잘 되는 곳에 보관하십시오.",커피원두100%,코케 고객센터 070-4647-1868,1,2,10000,1.414214


***
사용자 입력값으로 추천
***

In [11]:
import pandas as pd

In [185]:
df = pd.read_csv('features.csv')#, index_col='id')
df.head()

Unnamed: 0,id,바디감,신맛,단맛,쓴맛,타입_디카페인,타입_블렌드,타입_싱글오리진,지속가능성_0,지속가능성_공정무역,...,로스팅 포인트_라이트미디엄,로스팅 포인트_미디엄,로스팅 포인트_미디엄다크,꽃,과일,허브,달콤함,고소함,향료_풍미,초콜릿
0,43,4,2,4,3,0,1,0,1,0,...,0,0,1,0,0,0,1,0,1,0
1,3896,4,2,4,3,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,3458,3,4,5,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
3,22,4,1,3,4,0,1,0,0,1,...,0,0,0,0,0,0,1,1,0,1
4,180,5,3,5,1,0,0,1,1,0,...,0,0,1,0,1,0,0,0,0,1


In [186]:
features = df[['바디감', '신맛', '단맛', '쓴맛', '타입_디카페인', '타입_블렌드', '꽃',
       '과일', '허브', '달콤함', '고소함', '향료_풍미', '초콜릿']] # '타입_싱글오리진'
column_mapping = {'바디감' : 'body', '신맛' : 'sour', '단맛' : 'sweet', '쓴맛' : 'bitter', '타입_디카페인' : 'caf', '타입_블렌드' : 'blend', '향료_풍미' : '향료', '달콤함' : '달콤한', '고소함':'고소한'} # 'notes'

# Rename the columns using the rename() method
features.rename(columns=column_mapping, inplace=True)
features.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.rename(columns=column_mapping, inplace=True)


Unnamed: 0,body,sour,sweet,bitter,caf,blend,꽃,과일,허브,달콤한,고소한,향료,초콜릿
0,4,2,4,3,0,1,0,0,0,1,0,1,0
1,4,2,4,3,1,0,0,0,0,0,0,0,0


In [187]:
# add user preference : {'caf': None, 'blend': None, 'notes': [], 'sour': None, 'sweet': None, 'bitter': None, 'body': None}
new_row_dict = {'caf': 0, 'blend': 1, 'notes': ['꽃', '과일', '달콤한'], 'sour': 4, 'sweet': 3, 'bitter': 2, 'body': 3}

# Create a new dictionary
new_dict = new_row_dict.copy()

# Add other keys with initial values
new_keys = ['꽃', '과일', '달콤한', '허브', '고소한', '향료', '초콜릿']

for key in new_keys:
    new_dict[key] = 0

# Iterate through the 'notes' list and add new keys to 'new_dict'
for note in new_row_dict.get('notes', []):
    new_dict[note] = 1

del new_dict['notes']

# new_row = pd.DataFrame.from_dict(new_dict, orient='index').T
# desired_order = ['body', 'sour', 'sweet', 'bitter', 'caf', 'blend', '꽃', '과일', '허브', '달콤한', '고소한', '향료', '초콜릿']
# new_row = new_row[desired_order]

# Print the DataFrame
# print(new_row)
# Using loc indexer
df.loc[len(df)] = new_dict
features.loc[len(features)] = new_dict

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.loc[len(features)] = new_dict


In [188]:
embeddings = features.values
embeddings.shape

(560, 13)

In [189]:
import numpy as np

# Check for NaN values
nan_mask = np.isnan(embeddings)

# print(nan_mask)
if nan_mask.any():
    print('there is nan')

In [190]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity_matrix = cosine_similarity(embeddings, embeddings)
cosine_similarity_matrix.shape

(560, 560)

In [139]:
def most_similar(new_row, existing_rows, top_n=10):
    # Calculate cosine similarity between the new row and existing rows
    cosine_sim = cosine_similarity([new_row], existing_rows)

    # Create a DataFrame with cosine similarities
    result_df = pd.DataFrame({'cosine_similarity': cosine_sim[0]})

    # Sort by cosine similarity in descending order and get the top_n similar items
    result_df['id'] = result_df.index
    result_df = result_df.sort_values(by='cosine_similarity', ascending=False)[:top_n]

    return result_df['id'].tolist()

In [207]:
def cos_recommendation(top_n=10):
    # embeddings = features.values
    # cosine_similarity_matrix = cosine_similarity(embeddings, embeddings)
    
    df_copy = df.copy()
    
    df_copy['cosine_similarity'] = cosine_similarity_matrix[-1]
    # result_df = df_copy.sort_values(by='cosine_similarity', ascending=False)[:top_n]
    result_df = df_copy[df_copy['id'].notna()].sort_values(by='cosine_similarity', ascending=False)[:top_n]
    return result_df['id'].tolist()

In [192]:
new_row

array([[3, 4, 3, 2, 0, 1, 1, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [203]:
df_copy = df.copy()
df_copy['cosine_similarity'] = cosine_similarity_matrix[-1]
    # result_df = df_copy.sort_values(by='cosine_similarity', ascending=False)[:top_n]
result_df = df_copy[df_copy['id'].notna()].sort_values(by='cosine_similarity', ascending=False)[:5]
result_df['id'].tolist()

[53.0, 56.0, 13.0, 1341.0, 1048.0]

In [212]:
df = pd.read_csv('features.csv')
features = df.loc[:, df.columns != 'id']

def cos_recommendation(user_input, top_n=5):
    column_mapping = {'바디감': 'body', '신맛': 'sour', '단맛': 'sweet', '쓴맛': 'bitter', '타입_디카페인': 'caf', '타입_블렌드': 'blend',
                      '향료_풍미': '향료', '달콤함': '달콤한', '고소함': '고소한'}  # 'notes'
    df.rename(columns=column_mapping, inplace=True)
    
    # Create a new dictionary
    new_dict = user_input.copy()
    # Add other keys with initial values
    new_keys = ['꽃', '과일', '달콤한', '허브', '고소한', '향료', '초콜릿']
    for key in new_keys:
        new_dict[key] = 0
    # Iterate through the 'notes' list and add new keys to 'new_dict'
    for note in user_input.get('notes', []):
        new_dict[note] = 1
    del new_dict['notes']
    df.loc[len(df)] = new_dict

    features_df = df[
        ['body', 'sour', 'sweet', 'bitter', 'caf', 'blend', '꽃', '과일', '허브', '달콤한', '고소한', '향료', '초콜릿']]  # '타입_싱글오리진'
    
    
    embeddings = features_df.values
    cosine_similarity_matrix = cosine_similarity(embeddings, embeddings)
    df_copy = df.copy()
    df_copy['cosine_similarity'] = cosine_similarity_matrix[-1]
    result_df = df_copy[df_copy['id'].notna()].sort_values(by='cosine_similarity', ascending=False)[:top_n]
    return result_df['id'].tolist()

In [213]:
new_row_dict = {'caf': 0, 'blend': 1, 'notes': ['꽃', '과일', '달콤한'], 'sour': 4, 'sweet': 3, 'bitter': 2, 'body': 3}
cos_recommendation(new_row_dict, 3)

[53.0, 56.0, 13.0]