In [None]:
#!pip install chromadb sentence-transformers chromadb

In [1]:
import pandas as pd
import torch

In [2]:
article_info = pd.read_csv('data/article_info.csv')
view_log = pd.read_csv('data/view_log.csv')

In [8]:
from  langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

In [4]:
def embed_file(CACHE_DIR, model_name = 'BAAI/bge-m3'):
    
    
    model_kwargs = {
        # "device": "mps"
        "device": "cpu"
    }
    encode_kwargs = {"normalize_embeddings": True}
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
        cache_folder = CACHE_DIR
    )
    
    return embeddings

In [5]:
embedding = embed_file('pretrained/')

  warn_deprecated(
  from .autonotebook import tqdm as notebook_tqdm


In [6]:
docs = [
    Document(page_content = row['Title'], metadata = {'id' : row['articleID']}) for idx, row in article_info.iterrows()
]

In [11]:
vectorstore = Chroma.from_documents(docs, embedding) # 시간걸림

In [12]:
from tqdm import tqdm

# 기사별 상위 5개 기사와 점수(유사도) 저장
reco_dics = {id : {'docs' : [], 'scores' : []} for id in view_log['articleID'].unique()}
for article_id in tqdm(view_log['articleID'].unique(), total = len(view_log['articleID'].unique())):
    title = article_info[article_info['articleID'] == article_id]['Title'].iloc[0]
    for relevance_data in vectorstore.similarity_search_with_relevance_scores(title, k =5):
        doc, score = relevance_data
        if  article_id != doc.metadata['id']:
            
            reco_dics[article_id]['docs'].append(doc.metadata['id'])
            reco_dics[article_id]['scores'].append(score)

100%|██████████| 2879/2879 [14:53<00:00,  3.22it/s]


In [28]:
reco_dics['ARTICLE_0661']

{'docs': ['ARTICLE_2373', 'ARTICLE_0814', 'ARTICLE_2560', 'ARTICLE_2850'],
 'scores': [0.5532439958404025,
  0.4931458351712029,
  0.48120356779248963,
  0.4781335070532591]}

In [26]:
from collections import defaultdict
# 유저가 봤던 기사들의 상위 5개 목록을 모두 가져와 평균 계산
person_reco_dics = {id : [] for id in view_log['userID'].unique()}
for user_id in tqdm(view_log['userID'].unique(), total = len(view_log['userID'].unique())):
    
    view_articles = view_log[view_log['userID'] == user_id]['articleID'].values
    
    reco_list_dict = defaultdict(int)
    for article_id in view_articles:
        for doc, score in zip(reco_dics[article_id]['docs'], reco_dics[article_id]['scores']):
            reco_list_dict[doc] += score

    # 상위 5개 목록만 각 유저 아이디에 추가
    person_reco_dics[user_id] = sorted(reco_list_dict, key=lambda x: x[1], reverse=True)[:5]


100%|██████████| 1415/1415 [00:02<00:00, 609.56it/s]


In [27]:
person_reco_dics

{'USER_0000': ['ARTICLE_2373',
  'ARTICLE_0814',
  'ARTICLE_2560',
  'ARTICLE_2850',
  'ARTICLE_1965'],
 'USER_0001': ['ARTICLE_1902',
  'ARTICLE_1227',
  'ARTICLE_0418',
  'ARTICLE_0801',
  'ARTICLE_0094'],
 'USER_0002': ['ARTICLE_2554',
  'ARTICLE_1608',
  'ARTICLE_0972',
  'ARTICLE_1427',
  'ARTICLE_2978'],
 'USER_0003': ['ARTICLE_0476',
  'ARTICLE_0977',
  'ARTICLE_0494',
  'ARTICLE_1160',
  'ARTICLE_2865'],
 'USER_0004': ['ARTICLE_2834',
  'ARTICLE_0351',
  'ARTICLE_2642',
  'ARTICLE_1841',
  'ARTICLE_1566'],
 'USER_0005': ['ARTICLE_2658',
  'ARTICLE_0274',
  'ARTICLE_0036',
  'ARTICLE_2133',
  'ARTICLE_2873'],
 'USER_0006': ['ARTICLE_0163',
  'ARTICLE_1598',
  'ARTICLE_1966',
  'ARTICLE_1842',
  'ARTICLE_1757'],
 'USER_0007': ['ARTICLE_2197',
  'ARTICLE_0433',
  'ARTICLE_0979',
  'ARTICLE_0045',
  'ARTICLE_2684'],
 'USER_0008': ['ARTICLE_2505',
  'ARTICLE_0914',
  'ARTICLE_2497',
  'ARTICLE_2389',
  'ARTICLE_1481'],
 'USER_0009': ['ARTICLE_2199',
  'ARTICLE_0849',
  'ARTICLE_0323