<a href="https://colab.research.google.com/github/KJM94/Single_project/blob/main/%EC%9B%B9%20%EA%B8%B0%EC%82%AC%20%EC%B6%94%EC%B2%9C%20AI%20%EA%B2%BD%EC%A7%84%EB%8C%80%ED%9A%8C/Untitled7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse.linalg import svds
import re

# Load the article metadata and the user view log from the working directory.
article_info, view_log = (
    pd.read_csv(csv_path)
    for csv_path in ('./article_info.csv', './view_log.csv')
)

# Text preprocessing
def preprocess_text(text):
    """Normalize raw article text before TF-IDF vectorization.

    The text is lower-cased, runs of digits and all punctuation are removed,
    and whitespace is collapsed last so that the gaps left behind by removed
    characters do not survive as double spaces.

    Args:
        text: Raw text. Non-string values (for example the float NaN that
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        The cleaned string with single spaces between words and no leading
        or trailing whitespace.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace only after the removals above, otherwise "a - b"
    # would end up as "a  b".
    return re.sub(r'\s+', ' ', text).strip()

# Clean every article body; a missing body is cleaned as empty text instead
# of raising inside preprocess_text.
article_info['Content'] = article_info['Content'].fillna('').apply(preprocess_text)

# TF-IDF vectorization over unigrams and bigrams; terms found in more than
# 85% of the articles or in fewer than 2 articles are dropped.
tfidf = TfidfVectorizer(stop_words='english', max_df=0.85, min_df=2, ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(article_info['Content'])

# Map each article ID to its row label in article_info.
# Series.drop_duplicates() compares the *values* (row labels, always unique),
# so it never removed repeated article IDs; de-duplicate on the index instead
# and keep the first occurrence so that a lookup always yields a scalar.
indices = pd.Series(article_info.index, index=article_info['articleID'])
indices = indices[~indices.index.duplicated(keep='first')]

# Pairwise article similarity. linear_kernel is a plain dot product, which
# equals cosine similarity here because TfidfVectorizer L2-normalizes rows
# by default.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


In [None]:
# Build the user x article interaction matrix: each cell is the number of
# view_log rows for that (userID, articleID) pair, 0 when there are none.
interaction_matrix = view_log.pivot_table(index='userID', columns='articleID', aggfunc='size', fill_value=0)
interaction_matrix = interaction_matrix.astype(np.float64)

# Truncated SVD. With the default solver svds requires k < min(shape), so the
# requested 50 factors are clamped for small data sets instead of raising.
# The matrix is handed over as a plain ndarray rather than a DataFrame.
n_factors = min(50, min(interaction_matrix.shape) - 1)
U, sigma, Vt = svds(interaction_matrix.to_numpy(), k=n_factors)
sigma = np.diag(sigma)

# Low-rank reconstruction used as the predicted score of every article for
# every user. Rows follow the row order of interaction_matrix; columns are
# labelled with the article IDs.
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings = pd.DataFrame(predicted_ratings, columns=interaction_matrix.columns)


In [None]:
# Collaborative-filtering recommendations
def collaborative_filtering(user_id, num_recommendations=5):
    """Return the article IDs with the highest predicted score for a user.

    Args:
        user_id: A label present in the index of ``interaction_matrix``.
        num_recommendations: Maximum number of article IDs to return.

    Returns:
        A pandas Index of article IDs ordered by descending predicted score.
        Articles the user has already viewed are not filtered out here.

    Raises:
        KeyError: If ``user_id`` is not in ``interaction_matrix``.
    """
    # predicted_ratings has no user labels, so go through the row position.
    row_position = interaction_matrix.index.get_loc(user_id)
    user_scores = predicted_ratings.iloc[row_position]
    ranked_scores = user_scores.sort_values(ascending=False)
    return ranked_scores.index[:num_recommendations]

# Content-based recommendations
def get_content_based_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar in content to one article.

    Args:
        article_id: An article ID present in ``indices``.
        num_recommendations: Maximum number of similar articles to return.

    Returns:
        A pandas Series of article IDs ordered by descending similarity.
        The queried article itself is never included.

    Raises:
        KeyError: If ``article_id`` is not in ``indices``.
    """
    idx = indices[article_id]
    scores = np.asarray(cosine_sim[idx])
    # Stable sort on the negated scores: descending similarity, with ties
    # kept in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query article by position rather than assuming it sorts first;
    # duplicated or all-zero TF-IDF rows tie with it and could otherwise push
    # it into the results while a real neighbour is skipped.
    ranked = ranked[ranked != idx][:num_recommendations]
    return article_info['articleID'].iloc[ranked]

# Hybrid recommendation system
def hybrid_recommendation_system(user_id, num_recommendations=5, weight_cf=0.6, weight_cb=0.4):
    """Blend collaborative-filtering and content-based candidates for a user.

    Each of the top ``2 * num_recommendations`` collaborative-filtering
    candidates starts with a score of ``weight_cf``. For every entry in the
    user's view log, the two most similar articles each gain ``weight_cb``;
    an article viewed several times is processed once per occurrence.
    Candidates are ranked by total score and already-viewed articles are
    removed.

    Args:
        user_id: User to recommend for.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Score given to a collaborative-filtering candidate.
        weight_cb: Score added each time an article is proposed by the
            content-based recommender.

    Returns:
        A list of at most ``num_recommendations`` article IDs, best first.
    """
    cf_recommendations = collaborative_filtering(user_id, num_recommendations * 2)
    viewed_articles = view_log.loc[view_log['userID'] == user_id, 'articleID'].tolist()

    combined_scores = {article: weight_cf for article in cf_recommendations}
    for article_id in viewed_articles:
        for article in get_content_based_recommendations(article_id, num_recommendations=2):
            combined_scores[article] = combined_scores.get(article, 0) + weight_cb

    # A set gives constant-time membership tests when filtering candidates.
    already_seen = set(viewed_articles)
    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    unseen = [article for article, _ in ranked if article not in already_seen]
    return unseen[:num_recommendations]

# Produce one (userID, articleID) row per recommendation for every user that
# appears in the view log.
results = []
for user_id in view_log['userID'].unique():
    recommendations = hybrid_recommendation_system(user_id)
    results.extend([[user_id, article_id] for article_id in recommendations])

# Save the recommendations as CSV without the row index.
results_df = pd.DataFrame(results, columns=['userID', 'articleID'])
results_df.to_csv('./hybrid_recommendations_optimized.csv', index=False)

# Show the first rows as a quick sanity check.
print(results_df.head())


      userID     articleID
0  USER_0000  ARTICLE_1305
1  USER_0000  ARTICLE_0084
2  USER_0000  ARTICLE_2081
3  USER_0000  ARTICLE_0830
4  USER_0000  ARTICLE_0287


In [None]:
import re

import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the article metadata and the user view log from the working directory.
article_info, view_log = (
    pd.read_csv(csv_path)
    for csv_path in ('./article_info.csv', './view_log.csv')
)

# Text preprocessing
def preprocess_text(text):
    """Normalize raw article text before TF-IDF vectorization.

    The text is lower-cased, runs of digits and all punctuation are removed,
    and whitespace is collapsed last so that the gaps left behind by removed
    characters do not survive as double spaces.

    Args:
        text: Raw text. Non-string values (for example the float NaN that
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        The cleaned string with single spaces between words and no leading
        or trailing whitespace.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace only after the removals above, otherwise "a - b"
    # would end up as "a  b".
    return re.sub(r'\s+', ' ', text).strip()

# Clean every article body; a missing body is cleaned as empty text instead
# of raising inside preprocess_text.
article_info['Content'] = article_info['Content'].fillna('').apply(preprocess_text)

# TF-IDF vectorization over unigrams and bigrams; terms found in more than
# 85% of the articles or in fewer than 2 articles are dropped.
tfidf = TfidfVectorizer(stop_words='english', max_df=0.85, min_df=2, ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(article_info['Content'])

# Pairwise article similarity. linear_kernel is a plain dot product, which
# equals cosine similarity here because TfidfVectorizer L2-normalizes rows
# by default.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Build the user x article interaction matrix: each cell is the number of
# view_log rows for that (userID, articleID) pair, 0 when there are none.
interaction_matrix = view_log.pivot_table(index='userID', columns='articleID', aggfunc='size', fill_value=0)
interaction_matrix = interaction_matrix.astype(np.float64)

# Truncated SVD. With the default solver svds requires k < min(shape), so the
# requested 50 factors are clamped for small data sets instead of raising.
# The matrix is handed over as a plain ndarray rather than a DataFrame.
n_factors = min(50, min(interaction_matrix.shape) - 1)
U, sigma, Vt = svds(interaction_matrix.to_numpy(), k=n_factors)
sigma = np.diag(sigma)

# Low-rank reconstruction used as the predicted score of every article for
# every user. Rows follow the row order of interaction_matrix; columns are
# labelled with the article IDs.
predicted_ratings = np.dot(np.dot(U, sigma), Vt)
predicted_ratings = pd.DataFrame(predicted_ratings, columns=interaction_matrix.columns)

# Collaborative-filtering recommendations
def collaborative_filtering(user_id, num_recommendations=5):
    """Return the article IDs with the highest predicted score for a user.

    Args:
        user_id: A label present in the index of ``interaction_matrix``.
        num_recommendations: Maximum number of article IDs to return.

    Returns:
        A pandas Index of article IDs ordered by descending predicted score.
        Articles the user has already viewed are not filtered out here.

    Raises:
        KeyError: If ``user_id`` is not in ``interaction_matrix``.
    """
    # predicted_ratings has no user labels, so go through the row position.
    row_position = interaction_matrix.index.get_loc(user_id)
    user_scores = predicted_ratings.iloc[row_position]
    ranked_scores = user_scores.sort_values(ascending=False)
    return ranked_scores.index[:num_recommendations]

# Content-based recommendations: map each article ID to its row label in
# article_info (the default RangeIndex produced by read_csv).
# Series.drop_duplicates() compares the *values* (row labels, always unique),
# so it never removed repeated article IDs; de-duplicate on the index instead
# and keep the first occurrence so that a lookup always yields a scalar.
indices = pd.Series(article_info.index, index=article_info['articleID'])
indices = indices[~indices.index.duplicated(keep='first')]
def get_content_based_recommendations(article_id, num_recommendations=5):
    """Return the IDs of the articles most similar in content to one article.

    Args:
        article_id: An article ID present in ``indices``.
        num_recommendations: Maximum number of similar articles to return.

    Returns:
        A pandas Series of article IDs ordered by descending similarity.
        The queried article itself is never included.

    Raises:
        KeyError: If ``article_id`` is not in ``indices``.
    """
    idx = indices[article_id]
    scores = np.asarray(cosine_sim[idx])
    # Stable sort on the negated scores: descending similarity, with ties
    # kept in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query article by position rather than assuming it sorts first;
    # duplicated or all-zero TF-IDF rows tie with it and could otherwise push
    # it into the results while a real neighbour is skipped.
    ranked = ranked[ranked != idx][:num_recommendations]
    return article_info['articleID'].iloc[ranked]

# Hybrid recommendation system
def hybrid_recommendation_system(user_id, num_recommendations=5, weight_cf=0.6, weight_cb=0.4):
    """Blend collaborative-filtering and content-based candidates for a user.

    Each of the top ``2 * num_recommendations`` collaborative-filtering
    candidates starts with a score of ``weight_cf``. For every entry in the
    user's view log, the two most similar articles each gain ``weight_cb``;
    an article viewed several times is processed once per occurrence.
    Candidates are ranked by total score and already-viewed articles are
    removed.

    Args:
        user_id: User to recommend for.
        num_recommendations: Maximum number of article IDs to return.
        weight_cf: Score given to a collaborative-filtering candidate.
        weight_cb: Score added each time an article is proposed by the
            content-based recommender.

    Returns:
        A list of at most ``num_recommendations`` article IDs, best first.
    """
    cf_recommendations = collaborative_filtering(user_id, num_recommendations * 2)
    viewed_articles = view_log.loc[view_log['userID'] == user_id, 'articleID'].tolist()

    combined_scores = {article: weight_cf for article in cf_recommendations}
    for article_id in viewed_articles:
        for article in get_content_based_recommendations(article_id, num_recommendations=2):
            combined_scores[article] = combined_scores.get(article, 0) + weight_cb

    # A set gives constant-time membership tests when filtering candidates.
    already_seen = set(viewed_articles)
    ranked = sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)
    unseen = [article for article, _ in ranked if article not in already_seen]
    return unseen[:num_recommendations]

# Produce one (userID, articleID) row per recommendation for every user that
# appears in the view log.
results = []
for user_id in view_log['userID'].unique():
    recommendations = hybrid_recommendation_system(user_id)
    results.extend([[user_id, article_id] for article_id in recommendations])

# Save the recommendations as CSV without the row index.
results_df = pd.DataFrame(results, columns=['userID', 'articleID'])
results_df.to_csv('./hybrid_recommendations_optimized.csv', index=False)

# Show the first rows as a quick sanity check.
print(results_df.head())


      userID     articleID
0  USER_0000  ARTICLE_1305
1  USER_0000  ARTICLE_0084
2  USER_0000  ARTICLE_2081
3  USER_0000  ARTICLE_0830
4  USER_0000  ARTICLE_0287
