# 아이템 기반
0.3143420016   
0.2953427896

In [None]:
# Load the three input tables from /content: article metadata, the view log,
# and the sample submission file.
article_info = pd.read_csv('/content/article_info.csv')
view_log = pd.read_csv('/content/view_log.csv')
sample_submission = pd.read_csv('/content/sample_submission.csv')

In [None]:
# Rename 'userID' to 'reporterID' in article_info and to 'readerID' in view_log
# so the two tables no longer share one ambiguous column name.
article_info.rename(columns={'userID':'reporterID'}, inplace=True)
view_log.rename(columns={'userID':'readerID'}, inplace = True)

In [None]:
# Build the article x reader view-count matrix: one row per articleID, one
# column per readerID, each cell the number of view-log rows for that pair
# (0 when the reader never viewed the article).
user_article_matrix = view_log.groupby(['articleID', 'readerID']).size().unstack(fill_value=0)

In [None]:
# Cosine similarity between the rows (articles) of user_article_matrix.
from sklearn.metrics.pairwise import cosine_similarity
item_based_collabor = cosine_similarity(user_article_matrix)
# Bare expression: displays the similarity matrix as the notebook cell output.
item_based_collabor

array([[1.        , 0.04811252, 0.        , ..., 0.02868877, 0.06154575,
        0.        ],
       [0.04811252, 1.        , 0.        , ..., 0.        , 0.05330018,
        0.15075567],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.02868877, 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.06154575, 0.05330018, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.15075567, 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [None]:
# Map each article ID to its row position in user_article_matrix (and thus in
# the similarity matrix computed from it).
article_id_to_idx = {article: idx for idx, article in enumerate(user_article_matrix.index)}

def recommend_articles(user_id, view_log, article_similarity, article_id_to_idx, top_n=5):
    """Recommend the ``top_n`` articles most similar to what ``user_id`` viewed.

    Every view-log row of the user contributes the similarity row of the
    viewed article; the rows are summed and the highest-scoring articles are
    returned. An article viewed several times contributes several times, and
    articles the user already viewed are NOT removed from the result.

    Args:
        user_id: Reader identifier to look up in the view log.
        view_log: DataFrame with an ``articleID`` column and a reader column
            named ``readerID`` (``userID`` is accepted as a fallback).
        article_similarity: Square NumPy array of article-to-article
            similarities, indexed by the positions in ``article_id_to_idx``.
        article_id_to_idx: Mapping from article ID to its (unique) row/column
            position in ``article_similarity``.
        top_n: Maximum number of article IDs to return.

    Returns:
        A list of up to ``top_n`` article IDs, best first. Empty when none of
        the user's viewed articles is present in ``article_id_to_idx``.
    """
    reader_col = 'readerID' if 'readerID' in view_log.columns else 'userID'
    viewed = view_log.loc[view_log[reader_col] == user_id, 'articleID']

    # Keep only articles that have a row in the similarity matrix.
    n_articles = article_similarity.shape[0]
    viewed_idx = [
        article_id_to_idx[article]
        for article in viewed
        if article in article_id_to_idx and article_id_to_idx[article] < n_articles
    ]
    if not viewed_idx:
        return []

    # One vectorised sum instead of re-aligning a pandas Series per viewed article.
    sim_scores = pd.Series(article_similarity[viewed_idx].sum(axis=0))
    top_positions = sim_scores.sort_values(ascending=False).head(top_n).index

    # Translate positions back through the mapping that was passed in, so the
    # function does not depend on a module-level matrix being in sync with it.
    idx_to_article = {idx: article for article, idx in article_id_to_idx.items()}
    return [idx_to_article[pos] for pos in top_positions]

# Build one submission row per (user, recommended article) pair.
recommendations = [
    {'userID': user_id, 'articleID': article_id}
    for user_id in sample_submission['userID'].unique()
    for article_id in recommend_articles(
        user_id, view_log, item_based_collabor, article_id_to_idx, top_n=5
    )
]

# Save the recommendations.
submission_df = pd.DataFrame(recommendations)
submission_df.to_csv('recommendations.csv', index=False)

# 잠재요인-튜닝
0.0448778566  
0.0448345154

In [None]:
# Rename 'userID' to 'reporterID' in article_info and to 'readerID' in view_log.
# With pandas' default errors='ignore' this does nothing if the column was
# already renamed earlier in the session.
article_info.rename(columns={'userID':'reporterID'}, inplace=True)
view_log.rename(columns={'userID':'readerID'}, inplace = True)

In [None]:
from sklearn.metrics import mean_squared_error
# Build the reader x article view-count matrix: one row per readerID, one
# column per articleID (note: transposed relative to an article x reader layout).
user_article_matrix = view_log.groupby(['readerID', 'articleID']).size().unstack(fill_value=0)

# Split into training and validation data.
# NOTE(review): assuming this is scikit-learn's train_test_split (its import is
# not visible here), the split is over ROWS, i.e. whole readers: val_data holds
# readers that are absent from train_data rather than held-out views of the
# same readers. Confirm this is the intended validation scheme.
train_data, val_data = train_test_split(user_article_matrix, test_size=0.2, random_state=42)

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between ``R`` and ``P @ Q.T`` over the listed cells.

    Args:
        R: 2-D NumPy array of observed values.
        P: Row-factor matrix with one row per row of ``R``.
        Q: Column-factor matrix with one row per column of ``R``.
        non_zeros: Sequence of tuples whose first two items are the row and
            column index of a cell to evaluate (further items are ignored).

    Returns:
        The root-mean-squared error over the listed cells.

    Raises:
        ValueError: If ``non_zeros`` is empty.
    """
    if len(non_zeros) == 0:
        raise ValueError("non_zeros must contain at least one (row, col) entry")

    rows = [entry[0] for entry in non_zeros]
    cols = [entry[1] for entry in non_zeros]
    observed = R[rows, cols]

    # Only the listed cells are needed, so take their row-wise dot products
    # instead of materialising the full dense P @ Q.T matrix.
    predicted = np.einsum('ij,ij->i', P[rows], Q[cols])

    return np.sqrt(np.mean((observed - predicted) ** 2))

def matrix_factorization(R, K, steps=200, learning_rate=0.01, r_lambda=0.01):
    """Factorise ``R`` into ``P`` and ``Q`` with SGD over its positive cells.

    Only cells with ``R[i, j] > 0`` are used for training. The factors are
    initialised from a normal distribution with standard deviation ``1/K``
    after seeding NumPy's global RNG with 1, so repeated calls with the same
    arguments return identical results.

    Args:
        R: 2-D array-like of shape (num_users, num_items).
        K: Number of latent factors.
        steps: Number of passes over the positive cells.
        learning_rate: SGD step size.
        r_lambda: L2 regularisation strength.

    Returns:
        Tuple ``(P, Q)`` with shapes (num_users, K) and (num_items, K);
        ``np.dot(P, Q.T)`` is the reconstructed matrix.
    """
    R = np.asarray(R)
    num_users, num_items = R.shape
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # np.nonzero enumerates cells in row-major order, i.e. the same (i, j)
    # order a nested Python loop would produce, without visiting every cell
    # of the matrix from Python.
    rows, cols = np.nonzero(R > 0)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R[rows, cols].tolist()))

    for _ in range(steps):
        for i, j, r in non_zeros:
            eij = r - np.dot(P[i, :], Q[j, :])
            P[i, :] += learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            # Sequential update: this uses the P[i] that was just modified.
            Q[j, :] += learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

    return P, Q

def calculate_recall_at_k(val_data, pred_matrix, k=5):
    """Return the mean recall@k of ``pred_matrix`` against ``val_data``.

    Row ``u`` of ``pred_matrix`` is scored against row ``u`` (by position) of
    ``val_data``; the non-zero columns of that row are the relevant items.
    Rows without any relevant item count as recall 0 in the average.

    Args:
        val_data: DataFrame whose columns are item labels.
        pred_matrix: 2-D NumPy array of predicted scores; columns follow the
            column order of ``val_data``.
        k: Number of top-scored items taken as the prediction.

    Returns:
        The mean of the per-row recall values.
    """
    labels = val_data.columns
    scores = []

    for row_pos in range(val_data.shape[0]):
        relevant = labels[val_data.iloc[row_pos, :].to_numpy().nonzero()]
        ranking = np.argsort(pred_matrix[row_pos, :])[::-1]
        suggested = labels[ranking[:k]]

        if len(relevant) > 0:
            hits = set(relevant) & set(suggested)
            scores.append(len(hits) / len(relevant))
        else:
            scores.append(0)

    return np.mean(scores)

# Parameter ranges for the hyper-parameter grid search
K_values = [10, 30]
steps_values = [100, 200, 300]
learning_rate_values = [0.01, 0.001]
r_lambda_values = [0.01]

best_params = {}
best_recall = -1

# Exhaustive grid search: fit on train_data, score recall@5 against val_data,
# and remember the best-scoring combination.
# NOTE(review): pred_matrix has one row per row of train_data, while
# calculate_recall_at_k indexes it by the row POSITION of val_data. If
# train_data and val_data hold different readers (row-wise split), row u of
# the prediction and row u of val_data belong to different readers, so the
# recall values may not measure what is intended - confirm.
for K in K_values:
    for steps in steps_values:
        for learning_rate in learning_rate_values:
            for r_lambda in r_lambda_values:
                P, Q = matrix_factorization(train_data.values, K, steps, learning_rate, r_lambda)
                pred_matrix = np.dot(P, Q.T)
                recall = calculate_recall_at_k(val_data, pred_matrix, k=5)
                print(f'K: {K}, steps: {steps}, learning_rate: {learning_rate}, r_lambda: {r_lambda}, recall: {recall}')

                if recall > best_recall:
                    best_recall = recall
                    best_params = {'K': K, 'steps': steps, 'learning_rate': learning_rate, 'r_lambda': r_lambda}

print("Best parameters:", best_params)
print("Best recall:", best_recall)

# Refit on the full reader x article matrix with the best hyper-parameters and
# wrap the reconstruction in a DataFrame labelled like user_article_matrix.
P, Q = matrix_factorization(user_article_matrix.values, best_params['K'], best_params['steps'], best_params['learning_rate'], best_params['r_lambda'])
pred_matrix = np.dot(P, Q.T)
pred_matrix = pd.DataFrame(data=pred_matrix, index=user_article_matrix.index, columns=user_article_matrix.columns)

# Pick the articles to recommend to a single reader
def recommend_articles(reader_id, matrix, top_n=5):
    """Return the ``top_n`` highest-scored column labels for ``reader_id``.

    Args:
        reader_id: Row label to look up in ``matrix``.
        matrix: DataFrame of predicted scores (rows: readers, columns: articles).
        top_n: Number of labels to return.

    Returns:
        A list of column labels ordered by descending score.

    Raises:
        KeyError: If ``reader_id`` is not a row label of ``matrix``.
    """
    ranked = matrix.loc[reader_id].sort_values(ascending=False)
    return ranked.index[:top_n].tolist()

# Build one submission row per (reader, recommended article) pair.
recommendations = [
    {'userID': reader_id, 'articleID': article_id}
    for reader_id in sample_submission['userID'].unique()
    for article_id in recommend_articles(reader_id, pred_matrix, top_n=5)
]

# Save the recommendations.
submission_df = pd.DataFrame(recommendations)
submission_df.to_csv('잠재요인_행렬분해_튜닝.csv', index=False)

K: 10, steps: 100, learning_rate: 0.01, r_lambda: 0.01, recall: 0.0021367762899136226
K: 10, steps: 100, learning_rate: 0.001, r_lambda: 0.01, recall: 0.007332597247022509
K: 10, steps: 200, learning_rate: 0.01, r_lambda: 0.01, recall: 0.0023984729343515723
K: 10, steps: 200, learning_rate: 0.001, r_lambda: 0.01, recall: 0.0046428485731918505
K: 10, steps: 300, learning_rate: 0.01, r_lambda: 0.01, recall: 0.0023895112532487865
K: 10, steps: 300, learning_rate: 0.001, r_lambda: 0.01, recall: 0.004208366675003056
K: 30, steps: 100, learning_rate: 0.01, r_lambda: 0.01, recall: 0.0031694397150048097
K: 30, steps: 100, learning_rate: 0.001, r_lambda: 0.01, recall: 0.01741466532838933
K: 30, steps: 200, learning_rate: 0.01, r_lambda: 0.01, recall: 0.003136554862598045
K: 30, steps: 200, learning_rate: 0.001, r_lambda: 0.01, recall: 0.006171342465672139
K: 30, steps: 300, learning_rate: 0.01, r_lambda: 0.01, recall: 0.0031525438621663416
K: 30, steps: 300, learning_rate: 0.001, r_lambda: 0.01

# 컨텐츠 기반  
- 번역 후 클렌징한 뒤 tfidf   
	0.3273443656  
  0.3171040189

In [None]:
# Install the Google Translate client (only needs to run once per environment)
!pip install googletrans==4.0.0-rc1

Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting hstspreload (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading hstspreload-2024.7.1-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting chardet==3.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting idna==2.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Downloading idna-2.10-py2.py3-none-any.whl (58 kB)
[2K     [90m

In [None]:
import pandas as pd
from googletrans import Translator

# Single shared googletrans client, passed to translate_text below.
translator = Translator()

def translate_text(text, translator):
    """Translate ``text`` to English, keeping the original text on failure.

    Args:
        text: The string to translate. Non-string values (e.g. missing
            values) and blank strings are returned unchanged without
            contacting the translator.
        translator: Object exposing ``translate(text, dest=..., src=...)``
            whose result has a ``.text`` attribute.

    Returns:
        The translated string, or ``text`` itself when it was skipped or the
        translation raised.
    """
    import warnings

    if not isinstance(text, str) or not text.strip():
        return text

    try:
        return translator.translate(text, dest='en', src='auto').text
    except Exception as e:
        # Best-effort by design: keep the original text, but make the failure
        # visible instead of swallowing it silently.
        warnings.warn(
            f"translation failed ({type(e).__name__}); keeping the original text",
            RuntimeWarning,
            stacklevel=2,
        )
        return text

# Language codes whose titles should be translated
target_languages = ['pt', 'la', 'es', 'ja']

# Translate the Title only for rows whose 'Language' is one of target_languages;
# all other rows are left untouched.
mask = article_info['Language'].isin(target_languages)
article_info.loc[mask, 'Title'] = article_info.loc[mask, 'Title'].apply(translate_text, translator=translator)

In [None]:
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
# Download the NLTK 'punkt' and 'stopwords' resources (clean_text below reads
# the English stop-word list).
nltk.download('punkt')
nltk.download('stopwords')

import re
import string
from nltk import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
## Clean up the Title text
# Lower-case English contraction -> expansion lookup; replace_contractions
# substitutes every key found in a text with its value.
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because",
                        "could've": "could have", "couldn't": "could not", "didn't": "did not",
                        "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not",
                        "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is",
                        "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have"}
def _get_contractions(contraction_dict):
        contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
        return contraction_dict, contraction_re

def replace_contractions(text):
    """Return ``text`` with every key of ``contraction_dict`` replaced by its value."""
    lookup, pattern = _get_contractions(contraction_dict)
    return pattern.sub(lambda found: lookup[found.group(0)], text)

# Loaded lazily on first use so the NLTK corpus is read once, not once per title.
_STOP_WORD_CACHE = {}

# Deletion table for str.translate: removes every ASCII punctuation character
# (!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~) in a single pass.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_text(txt):
    """Normalise a title for TF-IDF vectorisation.

    Steps, in order: lower-case, expand contractions, replace HTML tags with
    a space, delete ASCII punctuation, delete digits, split on whitespace,
    drop English stop words, and re-join the remaining words with single
    spaces.

    Args:
        txt: The raw title. Non-string values (e.g. missing values) are
            treated as an empty title.

    Returns:
        The cleaned, space-separated text (possibly an empty string).
    """
    if not isinstance(txt, str):
        return ''

    txt = txt.lower()
    txt = replace_contractions(txt)
    txt = re.sub(r'<.*?>', ' ', txt)
    txt = txt.translate(_PUNCTUATION_TABLE)
    txt = re.sub('[0-9]+', '', txt)

    # split() already collapses any run of whitespace, so no separate
    # "squeeze repeated spaces" step is needed before tokenising.
    stop_words = _english_stop_words()
    return ' '.join(word for word in txt.split() if word not in stop_words)

# Clean every title in place.
article_info['Title'] = article_info['Title'].apply(clean_text)

In [None]:
# TF-IDF vectorisation of the cleaned titles (unigrams and bigrams)
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=1 keeps every term, exactly like the former integer min_df=0 did, but
# is also accepted by newer scikit-learn releases, whose parameter validation
# rejects an integer min_df below 1.
tfidf_vect = TfidfVectorizer(min_df=1, ngram_range=(1,2))
content_mat = tfidf_vect.fit_transform(article_info['Title'])

# Cosine similarity between every pair of titles
from sklearn.metrics.pairwise import cosine_similarity
content_similarity = cosine_similarity(content_mat, content_mat)

# Map each article ID to its row position in article_info (and thus in
# content_similarity)
article_id_to_idx = {article: idx for idx, article in enumerate(article_info['articleID'])}

def recommend_articles(user_id, view_log, article_similarity, article_id_to_idx, top_n=5):
    """Recommend the ``top_n`` articles most similar to what ``user_id`` viewed.

    Every view-log row of the user contributes the similarity row of the
    viewed article; the rows are summed and the highest-scoring articles are
    returned. An article viewed several times contributes several times, and
    articles the user already viewed are NOT removed from the result.

    Args:
        user_id: Reader identifier to look up in the view log.
        view_log: DataFrame with an ``articleID`` column and a reader column
            named ``userID`` (``readerID`` is accepted as a fallback, for a
            view log whose column has been renamed).
        article_similarity: Square NumPy array of article-to-article
            similarities, indexed by the positions in ``article_id_to_idx``.
        article_id_to_idx: Mapping from article ID to its (unique) row/column
            position in ``article_similarity``.
        top_n: Maximum number of article IDs to return.

    Returns:
        A list of up to ``top_n`` article IDs, best first. Empty when none of
        the user's viewed articles is present in ``article_id_to_idx``.
    """
    reader_col = 'userID' if 'userID' in view_log.columns else 'readerID'
    viewed = view_log.loc[view_log[reader_col] == user_id, 'articleID']

    viewed_idx = [article_id_to_idx[article] for article in viewed if article in article_id_to_idx]
    if not viewed_idx:
        return []

    # Score with the matrix that was passed in (not a module-level global),
    # summing all viewed rows in one vectorised operation.
    sim_scores = pd.Series(article_similarity[viewed_idx].sum(axis=0))
    top_positions = sim_scores.sort_values(ascending=False).head(top_n).index

    # Invert the mapping once instead of rebuilding list(keys()) per lookup.
    idx_to_article = {idx: article for article, idx in article_id_to_idx.items()}
    return [idx_to_article[pos] for pos in top_positions]

# Build one submission row per (user, recommended article) pair.
recommendations = [
    {'userID': user_id, 'articleID': article_id}
    for user_id in sample_submission['userID'].unique()
    for article_id in recommend_articles(
        user_id, view_log, content_similarity, article_id_to_idx, top_n=5
    )
]

# Save the recommendations.
submission_df = pd.DataFrame(recommendations)
submission_df.to_csv('제목 번역 후 클렌징.csv', index=False)