In [183]:
import pandas as pd
import numpy as np

from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

# Load the raw view log, the article metadata and the sample submission file.
view_log = pd.read_csv('data/view_log.csv')
# Independent copy of the view log.
view_log_train = view_log.copy()
article_info = pd.read_csv('data/article_info.csv')
submission = pd.read_csv('data/sample_submission.csv')

# Per-user set of the distinct userCountry values found in the view log
# (only the country column is used here; region is not included).
user_countries = view_log.groupby('userID')['userCountry'].unique().apply(set)

# Attach each viewed article's Language to the view log. pd.merge defaults to
# an inner join, so view rows whose articleID is missing from article_info
# would be dropped.
view_log_with_lang = pd.merge(view_log, article_info[['articleID', 'Language']], on='articleID')




In [184]:
# Trim each user's views of articles listed under their own userID in
# article_info: within every (userID, articleID) pair only the first 999
# self-view rows (in current row order) are kept, the rest are dropped.
drop_idx = []

for author_id in view_log_with_lang['userID'].unique():
    # Articles whose article_info row carries this userID
    # (presumably the author — confirm against the data description).
    own_articles = article_info.loc[article_info['userID'] == author_id, 'articleID'].values
    # Rows in which this user viewed one of those articles.
    author_views = view_log_with_lang.loc[view_log_with_lang['userID'] == author_id]
    self_views = author_views.loc[author_views['articleID'].isin(own_articles)]
    # Index labels of the self-views that survive the per-pair cap.
    kept_labels = self_views.groupby(['userID', 'articleID']).head(999).index.tolist()
    drop_idx.extend(label for label in self_views.index.tolist() if label not in kept_labels)

view_log_with_lang = view_log_with_lang.drop(index=drop_idx)

# Visit counts per (user, article), capped at 12 and then log-scaled with log1p.
user_article_matrix = (
    view_log_with_lang
    .groupby(['userID', 'articleID'])
    .size()
    .reset_index(name='visit_count')
)
user_article_matrix['visit_count'] = user_article_matrix['visit_count'].clip(upper=12)
user_article_matrix['log_weight'] = np.log1p(user_article_matrix['visit_count'])

# Wide user x article matrix of log weights; pairs with no views become 0.
user_article_matrix = (
    user_article_matrix
    .pivot(index='userID', columns='articleID', values='log_weight')
    .fillna(0)
)
user_similarity = cosine_similarity(user_article_matrix)

# Country adjustment: 0.79 x the mean of the strictly positive similarities.
# Despite the name "additional", the value is SUBTRACTED below.
mean_similarity = user_similarity[user_similarity > 0].mean()
additional_weight = 0.79 * mean_similarity

user_index = user_article_matrix.index
n_users = len(user_index)
for i in range(n_users):
    countries_i = user_countries[user_index[i]]
    # Only the upper triangle is visited; both mirrored cells are updated so
    # the matrix stays symmetric. The diagonal is left untouched.
    for j in range(i + 1, n_users):
        if countries_i & user_countries[user_index[j]]:
            user_similarity[i, j] -= additional_weight
            user_similarity[j, i] -= additional_weight

# One fresh, independent empty list per user seen in the raw view log.
person_reco_dics = {uid: [] for uid in view_log['userID'].unique()}

In [185]:
def base_recoomand(user_article_matrix_train_random, user_similarity_train_random, person_reco_dics, df_return = False):
    """Build top-5 article recommendations for every user from user-user similarity.

    Each user's article scores are the similarity-weighted sum of all users'
    rows of the user x article matrix, divided by the sum of the absolute
    similarities in that user's row. The (up to) five highest-scoring articles
    are recommended, best first.

    Args:
        user_article_matrix_train_random: DataFrame indexed by user with one
            column per article, holding interaction weights.
        user_similarity_train_random: square array of user-user similarities
            whose row/column order matches the matrix index.
        person_reco_dics: accepted for interface compatibility; it is not read
            by the current implementation.
        df_return: when True, return the recommendations.

    Returns:
        A DataFrame with columns ['userID', 'articleID'] when df_return is
        True, otherwise None.
    """
    # Normaliser: per-row sum of absolute similarities, as a column vector.
    row_norms = np.array([np.abs(user_similarity_train_random).sum(axis=1)]).T
    predicted = user_similarity_train_random.dot(user_article_matrix_train_random) / row_norms

    article_ids = user_article_matrix_train_random.columns
    pairs = []
    for row, user in enumerate(user_article_matrix_train_random.index):
        # Article positions ordered from highest to lowest predicted score.
        ranking = predicted[row].argsort()[::-1]
        pairs.extend([user, article] for article in article_ids[ranking][:5].tolist())

    recommendations_df = pd.DataFrame(pairs, columns=['userID', 'articleID'])
    return recommendations_df if df_return else None

In [186]:
# Generate the top-5 recommendations for every user in the matrix;
# df_return=True makes the call return them as a DataFrame.
df = base_recoomand(user_article_matrix, user_similarity, person_reco_dics, df_return=True)

In [187]:
# Load two earlier submission files for comparison. This rebinds `submission`,
# replacing the sample submission that was loaded at the top of the notebook.
submission = pd.read_csv('submission_14.csv')
submission2 = pd.read_csv('submission_18.csv')

In [188]:
# Row-by-row count of differing articleID values for each pair:
# (submission_14 vs df, submission_18 vs df, submission_14 vs submission_18).
(submission['articleID'] != df['articleID']).sum(), (submission2['articleID'] != df['articleID']).sum(), (submission['articleID'] != submission2['articleID']).sum()

(439, 3187, 3092)

In [189]:
# Overwrite only the articleID column with the new recommendations; every other
# column of `submission` (read from submission_14.csv) is kept as it is.
# NOTE(review): this relies on `df` and `submission` listing users in the same
# row order — confirm before submitting.
submission['articleID'] = df['articleID']

# Save without writing the DataFrame index as a column.
submission.to_csv('submission_24.csv', index=False)

In [187]:
## test

import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

# Reload the raw inputs for the hold-out validation experiment below.
view_log_train = pd.read_csv('data/view_log.csv')
article_info = pd.read_csv('data/article_info.csv')
submission = pd.read_csv('data/sample_submission.csv')

# Per-user hold-out sampler: picks between 1 and `cap` rows at random, where
# `cap` is 20% of the user's row count limited to the range [1, 5].
def select_random_articles(group):
    """Return randomly chosen index labels of `group`, drawn without replacement.

    The number of labels is uniform on 1..cap, with
    cap = max(1, min(5, int(len(group) * 0.2))). A group with a single row
    therefore always yields that row.

    Uses the global NumPy random state (np.random), so results depend on the
    seed set by the caller.
    """
    cap = int(len(group) * 0.2)  # at most 20% of the group's rows ...
    cap = min(5, cap)            # ... but never more than 5 ...
    cap = max(1, cap)            # ... and never fewer than 1
    n_pick = np.random.randint(1, cap + 1)  # upper bound is exclusive -> 1..cap
    return np.random.choice(group.index, size=n_pick, replace=False)

# Fixed seed so the hold-out split is reproducible
# (a commented-out alternative in the original used 1111).
np.random.seed(42)

# Candidate rows: one per distinct (userID, articleID, userRegion, userCountry).
# select_random_articles then samples hold-out row labels for each user, and
# explode() flattens the per-user arrays into one Series of labels.
test_indices = (
    view_log_train
    .drop_duplicates(subset=['userID', 'articleID', 'userRegion', 'userCountry'])
    .groupby('userID')
    .apply(select_random_articles)
    .explode()
    .astype(int)
)
test_data_random = view_log_train.loc[test_indices]
# Only the sampled rows are removed; any other rows for the same user and
# article remain in the training split.
train_data_random = view_log_train.drop(index=test_indices)

# Trim each user's views of articles listed under their own userID in
# article_info: within every (userID, articleID) pair only the first 20
# self-view rows (in current row order) are kept, the rest are dropped.
drop_idx = []
for author_id in train_data_random['userID'].unique():
    # Articles whose article_info row carries this userID
    # (presumably the author — confirm against the data description).
    own_articles = article_info.loc[article_info['userID'] == author_id, 'articleID'].values
    # Training rows in which this user viewed one of those articles.
    author_views = train_data_random.loc[train_data_random['userID'] == author_id]
    self_views = author_views.loc[author_views['articleID'].isin(own_articles)]
    # Index labels of the self-views that survive the per-pair cap.
    kept_labels = self_views.groupby(['userID', 'articleID']).head(20).index.tolist()
    drop_idx.extend(label for label in self_views.index.tolist() if label not in kept_labels)
train_data_random = train_data_random.drop(index=drop_idx)


# Per-user set of distinct countries, computed from the full view log
# (i.e. including the rows held out for testing).
user_countries = view_log_train.groupby('userID')['userCountry'].unique().apply(set)

# Ground truth for evaluation: unique held-out (user, article) pairs.
test_data_for_comparison = test_data_random[['userID', 'articleID']].drop_duplicates()

# Visit counts per (user, article), capped at 10 and then log-scaled with log1p.
user_article_matrix_train_random = (
    train_data_random
    .groupby(['userID', 'articleID'])
    .size()
    .reset_index(name='visit_count')
)
user_article_matrix_train_random['visit_count'] = user_article_matrix_train_random['visit_count'].clip(upper=10)
user_article_matrix_train_random['log_weight'] = np.log1p(user_article_matrix_train_random['visit_count'])
# Wide user x article matrix of log weights; pairs with no views become 0.
user_article_matrix_train_random = (
    user_article_matrix_train_random
    .pivot(index='userID', columns='articleID', values='log_weight')
    .fillna(0)
)

# User-user cosine similarity on the training matrix.
user_similarity_train_random = cosine_similarity(user_article_matrix_train_random)

mean_similarity = user_similarity_train_random[user_similarity_train_random > 0].mean()

# Country adjustment: 0.8 x the mean of the strictly positive similarities.
# Despite the name "additional", the value is SUBTRACTED below.
additional_weight = 0.8 * mean_similarity
print('Additional_CONTRY_WEIGHT : ', additional_weight)

user_index = user_article_matrix_train_random.index
n_users = len(user_index)
for i in range(n_users):
    countries_i = user_countries[user_index[i]]
    # Visit only the upper triangle (j > i) and mirror each update so the
    # matrix stays symmetric; the diagonal is left untouched.
    for j in range(i + 1, n_users):
        # Users sharing at least one country have their similarity lowered.
        if countries_i & user_countries[user_index[j]]:
            user_similarity_train_random[i, j] -= additional_weight
            user_similarity_train_random[j, i] -= additional_weight




Additional_CONTRY_WEIGHT :  0.06945662379096092


In [188]:
# Run the validation variant of base_recoomand (the definition taking two
# positional arguments, which returns Recall@5 when df_return is False). The
# definition that also requires person_reco_dics would raise TypeError here,
# because that argument is not passed.
base_recoomand(user_article_matrix_train_random, user_similarity_train_random, df_return = False)

0.10837755875663381


|no|score|score2|score3|score4|
|---|------|------|-----|----|
|1|0.10842810209754863 | 0.11132170836492293|0.10674753601213041|0.10607783674500884|
|2|0.1083522870861764 | 0.11237048268890576 |0.11148597422289615|0.10985595147839272
|3|0.10803639120545869 | 0.11511245893353549|0.10816274955774578|0.10881981298963861
|4|0.1090725296942128 | 0.11536517563810969|0.10839019459186253|0.10812484205205965
|5|0.10903462218852666 | 0.11328026282537278|0.10820065706343189|0.10812484205205965
|6|0.10903462218852666 | 0.11328026282537278|0.10839019459186253|0.10812484205205965
|7|0.10903462218852666 | 0.11328026282537278|0.10839019459186253|0.10837755875663381
|8|0.10827647207480415 | 0.11328026282537278|0.10839019459186253|0.10818802122820317
|9|0.1086555471316654 |  0.11328026282537278|0.10839019459186253|0.10818802122820317
|10|0.10941369724538792 | 0.1140384129390953|0.10839019459186253|0.10837755875663381
|11|0.10941369724538792 |
|12|0.10941369724538792 |
|13|0.10941369724538792 |

---
0.10941369724538792

In [141]:
# Recall@5 metric
def recall5(answer_df, submission_df):
    """Mean per-user Recall@5 of `submission_df` measured against `answer_df`.

    The first column of `answer_df` identifies the entity being recommended to
    and the second column the recommended item; `submission_df` is read through
    those same two column names.

    For every entity in `answer_df` that also has predictions, recall is
    |truth ∩ predictions| / min(|truth rows|, 5). Entities without predictions
    are skipped. Returns 0 when no entity can be scored.

    Raises:
        ValueError: if any entity in the submission does not have exactly five
            predictions, if a predicted item is null, or if an entity has
            duplicate predicted items.
    """
    primary_col, secondary_col = answer_df.columns[0], answer_df.columns[1]

    # ---- validate the submission before scoring ----
    per_entity_counts = submission_df.groupby(primary_col).size()
    if not (per_entity_counts == 5).all():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")
    has_duplicates = submission_df.groupby(primary_col).apply(
        lambda rows: rows[secondary_col].duplicated().any()
    )
    if has_duplicates.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # ---- keep only entities that appear in the answers, then score ----
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]
    preds_by_entity = submission_df.groupby(primary_col).apply(
        lambda rows: rows[secondary_col].head(5).tolist()
    ).to_dict()
    truth_by_entity = answer_df.groupby(primary_col).apply(
        lambda rows: rows[secondary_col].tolist()
    ).to_dict()

    scores = []
    for entity, truth in truth_by_entity.items():
        if entity not in preds_by_entity:
            continue
        hits = set(truth) & set(preds_by_entity[entity])
        scores.append(len(hits) / min(len(truth), 5))

    if not scores:
        return 0
    return np.mean(scores)



In [48]:
# Recompute user-user cosine similarity from the training matrix; this rebinds
# user_similarity_train_random, discarding any previously applied adjustment.
user_similarity_train_random = cosine_similarity(user_article_matrix_train_random)

mean_similarity = user_similarity_train_random[user_similarity_train_random > 0].mean()

# Country adjustment: here the full mean of the strictly positive similarities
# (factor 1). Despite the name "additional", the value is SUBTRACTED below.
additional_weight = mean_similarity * 1
print('Additional_CONTRY_WEIGHT : ', additional_weight)

user_index = user_article_matrix_train_random.index
n_users = len(user_index)
for i in range(n_users):
    countries_i = user_countries[user_index[i]]
    # Visit only the upper triangle (j > i) and mirror each update so the
    # matrix stays symmetric; the diagonal is left untouched.
    for j in range(i + 1, n_users):
        # Users sharing at least one country have their similarity lowered.
        if countries_i & user_countries[user_index[j]]:
            user_similarity_train_random[i, j] -= additional_weight
            user_similarity_train_random[j, i] -= additional_weight



Additional_CONTRY_WEIGHT :  0.06977574562848603


In [143]:
def base_recoomand(user_article_matrix_train_random, user_similarity_train_random, df_return = False):
    """Recommend top-5 articles per user and score them with Recall@5.

    Each user's article scores are the similarity-weighted sum of all users'
    rows of the user x article matrix, divided by the sum of the absolute
    similarities in that user's row. The (up to) five highest-scoring articles
    are recommended, best first, and evaluated with recall5 against the
    module-level `test_data_for_comparison`.

    Args:
        user_article_matrix_train_random: DataFrame indexed by user with one
            column per article, holding interaction weights.
        user_similarity_train_random: square array of user-user similarities
            whose row/column order matches the matrix index.
        df_return: when True, print the Recall@5 value and return the
            recommendations instead of the score.

    Returns:
        The recommendations DataFrame (columns ['userID', 'articleID']) when
        df_return is True, otherwise the Recall@5 value.
    """
    # Normaliser: per-row sum of absolute similarities, as a column vector.
    row_norms = np.array([np.abs(user_similarity_train_random).sum(axis=1)]).T
    predicted = user_similarity_train_random.dot(user_article_matrix_train_random) / row_norms

    article_ids = user_article_matrix_train_random.columns
    pairs = []
    for row, user in enumerate(user_article_matrix_train_random.index):
        # Article positions ordered from highest to lowest predicted score.
        ranking = predicted[row].argsort()[::-1]
        pairs.extend([user, article] for article in article_ids[ranking][:5].tolist())

    recommendations_df = pd.DataFrame(pairs, columns=['userID', 'articleID'])

    # Recall@5 of these recommendations against the held-out pairs.
    recall_at_5 = recall5(test_data_for_comparison, recommendations_df)
    if df_return:
        print(recall_at_5)
        return recommendations_df
    return recall_at_5

In [50]:
# Evaluate the current similarity matrix; with df_return=False the validation
# variant of base_recoomand returns the Recall@5 value, shown as cell output.
base_recoomand(user_article_matrix_train_random, user_similarity_train_random, df_return = False)



0.1889941875157948

In [None]:
0.2002906242102603
0.20434672731867573
0.20648218347232755
0.20703816022239072
0.20597675006317917
0.2058503917108921
0.2058503917108921
0.20827647207480413
0.20903462218852664
0.20903462218852664
