# Baseline code

In [None]:
# !pip install ace

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

view_log_train = pd.read_csv('data/view_log.csv')
article_info = pd.read_csv('data/article_info.csv')
submission = pd.read_csv('data/sample_submission.csv')

# For each user, randomly hold out 1-10 articles as test data, never more than
# 20% of that user's visit log (but always at least one).
def select_random_articles(group):
    """Pick a random subset of row labels from one user's view-log group.

    The upper bound on the sample size is 20% of the group length, clamped to
    the range 1..10. The actual size is drawn uniformly from 1..upper bound.

    Args:
        group: the rows of one user (anything with ``len()`` and ``.index``).

    Returns:
        Array of index labels sampled without replacement.
    """
    upper_bound = min(10, int(len(group) * 0.2))
    if upper_bound < 1:
        upper_bound = 1  # tiny logs still contribute one held-out row
    sample_size = np.random.randint(1, upper_bound + 1)
    return np.random.choice(group.index, size=sample_size, replace=False)

# np.random.seed(42)

np.random.seed(42)
# Draw the hold-out rows per user, then split the view log into test / train by row label.
test_indices = view_log_train.groupby('userID').apply(select_random_articles).explode().astype(int)
test_data_random = view_log_train.loc[test_indices]
train_data_random = view_log_train.drop(test_indices)

# Build the set of unique countries seen for each user (computed on the full log,
# i.e. including the held-out rows).
user_countries = view_log_train.groupby('userID')['userCountry'].unique().apply(set)

# Prepare the test data: unique (userID, articleID) pairs
test_data_for_comparison = test_data_random[['userID', 'articleID']].drop_duplicates()

# Merge language information into the training data
train_data_with_lang_random = pd.merge(train_data_random, article_info[['articleID', 'Language']], on='articleID')

# Build the user-article matrix (cell value = number of views)
user_article_matrix_train_random = train_data_random.groupby(['userID', 'articleID']).size().unstack(fill_value=0)

# user_article_matrix_train_random = train_data_random.groupby(['userID', 'articleID']).size().reset_index(name='visit_count')
# user_article_matrix_train_random['log_weight'] = user_article_matrix_train_random['visit_count'].apply(lambda x: np.log1p(x))
# user_article_matrix_train_random = user_article_matrix_train_random.pivot(index='userID', columns='articleID', values='log_weight').fillna(0)

# Compute the similarity between users
user_similarity_train_random = cosine_similarity(user_article_matrix_train_random)


# Mean over the strictly positive similarity entries only
mean_similarity = np.mean(user_similarity_train_random[user_similarity_train_random > 0])

# Country weight = 0.2 x the mean positive similarity.
# NOTE(review): an earlier comment here said "10%", but the factor is 0.2; and despite the
# name "additional", the loop below SUBTRACTS this weight for users sharing a country.
additional_weight = mean_similarity * 0.2
print('Additional_CONTRY_WEIGHT : ', additional_weight)
user_index = user_article_matrix_train_random.index
for i, user_i in enumerate(user_index):
    countries_i = user_countries[user_i]
    for j, user_j in enumerate(user_index[i+1:], start=i+1):  # start j at i+1: each unordered pair is visited once
        countries_j = user_countries[user_j]
        if countries_i & countries_j:  # the two users have at least one country in common
            user_similarity_train_random[i, j] -= (additional_weight)
            user_similarity_train_random[j, i] -= (additional_weight)  # mirror the update to keep the matrix symmetric



In [None]:
# Recall@5 metric
def recall5(answer_df, submission_df):
    """Return the mean per-key Recall@5 of ``submission_df`` against ``answer_df``.

    The first column of ``answer_df`` is the grouping key and the second is
    the item column; ``submission_df`` is read through those same column names.

    Raises:
        ValueError: if some key in the submission does not have exactly five
            rows, if any predicted item is null, or if a key repeats an item.

    Returns:
        The mean, over keys present in both frames, of
        ``len(truth & predicted) / min(len(truth), 5)``; 0 when no key overlaps.
    """
    key_col, item_col = answer_df.columns[0], answer_df.columns[1]

    def _has_repeats(rows):
        return rows[item_col].duplicated().any()

    def _first_five(rows):
        return rows[item_col].head(5).tolist()

    def _all_items(rows):
        return rows[item_col].tolist()

    # The format checks run on the whole submission, before it is restricted
    # to the keys that appear in the answers.
    rows_per_key = submission_df.groupby(key_col).size()
    if not all(rows_per_key == 5):
        raise ValueError(f"Each {key_col} should have exactly 5 {item_col} predictions.")
    if submission_df[item_col].isnull().any():
        raise ValueError(f"Predicted {item_col} contains NULL values.")
    if submission_df.groupby(key_col).apply(_has_repeats).any():
        raise ValueError(f"Predicted {item_col} contains duplicates for some {key_col}.")

    scored_rows = submission_df[submission_df[key_col].isin(answer_df[key_col])]
    predicted = scored_rows.groupby(key_col).apply(_first_five).to_dict()
    truth = answer_df.groupby(key_col).apply(_all_items).to_dict()

    per_key_recall = []
    for key, relevant in truth.items():
        if key not in predicted:
            continue
        hits = set(relevant) & set(predicted[key])
        per_key_recall.append(len(hits) / min(len(relevant), 5))

    if not per_key_recall:
        return 0
    return np.mean(per_key_recall)



In [None]:
import json
# Load precomputed recommendations from JSON. Presumably a mapping of user id -> list of
# embedding-based article ids (judging by the names) - TODO confirm against the file.
with open('embedding_recoomand.json', 'r') as f:
    embedding_reco_top2 = json.load(f)

# 언어 분리하여 추천 목록 만들기

### 다른 언어를 더 많이 봤는지 체크

- 소수 언어 기사에 대해서 로그의 10% 비중을 넘는다면 해당 언어 기사는 추천해줄 만하다?

In [None]:
# lang_recommand = {user_id : [] for user_id in submission['userID'].unique()}
# def check_recoomand_language(df, filter_lang, lang_recommand, lang_thershold = 0.1):
#     reco_lang_result = dict()
#     for user_id in df['userID'].unique():
        
#         value_counts = view_log_concat_lang[view_log_concat_lang['userID'] == user_id]['Language'].value_counts()
#         view_articles_cnt = value_counts.sum()
        
#         for idx, result in enumerate(value_counts/view_articles_cnt > lang_thershold):
#             lang = value_counts.index[idx]

#             if result:
#                 if lang not in lang_recommand[user_id]:
#                     lang_recommand[user_id].append(lang)
        

In [None]:
def base_recoomand(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random, df_return = False):
    """Recommend each user's five highest-scoring articles (user-based CF baseline).

    An article's score for a user is the similarity-weighted sum of all users'
    values for that article, divided by the sum of absolute similarities in the
    user's row of the similarity matrix.

    The signature matches the call sites in this notebook and the sibling
    recommenders, which all pass ``(train_data, matrix, similarity)``. The
    previous two-positional form made those calls bind the similarity matrix
    to ``df_return``, and the body also read an undefined global.

    Args:
        train_data_with_lang_random: training view log with language column.
            Accepted for signature parity with the sibling recommenders; not
            used by this baseline.
        user_article_matrix_train_random: DataFrame, users as index, articles
            as columns.
        user_similarity_train_random: square user-by-user similarity array, in
            the same user order as the matrix index.
        df_return: when true, return the recommendation DataFrame instead of
            the score.

    Returns:
        A DataFrame with columns ``userID`` / ``articleID`` (five rows per
        user, best first) when ``df_return`` is true; otherwise the Recall@5
        of those recommendations against the module-level
        ``test_data_for_comparison``.
    """
    def get_language_filtered_recommendations(user_article_matrix, user_similarity):
        # Row-normalise by the total absolute similarity of each user.
        row_norms = np.array([np.abs(user_similarity).sum(axis=1)]).T
        user_predicted_scores = user_similarity.dot(np.asarray(user_article_matrix)) / row_norms

        recommendations = []
        for idx, user in enumerate(user_article_matrix.index):
            best_first = user_predicted_scores[idx].argsort()[::-1]
            top5recommend = user_article_matrix.columns[best_first[:5]].tolist()
            recommendations.extend([user, article] for article in top5recommend)
        return recommendations

    language_filtered_recommendations_original = get_language_filtered_recommendations(
        user_article_matrix_train_random, user_similarity_train_random
    )

    # Convert recommendations to DataFrame
    language_filtered_recommendations_original_df = pd.DataFrame(
        language_filtered_recommendations_original, columns=['userID', 'articleID']
    )

    if df_return:
        return language_filtered_recommendations_original_df

    # Calculate Recall@5 for the baseline recommendations
    return recall5(test_data_for_comparison, language_filtered_recommendations_original_df)

In [None]:
# Run the baseline recommender on the hold-out split; the trailing comment records a score from an earlier run.
# NOTE(review): three positional arguments are passed here - confirm this matches the
# signature of the base_recoomand definition currently in effect.
base_recoomand(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random)
#0.2095653272681324

In [None]:
# base - 0.19732120293151376


#### 임베딩 임계
- 0.5 -> 0.21183977760929995
- 0.45 -> 0.21183977760929995
- 0.4 -> 0.21259792772302247
- 0.35 -> 0.21259792772302247
- 0.3 -> 0.2123452110184483



In [None]:
# Write the recommendation DataFrame to the submission file (without the index column).
# NOTE(review): `df` is assigned in a cell further down (df = base_recoomand(..., df_return=True));
# that cell has to run before this one.
df.to_csv('submission_4.csv', index=False)

In [None]:
# 5번 빼기 0.18908263836239575
# 4번 빼기 0.18726307808946172
# 3번 빼기 0.1725676017184736
# 2번 빼기 0.16897902451352034
# 1번 빼기 0.1294667677533485


# 독점 언어만으로 추천

In [None]:
def only_top_lang_reocommand(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random):
    """Score recommendations restricted to each user's preferred language(s).

    For every user in the training log the preferred language is the single
    most-viewed language when some language exceeds 30% of the user's views;
    otherwise it is the list of languages above 10%. Candidate articles are the
    matrix columns whose language (per the module-level ``article_info``) is in
    that set, ranked by the similarity-weighted score; up to five are kept.

    Returns:
        Recall@5 of the recommendations against the module-level
        ``test_data_for_comparison``.
    """
    def pick_user_languages(df, lang_threshold=0.3):
        # user id -> one language name (str) or a list of language names
        chosen = {}
        for uid in df['userID'].unique():
            lang_counts = df[df['userID'] == uid]['Language'].value_counts()
            lang_share = lang_counts / lang_counts.sum()
            over_threshold = lang_share[lang_share > lang_threshold]
            if over_threshold.empty:
                # No clear favourite: keep every language above a 10% share.
                chosen[uid] = lang_counts[lang_share > 0.1].index.tolist()
            else:
                chosen[uid] = over_threshold.idxmax()  # the single dominant language
        return chosen

    def recommend_within_languages(user_article_matrix, user_similarity, significant_languages):
        row_norms = np.array([np.abs(user_similarity).sum(axis=1)]).T
        scores = user_similarity.dot(user_article_matrix) / row_norms
        all_articles = user_article_matrix.columns

        pairs = []
        for row_pos, user in enumerate(user_article_matrix.index):
            if user not in significant_languages:
                continue
            langs = significant_languages[user]
            if isinstance(langs, str):
                langs = [langs]
            in_language = article_info[article_info['Language'].isin(langs)]['articleID']
            candidates = all_articles.intersection(in_language)
            candidate_pos = all_articles.get_indexer(candidates)
            best_first = scores[row_pos, candidate_pos].argsort()[::-1]
            for article in candidates.values[best_first][:5]:
                pairs.append([user, article])
        return pairs

    languages_by_user = pick_user_languages(train_data_with_lang_random)
    recommended_pairs = recommend_within_languages(
        user_article_matrix_train_random, user_similarity_train_random, languages_by_user
    )
    recommendations_df = pd.DataFrame(recommended_pairs, columns=['userID', 'articleID'])

    return recall5(test_data_for_comparison, recommendations_df)


In [None]:
# Evaluate the language-restricted recommender on the hold-out split (returns Recall@5).
only_top_lang_reocommand(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random)

# 소수 언어 포함시 무조건 해당 언어 기사 들어가도록 설정

In [None]:
def add_minority_articles(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random, article_info):
    """Score recommendations that force in one article per minority language.

    For every user in the module-level ``test_data_random``:
      1. take up to five articles, ranked by similarity-weighted score, among
         the articles written in the user's "significant" languages;
      2. for each of those languages that is also a catalogue minority
         language (< 10% of ``article_info`` rows), insert one not yet
         recommended article of that language at the front;
      3. top the list up to five from a fallback pool and keep the first five.

    Changes from the previous version: the top-up step is a single bounded
    pass, so a fallback pool too small to reach five can no longer spin
    forever (``recall5`` then reports the short list instead). Minority
    languages are visited in a deterministic order rather than set order.

    Args:
        train_data_with_lang_random: training view log with a ``Language`` column.
        user_article_matrix_train_random: DataFrame, users x articles.
        user_similarity_train_random: user-by-user similarity array aligned
            with the matrix index.
        article_info: DataFrame with ``articleID`` and ``Language`` columns.

    Returns:
        Recall@5 against the module-level ``test_data_for_comparison``.
    """
    # Languages that account for less than 10% of the article catalogue.
    language_counts = article_info['Language'].value_counts()
    minority_filter = language_counts/language_counts.sum() < 0.1
    minority_languages = language_counts[minority_filter].index

    def check_recommended_language(df, lang_threshold=0.01):
        # NOTE(review): lang_threshold is accepted but never used; both cut-offs
        # below are hard-coded to 0.1. Left as is to keep the results unchanged.
        overall_counts = df['Language'].value_counts()
        rare_overall = set(overall_counts[overall_counts / overall_counts.sum() < 0.1].index)

        user_languages = {}
        # One pass over the log instead of re-filtering the frame per user.
        for user_id, user_langs in df.groupby('userID', sort=False)['Language']:
            user_counts = user_langs.value_counts()
            rare_for_user = user_counts[user_counts / user_counts.sum() < 0.1].index
            # Languages under 10% of this user's views AND under 10% of all views.
            user_languages[user_id] = [lang for lang in rare_for_user if lang in rare_overall]
        return user_languages

    def get_language_filtered_recommendations_with_fallback_and_minority(user_article_matrix, user_similarity, significant_languages, user_index, article_info, minority_languages):
        user_predicted_scores = user_similarity.dot(user_article_matrix) / np.array([np.abs(user_similarity).sum(axis=1)]).T
        recommendations = []
        # NOTE(review): this counts rows of article_info per articleID, not views,
        # so it is only a popularity ranking if article_info repeats article ids.
        global_top_articles = article_info['articleID'].value_counts().head(5).index.tolist()

        for user in user_index:
            recommended_articles = []
            user_langs = significant_languages.get(user, [])

            # Recommend from the user's significant languages
            if user_langs:
                articles_filtered = user_article_matrix.columns.intersection(article_info[article_info['Language'].isin(user_langs)]['articleID'])
                filtered_indices = user_article_matrix.columns.get_indexer(articles_filtered)
                user_row = user_article_matrix.index.get_loc(user)
                sorted_indices = user_predicted_scores[user_row, filtered_indices].argsort()[::-1]
                recommended_articles.extend(articles_filtered.values[sorted_indices][:5])

                # For each significant language that is a minority language,
                # put one not-yet-recommended article of that language on top.
                for lang in user_langs:
                    if lang not in minority_languages:
                        continue
                    minority_articles = article_info[article_info['Language'] == lang]['articleID'].values
                    for article in minority_articles:
                        if article not in recommended_articles:
                            recommended_articles.insert(0, article)
                            break

            # Top up from the fallback pool; a single pass is enough because a
            # second pass could never add an article the first one skipped.
            for article in global_top_articles:
                if len(recommended_articles) >= 5:
                    break
                if article not in recommended_articles:
                    recommended_articles.append(article)

            recommendations.extend([(user, article) for article in recommended_articles[:5]])

        return recommendations

    # Run the recommendation logic
    user_significant_languages_train_random = check_recommended_language(train_data_with_lang_random)
    test_user_index_random = test_data_random['userID'].unique()
    language_filtered_recommendations_random = get_language_filtered_recommendations_with_fallback_and_minority(
        user_article_matrix_train_random, user_similarity_train_random, user_significant_languages_train_random, test_user_index_random, article_info, minority_languages
    )

    # Convert the result to a DataFrame
    language_filtered_recommendations_random_df = pd.DataFrame(language_filtered_recommendations_random, columns=['userID', 'articleID'])

    # Calculate Recall@5 for the recommendations including minority language articles
    recall_at_5_with_minority = recall5(test_data_for_comparison, language_filtered_recommendations_random_df)
    return recall_at_5_with_minority

In [None]:
# Evaluate the minority-language recommender on the hold-out split (returns Recall@5).
add_minority_articles(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random, article_info)

In [None]:
# Accumulator for the country-weight sweep: the tried weight index plus one score list per recommender.
socres = {
    'weight' : [],
    'case1' : [],
    'case2' : [],
    'case3' : [],
}

In [None]:
from tqdm import tqdm
import json

# Sweep the country weight over idx * 0.5 x (mean positive similarity), idx = 0..100,
# and score the three recommenders for each setting.
for idx in tqdm(range(0, 101)):
    # Recompute the user-user similarity from scratch so weights do not accumulate across iterations
    user_similarity_train_random = cosine_similarity(user_article_matrix_train_random)


    mean_similarity = np.mean(user_similarity_train_random[user_similarity_train_random > 0])

    additional_weight = mean_similarity * (idx * 0.5)
    print('Additional_WEIGHT : ', additional_weight)
    user_index = user_article_matrix_train_random.index
    for i, user_i in enumerate(user_index):
        countries_i = user_countries[user_i]
        for j, user_j in enumerate(user_index[i+1:], start=i+1):  # start j at i+1: each unordered pair is visited once
            countries_j = user_countries[user_j]
            if countries_i & countries_j:  # the two users have at least one country in common
                # In this sweep the weight is ADDED for same-country pairs.
                user_similarity_train_random[i, j] += additional_weight
                user_similarity_train_random[j, i] += additional_weight  # mirror the update to keep the matrix symmetric


    case1_score = base_recoomand(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random)
    case2_score = only_top_lang_reocommand(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random)
    case3_score = add_minority_articles(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random, article_info)

    socres['weight'].append(idx)
    socres['case1'].append(case1_score)
    socres['case2'].append(case2_score)
    socres['case3'].append(case3_score)

    # Rewrite the whole results file after every setting, so partial results exist if the sweep is interrupted.
    with open('scores.json', 'w') as f:
            json.dump(socres, f)


In [None]:
# Reset the accumulator for the region-weight sweep; only the baseline recommender (case1) is tracked.
socres = {
    'weight' : [],
    'case1' : [],
    # 'case2' : [],
    # 'case3' : [],
}

In [None]:
from tqdm import tqdm
import json

# Sweep the region weight over idx * 0.05 x (mean positive similarity), idx = 0..49,
# with the country weight held at 0.2 x the mean, scoring the baseline recommender each time.
# NOTE(review): `user_regions` is not defined anywhere in the visible notebook; presumably it
# maps user id -> set of regions, built like `user_countries` - confirm before running.
for idx in tqdm(range(0, 50)):
    # Recompute the user-user similarity from scratch so weights do not accumulate across iterations
    user_similarity_train_random = cosine_similarity(user_article_matrix_train_random)


    mean_similarity = np.mean(user_similarity_train_random[user_similarity_train_random > 0])

    # Country weight is 0.2 x the mean positive similarity (an earlier comment said 10%; the factor is 0.2)
    additional_weight = mean_similarity * 0.2
    additional_region_weight = mean_similarity * (idx * 0.05)
    print('Additional_CONTRY_WEIGHT : ', additional_weight, 'Additional_REGION_WEIGHT: ', additional_region_weight)
    user_index = user_article_matrix_train_random.index
    for i, user_i in enumerate(user_index):
        countries_i = user_countries[user_i]
        regions_i = user_regions[user_i]
        for j, user_j in enumerate(user_index[i+1:], start=i+1):  # start j at i+1: each unordered pair is visited once
            countries_j = user_countries[user_j]
            regions_j = user_regions[user_j]

            if countries_i & countries_j:  # at least one country in common: similarity is reduced
                user_similarity_train_random[i, j] -= (additional_weight)
                user_similarity_train_random[j, i] -= (additional_weight)

            if regions_i & regions_j:  # at least one region in common: similarity is increased
                user_similarity_train_random[i, j] += (additional_region_weight)
                user_similarity_train_random[j, i] += (additional_region_weight)



    case1_score = base_recoomand(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random)
    # case2_score = only_top_lang_reocommand(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random)
    # case3_score = add_minority_articles(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random, article_info)

    socres['weight'].append(idx)
    socres['case1'].append(case1_score)
    # socres['case2'].append(case2_score)
    # socres['case3'].append(case3_score)

    # Rewrite the whole results file after every setting, so partial results exist if the sweep is interrupted.
    with open('scores_add_region.json', 'w') as f:
            json.dump(socres, f)


In [None]:
# Merge language information into the full view log (no hold-out: this feeds the final submission)
view_log_with_lang = pd.merge(view_log_train, article_info[['articleID', 'Language']], on='articleID')

# Build the user-article matrix (cell value = number of views)
user_article_matrix = view_log_train.groupby(['userID', 'articleID']).size().unstack(fill_value=0)

# Compute the similarity between users
user_similarity= cosine_similarity(user_article_matrix)


# Mean over the strictly positive similarity entries only
mean_similarity = np.mean(user_similarity[user_similarity > 0])

# Country weight = 0.2 x the mean positive similarity (an earlier comment said 10%; the factor is 0.2).
# The weight is subtracted below for every pair of users sharing a country.
additional_weight = mean_similarity * 0.2
print('Additional_CONTRY_WEIGHT : ', additional_weight)
user_index = user_article_matrix.index
for i, user_i in enumerate(user_index):
    countries_i = user_countries[user_i]
    for j, user_j in enumerate(user_index[i+1:], start=i+1):  # start j at i+1: each unordered pair is visited once
        countries_j = user_countries[user_j]
        if countries_i & countries_j:  # the two users have at least one country in common
            user_similarity[i, j] -= (additional_weight)
            user_similarity[j, i] -= (additional_weight)  # mirror the update to keep the matrix symmetric



In [None]:
# Build the final recommendations from the full view log; df_return=True asks for the DataFrame instead of a score.
# NOTE(review): three positional arguments are passed here - confirm this matches the
# signature of the base_recoomand definition currently in effect.
df = base_recoomand(view_log_with_lang, user_article_matrix, user_similarity, df_return=True)

## grid_search

In [None]:
import json
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

view_log = pd.read_csv('data/view_log.csv')
article_info = pd.read_csv('data/article_info.csv')
submission = pd.read_csv('data/sample_submission.csv')

# with open('title_relevance_docs_top10.json', 'r') as f:
#     title_relvance_score_dict = json.load(f)


# Recall@5 metric
def recall5(answer_df, submission_df):
    """Return the mean per-key Recall@5 of ``submission_df`` against ``answer_df``.

    The first column of ``answer_df`` is the grouping key and the second is
    the item column; ``submission_df`` is read through those same column names.

    Raises:
        ValueError: if some key in the submission does not have exactly five
            rows, if any predicted item is null, or if a key repeats an item.

    Returns:
        The mean, over keys present in both frames, of
        ``len(truth & predicted) / min(len(truth), 5)``; 0 when no key overlaps.
    """
    key_col, item_col = answer_df.columns[0], answer_df.columns[1]

    def _has_repeats(rows):
        return rows[item_col].duplicated().any()

    def _first_five(rows):
        return rows[item_col].head(5).tolist()

    def _all_items(rows):
        return rows[item_col].tolist()

    # The format checks run on the whole submission, before it is restricted
    # to the keys that appear in the answers.
    rows_per_key = submission_df.groupby(key_col).size()
    if not all(rows_per_key == 5):
        raise ValueError(f"Each {key_col} should have exactly 5 {item_col} predictions.")
    if submission_df[item_col].isnull().any():
        raise ValueError(f"Predicted {item_col} contains NULL values.")
    if submission_df.groupby(key_col).apply(_has_repeats).any():
        raise ValueError(f"Predicted {item_col} contains duplicates for some {key_col}.")

    scored_rows = submission_df[submission_df[key_col].isin(answer_df[key_col])]
    predicted = scored_rows.groupby(key_col).apply(_first_five).to_dict()
    truth = answer_df.groupby(key_col).apply(_all_items).to_dict()

    per_key_recall = []
    for key, relevant in truth.items():
        if key not in predicted:
            continue
        hits = set(relevant) & set(predicted[key])
        per_key_recall.append(len(hits) / min(len(relevant), 5))

    if not per_key_recall:
        return 0
    return np.mean(per_key_recall)



In [None]:
def base_recoomand(user_article_matrix_train_random, user_similarity_train_random, person_reco_dics, test_data_for_comparison, df_return = False):
    """Recommend each user's five highest-scoring articles (user-based CF baseline).

    An article's score for a user is the similarity-weighted sum of all users'
    matrix values for that article, divided by the sum of absolute similarities
    in the user's row.

    Args:
        user_article_matrix_train_random: DataFrame, users as index, articles as columns.
        user_similarity_train_random: user-by-user similarity array aligned with the matrix index.
        person_reco_dics: passed through to the inner helper, which does not read it.
        test_data_for_comparison: ground-truth frame handed to ``recall5``.
        df_return: when true, return the recommendation DataFrame instead of the score.

    Returns:
        DataFrame with columns ``userID`` / ``articleID`` when ``df_return`` is
        true, otherwise the Recall@5 of the recommendations.
    """
    def get_language_filtered_recommendations(user_article_matrix, user_similarity, person_reco_dics):
        similarity_mass = np.array([np.abs(user_similarity).sum(axis=1)]).T
        predicted = user_similarity.dot(user_article_matrix) / similarity_mass
        article_ids = user_article_matrix.columns

        pairs = []
        for row_pos, user in enumerate(user_article_matrix.index):
            # Highest score first; similarity-based picks fill all five slots.
            best_first = predicted[row_pos].argsort()[::-1]
            for article in article_ids[best_first][:5].tolist():
                pairs.append([user, article])
        return pairs

    recommended_pairs = get_language_filtered_recommendations(
        user_article_matrix_train_random, user_similarity_train_random, person_reco_dics
    )

    # Convert recommendations to DataFrame
    recommendations_df = pd.DataFrame(recommended_pairs, columns=['userID', 'articleID'])

    if not df_return:
        # Calculate Recall@5 for the baseline recommendations
        return recall5(test_data_for_comparison, recommendations_df)
    return recommendations_df

In [None]:
def add_minority_articles(train_data_with_lang_random, user_article_matrix_train_random, user_similarity_train_random, article_info, person_reco_dics, test_data_for_comparison, lang_thresholds, train_data_random, df_return = False):
    """Recommend five articles per user, led by unseen minority-language articles.

    For each user in the matrix, one unseen article is taken for every language
    selected by ``check_recommended_language``; the list is then filled up to
    five with the user's best similarity-weighted articles.

    Changes from the previous version:
      * no longer reads the global ``test_data_random`` (it only fed an unused
        local and raised NameError when that global did not exist);
      * ``df_return`` is honoured, like in ``base_recoomand``;
      * the fill-up step is one bounded pass instead of a ``while`` loop that
        could spin forever when fewer than five articles are available;
      * users missing from the language map get an empty language list instead
        of raising KeyError.

    Args:
        train_data_with_lang_random: training view log with a ``Language`` column.
        user_article_matrix_train_random: DataFrame, users x articles.
        user_similarity_train_random: user-by-user similarity array aligned
            with the matrix index.
        article_info: DataFrame with ``articleID`` and ``Language`` columns.
        person_reco_dics: accepted for signature parity; not used.
        test_data_for_comparison: ground-truth frame handed to ``recall5``.
        lang_thresholds: share threshold forwarded to the language check.
        train_data_random: training view log, used to find what a user has seen.
        df_return: when true, return the recommendation DataFrame.

    Returns:
        DataFrame with columns ``userID`` / ``articleID`` when ``df_return`` is
        true, otherwise the Recall@5 of the recommendations.
    """
    # Languages that account for less than 10% of the article catalogue.
    language_counts = article_info['Language'].value_counts()
    minority_filter = language_counts/language_counts.sum() < 0.1
    minority_languages = language_counts[minority_filter].index

    def check_recommended_language(df, minority_languages, lang_threshold=0.01):
        # NOTE(review): as written this yields an empty list for EVERY user.
        # A user with any language below the threshold is assigned [], and for
        # the remaining users the list is built from that same (then empty)
        # below-threshold selection. The condition looks inverted, but the
        # intended rule is unclear, so the behaviour is kept and flagged.
        user_languages = {}
        for user_id in df['userID'].unique():
            user_data = df[df['userID'] == user_id]
            value_counts = user_data['Language'].value_counts()
            total_views = value_counts.sum()
            minority_filter = value_counts/total_views < lang_threshold
            below_threshold = value_counts[minority_filter].index.tolist()
            if below_threshold:
                user_languages[user_id] = []
                continue
            user_languages[user_id] = [lang for lang in below_threshold if lang in minority_languages]
        return user_languages

    def get_language_filtered_recommendations_with_fallback_and_minority(user_article_matrix, user_similarity, significant_languages, article_info, person_reco_dics, train_data_random):
        user_predicted_scores = user_similarity.dot(user_article_matrix) / np.array([np.abs(user_similarity).sum(axis=1)]).T
        recommendations = []
        # Articles each user already viewed in training, built in one pass.
        seen_by_user = train_data_random.groupby('userID')['articleID'].agg(set).to_dict()

        for idx, user in enumerate(user_article_matrix.index):
            recommended_articles = []
            user_langs = significant_languages.get(user, [])
            sorted_indices = user_predicted_scores[idx].argsort()[::-1]
            user_seen = seen_by_user.get(user, set())

            # One unseen article per selected language goes in first.
            for lang in user_langs:
                minority_articles = article_info[article_info['Language'] == lang]['articleID'].values
                for article in minority_articles:
                    if article not in user_seen:
                        print(f'MINORITY ARTICLE IN {user} <- {article}')
                        recommended_articles.append(article)
                        break

            # Fill the remaining slots with the best-scoring articles; a single
            # pass over the ranking is enough and always terminates.
            for article in user_article_matrix.columns[sorted_indices]:
                if len(recommended_articles) >= 5:
                    break
                if article not in recommended_articles:
                    recommended_articles.append(article)

            recommendations.extend([(user, article) for article in recommended_articles[:5]])

        return recommendations

    # Run the recommendation logic
    user_significant_languages_train_random = check_recommended_language(train_data_with_lang_random, minority_languages, lang_thresholds)
    language_filtered_recommendations_random = get_language_filtered_recommendations_with_fallback_and_minority(
        user_article_matrix_train_random, user_similarity_train_random, user_significant_languages_train_random, article_info, person_reco_dics, train_data_random
    )

    # Convert the result to a DataFrame
    language_filtered_recommendations_random_df = pd.DataFrame(language_filtered_recommendations_random, columns=['userID', 'articleID'])

    if df_return:
        return language_filtered_recommendations_random_df

    # Calculate Recall@5 for the recommendations including minority language articles
    recall_at_5_with_minority = recall5(test_data_for_comparison, language_filtered_recommendations_random_df)
    return recall_at_5_with_minority

In [None]:
import matplotlib.pyplot as plt
import itertools
from collections import defaultdict

# Grid-search driver
def grid_search(nation_weight_values, visit_log_mins, remain_cnts, seeds, lang_thresholds =[]):
    """Evaluate the baseline recommender's Recall@5 over a hyper-parameter grid.

    For every seed and every ``remain_cnt`` a fresh random hold-out split of
    the module-level ``view_log`` is drawn. The training part is cleaned, and
    every ``(nation_weight, visit_log_min)`` combination is scored with
    ``base_recoomand``. After each evaluation the accumulated results are
    rewritten to a CSV file as a checkpoint.

    Fixes over the previous version:
      * training rows whose (userID, articleID) pair is in the test split are
        now actually removed (the earlier ``drop`` call discarded its result,
        so repeated views of a held-out article leaked into training) - scores
        are therefore not comparable with earlier runs;
      * each evaluation is recorded exactly once (the earlier code re-added
        all previous rows of the current split on every iteration).

    Args:
        nation_weight_values: multipliers of the mean positive similarity; the
            product is subtracted from the similarity of users sharing a country.
        visit_log_mins: upper caps applied to per-(user, article) view counts
            before the ``log1p`` transform.
        remain_cnts: how many views to keep per (user, article) when the
            article is listed under that same ``userID`` in ``article_info``
            (presumably the author viewing their own article).
        seeds: values passed to ``np.random.seed``.
        lang_thresholds: unused; kept for call compatibility.

    Returns:
        List of dicts with keys ``seed``, ``nation_weight``, ``visit_log_mins``,
        ``remain_cnts`` and ``recall``.
    """
    def select_random_articles(group):
        # At most 20% of the group, clamped to 1..5; the drawn size may be 0.
        max_articles = max(1, min(5, int(len(group) * 0.2)))
        num_articles = np.random.randint(0, max_articles + 1)
        return np.random.choice(group.index, num_articles, replace=False)

    def shared_country_mask(user_index):
        # Symmetric boolean matrix, True where two distinct users share a country.
        country_sets = [user_countries[user] for user in user_index]
        mask = np.zeros((len(country_sets), len(country_sets)), dtype=bool)
        for (i, countries_i), (j, countries_j) in itertools.combinations(enumerate(country_sets), 2):
            if countries_i & countries_j:
                mask[i, j] = mask[j, i] = True
        return mask

    pair_cols = ['userID', 'articleID']
    # Loop invariants: these depend only on the raw inputs.
    unique_views = view_log.drop_duplicates()
    user_countries = view_log.groupby('userID')['userCountry'].unique().apply(set)
    author_pairs = pd.MultiIndex.from_frame(article_info[pair_cols])

    results = []
    for seed in seeds:
        print('START SEED : ', seed)
        np.random.seed(seed)
        for remain_cnt in tqdm(remain_cnts):
            test_indices = unique_views.groupby('userID').apply(select_random_articles).explode().dropna().astype(int)
            test_data_random = view_log.loc[test_indices]
            train_data_random = view_log.drop(test_indices)

            # Remove every remaining training view of a held-out (user, article) pair.
            test_pairs = pd.MultiIndex.from_frame(test_data_random[pair_cols])
            train_pairs = pd.MultiIndex.from_frame(train_data_random[pair_cols])
            train_data_random = train_data_random[~train_pairs.isin(test_pairs)]

            test_data_for_comparison = test_data_random[pair_cols]

            # Views of an article by the user listed for it in article_info:
            # keep the first `remain_cnt` per (user, article), drop the rest.
            is_self_view = pd.MultiIndex.from_frame(train_data_random[pair_cols]).isin(author_pairs)
            self_views = train_data_random[is_self_view]
            repeat_rank = self_views.groupby(pair_cols).cumcount()
            drop_idx = repeat_rank[repeat_rank >= remain_cnt].index
            train_data_random = train_data_random.drop(index=drop_idx)

            visit_counts = train_data_random.groupby(pair_cols).size().reset_index(name='visit_count')
            person_reco_dics = {id: [] for id in train_data_random['userID'].unique()}
            # The set of users is the same for every combination of this split,
            # so the country mask is built once, on first use.
            country_mask = None

            for nation_weight, visit_log_min in itertools.product(nation_weight_values, visit_log_mins):
                weighted = visit_counts.copy()
                weighted['visit_count'] = weighted['visit_count'].clip(upper=visit_log_min)
                weighted['log_weight'] = np.log1p(weighted['visit_count'])
                user_article_matrix_train_random = weighted.pivot(index='userID', columns='articleID', values='log_weight').fillna(0)

                user_similarity_train_random = cosine_similarity(user_article_matrix_train_random)

                mean_similarity = np.mean(user_similarity_train_random[user_similarity_train_random > 0])
                additional_weight = mean_similarity * nation_weight
                if country_mask is None:
                    country_mask = shared_country_mask(user_article_matrix_train_random.index)
                # Penalise pairs of users that share a country.
                user_similarity_train_random[country_mask] -= additional_weight

                current_recall = base_recoomand(user_article_matrix_train_random, user_similarity_train_random, person_reco_dics, test_data_for_comparison, df_return=False)
                results.append({
                    'seed': seed,
                    'nation_weight': nation_weight,
                    'visit_log_mins' : visit_log_min,
                    'remain_cnts': remain_cnt,
                    'recall': current_recall
                })

                # Checkpoint: rewrite the full results table after every evaluation.
                pd.DataFrame(results).to_csv('pararm29_search_best_params_nationMinus_visitLogMinrange.csv', index=False)

    return results


In [None]:
import numpy as np
import random
from tqdm import tqdm
# Define the grid: country-weight multipliers, view-count caps and self-view keep counts
nation_weight_values = [0.79, 0.8, 1.0]#np.arange(0, 2.01, 0.01)
visit_log_mins = [8,9,10,11,12,13,14,15]
remain_cnts = [4,10,999]

# Draw 30 distinct split seeds reproducibly from 0..9998
random.seed(777)
seeds = random.sample([i for i in range(9999)], 30)



# Run grid search
grid_search_results = grid_search(nation_weight_values, visit_log_mins, remain_cnts, seeds, 
                                #   lang_thresholds
                                  )


기사 타이틀 간 임베딩 값으로 각 임베딩에 대한 스칼라를 구하고<br>
유저가 방문했던 기사들의 벡터 열을 가져와서 합함 -> 여기서 정규화를 어떻게??<br>
가져온 열 값을 유저 dot 기사 매트릭스에 추가 -> 가중치는 얼마?<br>
-> 결과 확인