In [1]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def recall5(answer_df, submission_df):
    """
    Calculate recall@5 for given dataframes.

    The first column of ``answer_df`` is used as the grouping key and the
    second column as the item; ``submission_df`` must contain columns with
    the same two names.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: Recall@5 value, averaged over the keys of ``answer_df`` that
      also have predictions. Returns 0.0 when there is no such key (instead
      of NaN plus a NumPy RuntimeWarning).

    Raises:
    - ValueError: if any key does not have exactly 5 predictions, if a
      predicted item is NULL, or if a key has duplicated predicted items.
    """

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # Check if each primary_col entry has exactly 5 secondary_col predictions
    prediction_counts = submission_df.groupby(primary_col).size()
    if (prediction_counts != 5).any():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")

    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Check for duplicates in the predicted secondary_col for each primary_col.
    # NULLs were rejected above, so a group contains a duplicate exactly when
    # it has fewer distinct values than rows. Selecting the column before
    # aggregating avoids a whole-frame groupby.apply.
    distinct_counts = submission_df.groupby(primary_col)[secondary_col].nunique()
    if (distinct_counts < prediction_counts).any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values.
    # Iterating the groupby yields nothing for an empty frame, so the result
    # is then simply an empty dict.
    top_5_preds = {
        key: preds.head(5).tolist()
        for key, preds in submission_df.groupby(primary_col)[secondary_col]
    }

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = {
        key: truths.tolist()
        for key, truths in answer_df.groupby(primary_col)[secondary_col]
    }

    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(val) & set(top_5_preds[key]))
            # To keep the evaluation fair, the denominator (k) is set to
            # min(len(val), 5).
            individual_recalls.append(correct_matches / min(len(val), 5))

    # No key could be evaluated: report 0.0 rather than the mean of an empty list.
    if not individual_recalls:
        return 0.0

    recall = np.mean(individual_recalls)
    return recall

In [3]:
# Load the application log from the working directory. Later cells read its
# 'resume_seq' and 'recruitment_seq' columns.
apply_train = pd.read_csv('./apply_train.csv')

In [4]:
from sklearn.metrics.pairwise import cosine_similarity

def adjusted_cosine_similarity(matrix):
    """
    Cosine similarity of a matrix after centering it by row and column means.

    Row, column and overall means are computed with NaNs ignored. NaN cells
    are then replaced by 0, and each cell has its row mean and column mean
    subtracted and the overall mean added back. The centered matrix is passed
    to ``cosine_similarity``.

    NOTE(review): NaN cells are zero-filled *before* centering, so they do
    not stay at zero in the centered matrix - confirm this is intended.

    Parameters:
    - matrix: 2-D numeric array in which NaN marks a missing entry

    Returns:
    - similarity: whatever ``cosine_similarity`` returns for the centered matrix
    """
    # Means along each axis and over the whole matrix, ignoring NaN.
    per_row_mean = np.nanmean(matrix, axis=1, keepdims=True)
    per_col_mean = np.nanmean(matrix, axis=0, keepdims=True)
    overall_mean = np.nanmean(matrix)

    # Replace NaN values with 0.
    zero_filled = np.where(np.isnan(matrix), 0, matrix)

    # Subtract the row and column means, then add the overall mean back.
    centered = zero_filled - per_row_mean - per_col_mean + overall_mean

    # Compute the cosine similarity.
    return cosine_similarity(centered)


In [5]:
# Train / validation split: for every resume, the last recruitment in its
# list is held out for validation and all earlier ones go to training.
# NOTE(review): "last" means last in the CSV's row order - presumably
# chronological; verify against the data source.
train, val = [], []
apply_train_groupby = apply_train.groupby('resume_seq')['recruitment_seq'].apply(list)
for uid, iids in zip(apply_train_groupby.index.tolist(), apply_train_groupby.values.tolist()):
    train.extend([uid, seen_iid] for seen_iid in iids[:-1])
    val.append([uid, iids[-1]])

In [6]:
# Turn the [resume, recruitment] pair lists into DataFrames. The right-hand
# tuple is read before `train` / `val` are rebound.
train, val = (
    pd.DataFrame(pairs, columns=['resume_seq', 'recruitment_seq'])
    for pairs in (train, val)
)
# The final predictions are built from the complete application log.
pred = apply_train.copy()

In [7]:
# Resume x recruitment count matrices: each cell holds the number of rows for
# that pair, with 0 for pairs that never occur.
train_user_item_matrix, pred_user_item_matrix = (
    interactions.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)
    for interactions in (train, pred)
)

In [8]:
# # Compute adjusted cosine similarity on the training data
# train_user_similarity = adjusted_cosine_similarity(train_user_item_matrix)
# train_item_similarity = adjusted_cosine_similarity(train_user_item_matrix.T)

# # Compute adjusted cosine similarity on the prediction data
# pred_user_similarity = adjusted_cosine_similarity(pred_user_item_matrix)
# pred_item_similarity = adjusted_cosine_similarity(pred_user_item_matrix.T)

In [9]:
# Cosine similarity of each interaction matrix (user side) and of its
# transpose (item side), for the training split and for the full data.
train_user_similarity, train_item_similarity = (
    cosine_similarity(interaction_matrix)
    for interaction_matrix in (train_user_item_matrix, train_user_item_matrix.T)
)

pred_user_similarity, pred_item_similarity = (
    cosine_similarity(interaction_matrix)
    for interaction_matrix in (pred_user_item_matrix, pred_user_item_matrix.T)
)

In [10]:
# User-based scores: user similarity times the user-item matrix. `.dot` is
# called on the similarity result here, and later cells index this value by
# row position (`[idx]`).
train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
# Item-based scores: user-item matrix times item similarity. `.dot` is called
# on the DataFrame here, and later cells index this value by label
# (`.loc[user]`).
train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)

# Same two score matrices, computed from the full interaction data.
pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

In [11]:
# alpha = 0.98
# train_recommendations = []
# for idx, user in tqdm(enumerate(train_user_item_matrix.index)):
#     applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)
    
#     # Recommendation scores for this user (sorted from highest to lowest)
#     sorted_job_indices = (train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx]).argsort()[::-1]
#     recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]
    
#     for job in recommended_jobs:
#         train_recommendations.append([user, job])

In [13]:
# Grid-search the weight `alpha` applied to the item-based scores before they
# are added to the user-based scores. Each candidate is scored by Recall@5 on
# the held-out validation pairs.

# 70 evenly spaced candidates, 0.30 .. 0.99. linspace + round avoids the drift
# np.arange shows with a fractional step (e.g. 0.35000000000000003).
alpha_values = np.round(np.linspace(0.30, 0.99, 70), 2)

best_alpha = None
best_recall = 0

# Everything below is independent of alpha, so it is computed once instead of
# once per candidate.
train_users = train_user_item_matrix.index
train_jobs = train_user_item_matrix.columns
train_item_score_rows = train_item_predicted_scores.values
# Jobs each user already applied to. `> 0` rather than `== 1` because the
# matrix holds pair counts, so a pair seen more than once must be excluded too.
train_applied_jobs = [
    set(train_jobs[interaction_row > 0])
    for interaction_row in train_user_item_matrix.values
]

for alpha in alpha_values:
    train_recommendations = []
    # Wrapping the index (not the enumerate) lets tqdm know the total.
    for idx, user in enumerate(tqdm(train_users, desc=f"alpha={alpha:.2f}")):
        applied_jobs = train_applied_jobs[idx]

        # Blended score for this user, sorted from highest to lowest.
        blended_scores = train_item_score_rows[idx] * alpha + train_user_predicted_scores[idx]
        sorted_job_indices = blended_scores.argsort()[::-1]

        # The first 5 not-yet-applied jobs always lie within the first
        # 5 + len(applied_jobs) ranked jobs, so only that prefix is scanned.
        candidate_jobs = train_jobs[sorted_job_indices[:5 + len(applied_jobs)]]
        recommended_jobs = [job for job in candidate_jobs if job not in applied_jobs][:5]

        train_recommendations.extend([user, job] for job in recommended_jobs)

    val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])
    recall = recall5(val, val_prediction)

    print(f"Alpha: {alpha}, Recall@5: {recall}")

    # Strict '>' keeps the first alpha that reaches the best recall.
    if recall > best_recall:
        best_recall = recall
        best_alpha = alpha

# After the loop, `train_recommendations` and `val_prediction` hold the results
# of the LAST alpha tried, not necessarily the best one.
print(f"Best Alpha: {best_alpha}, Best Recall@5: {best_recall}")


0it [00:00, ?it/s]

8482it [00:12, 683.91it/s]


Alpha: 0.3, Recall@5: 0.1279179438811601


8482it [00:12, 669.91it/s]


Alpha: 0.31, Recall@5: 0.1280358406036312


8482it [00:12, 702.51it/s]


Alpha: 0.32, Recall@5: 0.1279179438811601


8482it [00:11, 724.01it/s]


Alpha: 0.33, Recall@5: 0.12768215043621786


8482it [00:11, 725.07it/s]


Alpha: 0.34, Recall@5: 0.1279179438811601


8482it [00:11, 723.05it/s]


Alpha: 0.35000000000000003, Recall@5: 0.1280358406036312


8482it [00:11, 730.08it/s]


Alpha: 0.36000000000000004, Recall@5: 0.12744635699127566


8482it [00:11, 738.22it/s]


Alpha: 0.37000000000000005, Recall@5: 0.12732846026880454


8482it [00:11, 751.49it/s]


Alpha: 0.38000000000000006, Recall@5: 0.12756425371374674


8482it [00:11, 738.25it/s]


Alpha: 0.39000000000000007, Recall@5: 0.12744635699127566


8482it [00:11, 735.96it/s]


Alpha: 0.4000000000000001, Recall@5: 0.12768215043621786


8482it [00:11, 731.53it/s]


Alpha: 0.4100000000000001, Recall@5: 0.1280358406036312


8482it [00:11, 715.12it/s]


Alpha: 0.4200000000000001, Recall@5: 0.12815373732610233


8482it [00:12, 704.16it/s]


Alpha: 0.4300000000000001, Recall@5: 0.12815373732610233


8482it [00:11, 714.76it/s]


Alpha: 0.4400000000000001, Recall@5: 0.1280358406036312


8482it [00:12, 703.90it/s]


Alpha: 0.4500000000000001, Recall@5: 0.12780004715868898


8482it [00:12, 681.41it/s]


Alpha: 0.46000000000000013, Recall@5: 0.12721056354633342


8482it [00:11, 719.62it/s]


Alpha: 0.47000000000000014, Recall@5: 0.12756425371374674


8482it [00:11, 711.07it/s]


Alpha: 0.48000000000000015, Recall@5: 0.12768215043621786


8482it [00:12, 705.17it/s]


Alpha: 0.49000000000000016, Recall@5: 0.12768215043621786


8482it [00:11, 708.47it/s]


Alpha: 0.5000000000000002, Recall@5: 0.12780004715868898


8482it [00:12, 674.79it/s]


Alpha: 0.5100000000000002, Recall@5: 0.1279179438811601


8482it [00:12, 696.90it/s]


Alpha: 0.5200000000000002, Recall@5: 0.12780004715868898


8482it [00:11, 718.49it/s]


Alpha: 0.5300000000000002, Recall@5: 0.1280358406036312


8482it [00:12, 705.39it/s]


Alpha: 0.5400000000000003, Recall@5: 0.12838953077104456


8482it [00:11, 716.27it/s]


Alpha: 0.5500000000000003, Recall@5: 0.12838953077104456


8482it [00:11, 710.23it/s]


Alpha: 0.5600000000000003, Recall@5: 0.12838953077104456


8482it [00:11, 728.92it/s]


Alpha: 0.5700000000000003, Recall@5: 0.12850742749351568


8482it [00:12, 696.55it/s]


Alpha: 0.5800000000000003, Recall@5: 0.12886111766092903


8482it [00:12, 695.89it/s]


Alpha: 0.5900000000000003, Recall@5: 0.12874322093845791


8482it [00:12, 693.37it/s]


Alpha: 0.6000000000000003, Recall@5: 0.12886111766092903


8482it [00:11, 707.03it/s]


Alpha: 0.6100000000000003, Recall@5: 0.1286253242159868


8482it [00:11, 736.23it/s]


Alpha: 0.6200000000000003, Recall@5: 0.1286253242159868


8482it [00:11, 747.56it/s]


Alpha: 0.6300000000000003, Recall@5: 0.12838953077104456


8482it [00:11, 731.50it/s]


Alpha: 0.6400000000000003, Recall@5: 0.12838953077104456


8482it [00:12, 692.11it/s]


Alpha: 0.6500000000000004, Recall@5: 0.12815373732610233


8482it [00:11, 711.08it/s]


Alpha: 0.6600000000000004, Recall@5: 0.1279179438811601


8482it [00:11, 734.50it/s]


Alpha: 0.6700000000000004, Recall@5: 0.1280358406036312


8482it [00:11, 709.72it/s]


Alpha: 0.6800000000000004, Recall@5: 0.12850742749351568


8482it [00:11, 720.39it/s]


Alpha: 0.6900000000000004, Recall@5: 0.12874322093845791


8482it [00:11, 724.98it/s]


Alpha: 0.7000000000000004, Recall@5: 0.1286253242159868


8482it [00:11, 730.12it/s]


Alpha: 0.7100000000000004, Recall@5: 0.12874322093845791


8482it [00:11, 725.00it/s]


Alpha: 0.7200000000000004, Recall@5: 0.1286253242159868


8482it [00:12, 701.40it/s]


Alpha: 0.7300000000000004, Recall@5: 0.1286253242159868


8482it [00:12, 691.47it/s]


Alpha: 0.7400000000000004, Recall@5: 0.12850742749351568


8482it [00:11, 711.97it/s]


Alpha: 0.7500000000000004, Recall@5: 0.12838953077104456


8482it [00:12, 699.00it/s]


Alpha: 0.7600000000000005, Recall@5: 0.12815373732610233


8482it [00:11, 716.50it/s]


Alpha: 0.7700000000000005, Recall@5: 0.1280358406036312


8482it [00:11, 725.52it/s]


Alpha: 0.7800000000000005, Recall@5: 0.1279179438811601


8482it [00:12, 705.18it/s]


Alpha: 0.7900000000000005, Recall@5: 0.1279179438811601


8482it [00:16, 515.73it/s]


Alpha: 0.8000000000000005, Recall@5: 0.12827163404857345


8482it [00:11, 715.01it/s]


Alpha: 0.8100000000000005, Recall@5: 0.12827163404857345


8482it [00:12, 700.39it/s]


Alpha: 0.8200000000000005, Recall@5: 0.12827163404857345


8482it [00:12, 702.43it/s]


Alpha: 0.8300000000000005, Recall@5: 0.12815373732610233


8482it [00:11, 717.99it/s]


Alpha: 0.8400000000000005, Recall@5: 0.1280358406036312


8482it [00:12, 693.44it/s]


Alpha: 0.8500000000000005, Recall@5: 0.1279179438811601


8482it [00:12, 682.22it/s]


Alpha: 0.8600000000000005, Recall@5: 0.12780004715868898


8482it [00:12, 706.59it/s]


Alpha: 0.8700000000000006, Recall@5: 0.12815373732610233


8482it [00:11, 733.84it/s]


Alpha: 0.8800000000000006, Recall@5: 0.12815373732610233


8482it [00:11, 738.37it/s]


Alpha: 0.8900000000000006, Recall@5: 0.1280358406036312


8482it [00:11, 737.01it/s]


Alpha: 0.9000000000000006, Recall@5: 0.12827163404857345


8482it [00:12, 688.62it/s]


Alpha: 0.9100000000000006, Recall@5: 0.12838953077104456


8482it [00:11, 718.31it/s]


Alpha: 0.9200000000000006, Recall@5: 0.12827163404857345


8482it [00:11, 707.23it/s]


Alpha: 0.9300000000000006, Recall@5: 0.12850742749351568


8482it [00:11, 709.04it/s]


Alpha: 0.9400000000000006, Recall@5: 0.12850742749351568


8482it [00:11, 739.13it/s]


Alpha: 0.9500000000000006, Recall@5: 0.12827163404857345


8482it [00:10, 780.00it/s]


Alpha: 0.9600000000000006, Recall@5: 0.12886111766092903


8482it [00:10, 777.54it/s]


Alpha: 0.9700000000000006, Recall@5: 0.12909691110587126


8482it [00:10, 783.72it/s]


Alpha: 0.9800000000000006, Recall@5: 0.12909691110587126


8482it [00:11, 763.09it/s]


Alpha: 0.9900000000000007, Recall@5: 0.12909691110587126
Best Alpha: 0.9700000000000006, Best Recall@5: 0.12909691110587126


In [73]:
# Rebuild the validation prediction frame from whatever `train_recommendations`
# currently holds.
# NOTE(review): the grid-search cell reassigns `train_recommendations` on every
# alpha, so this reflects the most recently executed run - confirm which alpha.
val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])

In [74]:
# Recall@5 of the current `val_prediction` against the held-out pairs.
recall5(val,val_prediction)

0.1279179438811601

In [75]:
# NOTE(review): same call as the previous cell - likely a leftover duplicate.
recall5(val,val_prediction)

0.1279179438811601

In [54]:
# Build the final top-5 recommendations for every resume from the full
# interaction data, excluding jobs the resume already applied to.
# NOTE(review): alpha is hard-coded to 0.98, while the export cell's file name
# says 0.97 - confirm which value is intended.
alpha = 0.98
pred_recommendations = []

# Plain arrays / index looked up once, instead of two `.loc` calls per user.
pred_jobs = pred_user_item_matrix.columns
pred_interaction_rows = pred_user_item_matrix.values
pred_item_score_rows = pred_item_predicted_scores.values

# Wrapping the index (not the enumerate) lets tqdm know the total.
for idx, user in enumerate(tqdm(pred_user_item_matrix.index)):
    # `> 0` rather than `== 1` because the matrix holds pair counts, so a pair
    # seen more than once must be excluded too.
    applied_jobs = set(pred_jobs[pred_interaction_rows[idx] > 0])

    # Blended score for this user, sorted from highest to lowest.
    blended_scores = pred_item_score_rows[idx] * alpha + pred_user_predicted_scores[idx]
    sorted_job_indices = blended_scores.argsort()[::-1]

    # The first 5 not-yet-applied jobs always lie within the first
    # 5 + len(applied_jobs) ranked jobs, so only that prefix is scanned.
    candidate_jobs = pred_jobs[sorted_job_indices[:5 + len(applied_jobs)]]
    recommended_jobs = [job for job in candidate_jobs if job not in applied_jobs][:5]

    pred_recommendations.extend([user, job] for job in recommended_jobs)

0it [00:00, ?it/s]

8482it [00:23, 367.43it/s]


In [55]:
# Write the final recommendations as a two-column CSV without the index.
# NOTE(review): the destination is an absolute, machine-specific path, and the
# file name says 0.97 although the cell that builds `pred_recommendations`
# sets alpha = 0.98 - confirm both before re-running.
top_recommendations = pd.DataFrame(pred_recommendations, columns=['resume_seq', 'recruitment_seq'])
top_recommendations.to_csv('E:/국민대/open/models/baseline_add_item_0.97.csv', index=False)