In [1]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def recall5(answer_df, submission_df):
    """
    Calculate recall@5 for given dataframes.
    
    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions
    
    Returns:
    - recall: Recall@5 value
    """
    
    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]
    
    # Check if each primary_col entry has exactly 5 secondary_col predictions
    prediction_counts = submission_df.groupby(primary_col).size()
    if not all(prediction_counts == 5):
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")


    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")
    
    # Check for duplicates in the predicted secondary_col for each primary_col
    duplicated_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].duplicated().any())
    if duplicated_preds.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")


    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]
    
    # For each primary_col, get the top 5 predicted secondary_col values
    top_5_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].head(5).tolist()).to_dict()
    
    # Convert the answer_df to a dictionary for easier lookup
    true_dict = answer_df.groupby(primary_col).apply(lambda x: x[secondary_col].tolist()).to_dict()
    
    
    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_dict[key]) & set(top_5_preds[key]))
            individual_recall = correct_matches / min(len(val), 5) # 공정한 평가를 가능하게 위하여 분모(k)를 'min(len(val), 5)' 로 설정함 
            individual_recalls.append(individual_recall)


    recall = np.mean(individual_recalls)
    return recall

In [3]:
apply_train = pd.read_csv('./apply_train.csv')

In [4]:
from scipy.stats import pearsonr

def pearson_correlation(matrix):
    # NaN 값을 0으로 채우기
    filled_matrix = np.nan_to_num(matrix)

    # 피어슨 상관계수 계산
    correlation = np.corrcoef(filled_matrix)
    return correlation


In [5]:
#학습, 검증 분리
train, val = [], []
apply_train_groupby = apply_train.groupby('resume_seq')['recruitment_seq'].apply(list)
for uid, iids in zip(apply_train_groupby.index.tolist(), apply_train_groupby.values.tolist()):
    for iid in iids[:-1]:
        train.append([uid,iid])
    val.append([uid, iids[-1]])

In [6]:
train = pd.DataFrame(train, columns=['resume_seq', 'recruitment_seq'])
val = pd.DataFrame(val, columns=['resume_seq', 'recruitment_seq'])
pred = apply_train.copy()

In [7]:
train_user_item_matrix = train.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)
pred_user_item_matrix = pred.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)

In [8]:
train_user_similarity = pearson_correlation(train_user_item_matrix)
train_item_similarity = pearson_correlation(train_user_item_matrix.T)

pred_user_similarity = pearson_correlation(pred_user_item_matrix)
pred_item_similarity = pearson_correlation(pred_user_item_matrix.T)

In [9]:
train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)

pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

In [10]:
# alpha = 0.98
# train_recommendations = []
# for idx, user in tqdm(enumerate(train_user_item_matrix.index)):
#     applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)
    
#     # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
#     sorted_job_indices = (train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx]).argsort()[::-1]
#     recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]
    
#     for job in recommended_jobs:
#         train_recommendations.append([user, job])

In [11]:
import numpy as np

alpha_values = np.arange(0.01, 1.0, 0.01) 

best_alpha = None
best_recall = 0

for alpha in alpha_values:
    train_recommendations = []
    for idx, user in tqdm(enumerate(train_user_item_matrix.index)):
        applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)
        
        # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
        sorted_job_indices = (train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx]).argsort()[::-1]
        recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]
        
        for job in recommended_jobs:
            train_recommendations.append([user, job])
    
    val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])
    recall = recall5(val, val_prediction)
    
    print(f"Alpha: {alpha}, Recall@5: {recall}")

    if recall > best_recall:
        best_recall = recall
        best_alpha = alpha

print(f"Best Alpha: {best_alpha}, Best Recall@5: {best_recall}")


8482it [00:14, 583.43it/s]


Alpha: 0.01, Recall@5: 0.1254421127092667


8482it [00:14, 585.27it/s]


Alpha: 0.02, Recall@5: 0.1256779061542089


8482it [00:14, 600.44it/s]


Alpha: 0.03, Recall@5: 0.12520631926432446


8482it [00:14, 577.71it/s]


Alpha: 0.04, Recall@5: 0.12497052581938223


8482it [00:14, 598.94it/s]


Alpha: 0.05, Recall@5: 0.12497052581938223


8482it [00:13, 612.27it/s]


Alpha: 0.060000000000000005, Recall@5: 0.12508842254185334


8482it [00:14, 600.87it/s]


Alpha: 0.06999999999999999, Recall@5: 0.12508842254185334


8482it [00:14, 600.23it/s]


Alpha: 0.08, Recall@5: 0.1256779061542089


8482it [00:15, 553.73it/s]


Alpha: 0.09, Recall@5: 0.12579580287668002


8482it [00:14, 589.93it/s]


Alpha: 0.09999999999999999, Recall@5: 0.12591369959915114


8482it [00:14, 601.98it/s]


Alpha: 0.11, Recall@5: 0.12591369959915114


8482it [00:14, 600.85it/s]


Alpha: 0.12, Recall@5: 0.12591369959915114


8482it [00:14, 581.62it/s]


Alpha: 0.13, Recall@5: 0.12591369959915114


8482it [00:14, 587.89it/s]


Alpha: 0.14, Recall@5: 0.1262673897665645


8482it [00:13, 607.03it/s]


Alpha: 0.15000000000000002, Recall@5: 0.12673897665644895


8482it [00:14, 594.37it/s]


Alpha: 0.16, Recall@5: 0.12685687337892007


8482it [00:14, 568.84it/s]


Alpha: 0.17, Recall@5: 0.1270926668238623


8482it [00:14, 570.41it/s]


Alpha: 0.18000000000000002, Recall@5: 0.12768215043621786


8482it [00:14, 573.94it/s]


Alpha: 0.19, Recall@5: 0.12756425371374674


8482it [00:14, 598.63it/s]


Alpha: 0.2, Recall@5: 0.12756425371374674


8482it [00:14, 581.43it/s]


Alpha: 0.21000000000000002, Recall@5: 0.12756425371374674


8482it [00:14, 581.39it/s]


Alpha: 0.22, Recall@5: 0.12756425371374674


8482it [00:14, 585.04it/s]


Alpha: 0.23, Recall@5: 0.12756425371374674


8482it [00:14, 575.38it/s]


Alpha: 0.24000000000000002, Recall@5: 0.12744635699127566


8482it [00:15, 564.69it/s]


Alpha: 0.25, Recall@5: 0.12780004715868898


8482it [00:14, 587.79it/s]


Alpha: 0.26, Recall@5: 0.1279179438811601


8482it [00:14, 593.96it/s]


Alpha: 0.27, Recall@5: 0.12827163404857345


8482it [00:13, 607.42it/s]


Alpha: 0.28, Recall@5: 0.12815373732610233


8482it [00:14, 595.59it/s]


Alpha: 0.29000000000000004, Recall@5: 0.12815373732610233


8482it [00:14, 593.63it/s]


Alpha: 0.3, Recall@5: 0.1280358406036312


8482it [00:14, 592.94it/s]


Alpha: 0.31, Recall@5: 0.12815373732610233


8482it [00:14, 597.01it/s]


Alpha: 0.32, Recall@5: 0.1279179438811601


8482it [00:14, 590.71it/s]


Alpha: 0.33, Recall@5: 0.1279179438811601


8482it [00:14, 592.12it/s]


Alpha: 0.34, Recall@5: 0.12732846026880454


8482it [00:14, 574.37it/s]


Alpha: 0.35000000000000003, Recall@5: 0.1269747701013912


8482it [00:14, 571.74it/s]


Alpha: 0.36000000000000004, Recall@5: 0.12732846026880454


8482it [00:14, 586.17it/s]


Alpha: 0.37, Recall@5: 0.12744635699127566


8482it [00:14, 571.02it/s]


Alpha: 0.38, Recall@5: 0.12756425371374674


8482it [00:14, 583.28it/s]


Alpha: 0.39, Recall@5: 0.12732846026880454


8482it [00:14, 584.60it/s]


Alpha: 0.4, Recall@5: 0.12756425371374674


8482it [00:15, 565.13it/s]


Alpha: 0.41000000000000003, Recall@5: 0.12756425371374674


8482it [00:14, 580.33it/s]


Alpha: 0.42000000000000004, Recall@5: 0.12768215043621786


8482it [00:14, 572.85it/s]


Alpha: 0.43, Recall@5: 0.12768215043621786


8482it [00:14, 577.50it/s]


Alpha: 0.44, Recall@5: 0.1279179438811601


8482it [00:14, 590.64it/s]


Alpha: 0.45, Recall@5: 0.12780004715868898


8482it [00:14, 599.56it/s]


Alpha: 0.46, Recall@5: 0.1280358406036312


8482it [00:14, 599.31it/s]


Alpha: 0.47000000000000003, Recall@5: 0.1280358406036312


8482it [00:14, 582.87it/s]


Alpha: 0.48000000000000004, Recall@5: 0.12827163404857345


8482it [00:14, 587.30it/s]


Alpha: 0.49, Recall@5: 0.12815373732610233


8482it [00:14, 588.57it/s]


Alpha: 0.5, Recall@5: 0.12838953077104456


8482it [00:14, 596.10it/s]


Alpha: 0.51, Recall@5: 0.12827163404857345


8482it [00:14, 600.68it/s]


Alpha: 0.52, Recall@5: 0.12838953077104456


8482it [00:14, 592.92it/s]


Alpha: 0.53, Recall@5: 0.12850742749351568


8482it [00:14, 596.58it/s]


Alpha: 0.54, Recall@5: 0.12874322093845791


8482it [00:13, 605.99it/s]


Alpha: 0.55, Recall@5: 0.12909691110587126


8482it [00:14, 596.24it/s]


Alpha: 0.56, Recall@5: 0.12909691110587126


8482it [00:14, 595.53it/s]


Alpha: 0.5700000000000001, Recall@5: 0.12909691110587126


8482it [00:14, 601.33it/s]


Alpha: 0.5800000000000001, Recall@5: 0.12897901438340015


8482it [00:14, 578.97it/s]


Alpha: 0.59, Recall@5: 0.12874322093845791


8482it [00:14, 595.73it/s]


Alpha: 0.6, Recall@5: 0.1286253242159868


8482it [00:14, 598.26it/s]


Alpha: 0.61, Recall@5: 0.12874322093845791


8482it [00:14, 591.36it/s]


Alpha: 0.62, Recall@5: 0.1293327045508135


8482it [00:14, 597.97it/s]


Alpha: 0.63, Recall@5: 0.12909691110587126


8482it [00:14, 595.13it/s]


Alpha: 0.64, Recall@5: 0.12909691110587126


8482it [00:14, 585.40it/s]


Alpha: 0.65, Recall@5: 0.1286253242159868


8482it [00:14, 572.35it/s]


Alpha: 0.66, Recall@5: 0.12827163404857345


8482it [00:14, 589.29it/s]


Alpha: 0.67, Recall@5: 0.1286253242159868


8482it [00:14, 577.58it/s]


Alpha: 0.68, Recall@5: 0.1286253242159868


8482it [00:14, 582.36it/s]


Alpha: 0.6900000000000001, Recall@5: 0.12827163404857345


8482it [00:14, 597.36it/s]


Alpha: 0.7000000000000001, Recall@5: 0.1279179438811601


8482it [00:14, 597.45it/s]


Alpha: 0.7100000000000001, Recall@5: 0.12780004715868898


8482it [00:14, 603.56it/s]


Alpha: 0.72, Recall@5: 0.1279179438811601


8482it [00:14, 592.42it/s]


Alpha: 0.73, Recall@5: 0.12780004715868898


8482it [00:14, 588.72it/s]


Alpha: 0.74, Recall@5: 0.12815373732610233


8482it [00:14, 583.05it/s]


Alpha: 0.75, Recall@5: 0.12850742749351568


8482it [00:14, 588.08it/s]


Alpha: 0.76, Recall@5: 0.1280358406036312


8482it [00:14, 595.81it/s]


Alpha: 0.77, Recall@5: 0.1279179438811601


8482it [00:14, 590.79it/s]


Alpha: 0.78, Recall@5: 0.12768215043621786


8482it [00:14, 574.83it/s]


Alpha: 0.79, Recall@5: 0.12768215043621786


8482it [00:14, 588.46it/s]


Alpha: 0.8, Recall@5: 0.12780004715868898


8482it [00:14, 586.03it/s]


Alpha: 0.81, Recall@5: 0.12768215043621786


8482it [00:14, 589.58it/s]


Alpha: 0.8200000000000001, Recall@5: 0.1280358406036312


8482it [00:14, 588.07it/s]


Alpha: 0.8300000000000001, Recall@5: 0.12768215043621786


8482it [00:14, 584.80it/s]


Alpha: 0.8400000000000001, Recall@5: 0.12768215043621786


8482it [00:14, 589.52it/s]


Alpha: 0.85, Recall@5: 0.12780004715868898


8482it [00:14, 604.50it/s]


Alpha: 0.86, Recall@5: 0.12768215043621786


8482it [00:14, 596.09it/s]


Alpha: 0.87, Recall@5: 0.1279179438811601


8482it [00:14, 579.50it/s]


Alpha: 0.88, Recall@5: 0.12827163404857345


8482it [00:14, 579.88it/s]


Alpha: 0.89, Recall@5: 0.12838953077104456


8482it [00:13, 627.18it/s]


Alpha: 0.9, Recall@5: 0.12827163404857345


8482it [00:13, 615.79it/s]


Alpha: 0.91, Recall@5: 0.1280358406036312


8482it [00:13, 609.86it/s]


Alpha: 0.92, Recall@5: 0.12838953077104456


8482it [00:14, 605.71it/s]


Alpha: 0.93, Recall@5: 0.12838953077104456


8482it [00:13, 619.93it/s]


Alpha: 0.9400000000000001, Recall@5: 0.12827163404857345


8482it [00:13, 610.79it/s]


Alpha: 0.9500000000000001, Recall@5: 0.12827163404857345


8482it [00:14, 602.05it/s]


Alpha: 0.9600000000000001, Recall@5: 0.12827163404857345


8482it [00:13, 618.27it/s]


Alpha: 0.97, Recall@5: 0.12815373732610233


8482it [00:13, 615.51it/s]


Alpha: 0.98, Recall@5: 0.1280358406036312


8482it [00:13, 620.07it/s]


Alpha: 0.99, Recall@5: 0.12768215043621786
Best Alpha: 0.62, Best Recall@5: 0.1293327045508135


In [12]:
val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])

In [74]:
recall5(val,val_prediction)

0.1279179438811601

In [75]:
recall5(val,val_prediction)

0.1279179438811601

In [13]:
alpha = 0.6200000000000001
pred_recommendations = []
for idx, user in tqdm(enumerate(pred_user_item_matrix.index)):
    applied_jobs = set(pred_user_item_matrix.loc[user][pred_user_item_matrix.loc[user] == 1].index)
    
    sorted_job_indices = (pred_item_predicted_scores.loc[user].values * alpha + pred_user_predicted_scores[idx]).argsort()[::-1]
    recommended_jobs = [job for job in pred_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]
    
    for job in recommended_jobs:
        pred_recommendations.append([user, job])

8482it [00:17, 490.88it/s]


In [14]:
top_recommendations = pd.DataFrame(pred_recommendations, columns=['resume_seq', 'recruitment_seq'])
top_recommendations.to_csv('E:/국민대/open/models/baseline_add_item_6200000000000001.csv', index=False)