In [None]:
def recall5(answer_df, submission_df):
    """
    Calculate recall@5 for given dataframes.

    The first column of ``answer_df`` is used as the primary key and its
    second column as the item to be predicted; ``submission_df`` must contain
    columns with the same two names.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: Recall@5 value, averaged over the primary keys that appear in
      both frames. Returns 0.0 when no primary key is shared (instead of the
      NaN + RuntimeWarning that ``np.mean([])`` would produce).

    Raises:
    - ValueError: if any primary key in the submission does not have exactly
      5 predictions, if a predicted value is NULL, or if a primary key has
      duplicated predictions.
    """

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # Group only the predicted column: avoids handing whole frames (including
    # the grouping column) to a Python callback.
    predictions_by_key = submission_df.groupby(primary_col)[secondary_col]

    # Check if each primary_col entry has exactly 5 secondary_col predictions
    if (predictions_by_key.size() != 5).any():
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")

    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Check for duplicates in the predicted secondary_col for each primary_col
    if any(preds.duplicated().any() for _, preds in predictions_by_key):
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")

    # Keep only predictions for primary keys that exist in the ground truth
    relevant = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # primary key -> first 5 predicted items
    top_5_preds = {
        key: preds.head(5).tolist()
        for key, preds in relevant.groupby(primary_col)[secondary_col]
    }

    # primary key -> list of true items
    true_dict = {
        key: items.tolist()
        for key, items in answer_df.groupby(primary_col)[secondary_col]
    }

    individual_recalls = []
    for key, true_items in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_items) & set(top_5_preds[key]))
            # For a fair evaluation the denominator (k) is min(len(true_items), 5):
            # a key with fewer than 5 true items can still reach a recall of 1.
            individual_recalls.append(correct_matches / min(len(true_items), 5))

    # Keys missing from the submission are skipped, so the list can be empty.
    if not individual_recalls:
        return 0.0

    return np.mean(individual_recalls)

In [None]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA
from sklearn.metrics.pairwise import cosine_similarity

!pip install bayesian-optimization
from bayes_opt import BayesianOptimization

In [None]:
# Load the application log; later cells read its 'resume_seq' and 'recruitment_seq' columns.
apply_train = pd.read_csv('/kaggle/input/kookmin/apply_train.csv')

In [None]:
# Train / validation split: for every resume the last listed application is
# held out for validation and all earlier ones go to the training set.
apply_train_groupby = apply_train.groupby('resume_seq')['recruitment_seq'].apply(list)

train, val = [], []
for resume_id, applied in zip(apply_train_groupby.index.tolist(),
                              apply_train_groupby.values.tolist()):
    train.extend([resume_id, job_id] for job_id in applied[:-1])
    val.append([resume_id, applied[-1]])

In [None]:
# Full application log, used when producing the final predictions.
pred = apply_train.copy()

# Turn the [resume_seq, recruitment_seq] pair lists into DataFrames.
train, val = [
    pd.DataFrame(pairs, columns=['resume_seq', 'recruitment_seq'])
    for pairs in (train, val)
]

In [None]:
# --- Collaborative filtering on the training split -------------------------
# resume x recruitment matrix of application counts
train_user_item_matrix = (
    train
    .groupby(['resume_seq', 'recruitment_seq'])
    .size()
    .unstack(fill_value=0)
)
train_user_similarity = cosine_similarity(train_user_item_matrix)
train_item_similarity = cosine_similarity(train_user_item_matrix.T)
# user-based scores: (users x users) . (users x items)
train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
# item-based scores: (users x items) . (items x items)
train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)

# --- Same pipeline on the full application log (final predictions) ---------
pred_user_item_matrix = (
    pred
    .groupby(['resume_seq', 'recruitment_seq'])
    .size()
    .unstack(fill_value=0)
)
pred_user_similarity = cosine_similarity(pred_user_item_matrix)
pred_item_similarity = cosine_similarity(pred_user_item_matrix.T)
pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

# Load the resume side-information and replace missing values by the
# sentinel string 'NONE' so that every value can act as a category label.
resume_df = pd.read_csv('/kaggle/input/kookmin/resume.csv')
resume_education_df = pd.read_csv('/kaggle/input/kookmin/resume_education.csv')
resume_df = resume_df.fillna('NONE')
resume_education_df = resume_education_df.fillna('NONE')

# Raw feature columns as plain lists.
resume_seq = list(resume_df['resume_seq'])
degree = list(resume_df['degree'])
job_code_seq1 = list(resume_df['job_code_seq1'])
career_job_code = list(resume_df['career_job_code'])
career_month = list(resume_df['career_month'])
# NOTE(review): the three education columns below are paired with
# `resume_seq` taken from resume.csv purely by row position. This assumes
# resume_education.csv has the same rows in the same order -- TODO confirm.
hischool_type_seq = list(resume_education_df['hischool_type_seq'])
univ_type_seq1 = list(resume_education_df['univ_type_seq1'])
univ_major_type = list(resume_education_df['univ_major_type'])

# One (resume_seq, feature) table per feature.
degree_data = {'resume_seq': resume_seq,
               'degree': degree}
job_code_seq1_data = {'resume_seq': resume_seq,
                      'job_code_seq1': job_code_seq1}
career_job_code_data = {'resume_seq': resume_seq,
                        'career_job_code': career_job_code}
career_month_data = {'resume_seq': resume_seq,
                     'career_month': career_month}
hischool_type_seq_data = {'resume_seq': resume_seq,
                          'hischool_type_seq': hischool_type_seq}
univ_type_seq1_data = {'resume_seq': resume_seq,
                       'univ_type_seq1': univ_type_seq1}
univ_major_type_data = {'resume_seq': resume_seq,
                        'univ_major_type': univ_major_type}

degree_df = pd.DataFrame(degree_data)
job_code_seq1_df = pd.DataFrame(job_code_seq1_data)
career_job_code_df = pd.DataFrame(career_job_code_data)
career_month_df = pd.DataFrame(career_month_data)
# Bucket the career length into whole years (still expressed in months).
# This is a single vectorised column assignment: the previous element-wise
# `df['career_month'][i] = ...` was chained assignment, which triggers
# SettingWithCopyWarning and silently does nothing under pandas Copy-on-Write.
career_month_df['career_month'] = (career_month_df['career_month'] // 12) * 12
hischool_type_seq_df = pd.DataFrame(hischool_type_seq_data)
univ_type_seq1_df = pd.DataFrame(univ_type_seq1_data)
univ_major_type_df = pd.DataFrame(univ_major_type_data)

In [None]:
def _one_hot_by_resume(feature_df, feature_col):
    """Count rows per (resume_seq, feature value) and pivot to a resume x value matrix."""
    return feature_df.groupby(['resume_seq', feature_col]).size().unstack(fill_value=0)


def _l1_normalize_rows(scores):
    """Scale each row of ``scores`` (ndarray or DataFrame) so its absolute values sum to 1.

    Rows that are entirely zero are left as zeros. Dividing them by their zero
    sum would yield NaN, and NaN survives multiplication by a zero blend
    weight, which would corrupt the combined ranking for that resume.
    """
    row_totals = np.array([np.abs(scores).sum(axis=1)]).T
    row_totals[row_totals == 0] = 1
    return scores / row_totals


# resume x category indicator matrices, one per resume feature
degree_matrix = _one_hot_by_resume(degree_df, 'degree')
job_code_seq1_matrix = _one_hot_by_resume(job_code_seq1_df, 'job_code_seq1')
career_job_code_matrix = _one_hot_by_resume(career_job_code_df, 'career_job_code')
career_month_matrix = _one_hot_by_resume(career_month_df, 'career_month')
hischool_type_seq_matrix = _one_hot_by_resume(hischool_type_seq_df, 'hischool_type_seq')
univ_type_seq1_matrix = _one_hot_by_resume(univ_type_seq1_df, 'univ_type_seq1')
univ_major_type_matrix = _one_hot_by_resume(univ_major_type_df, 'univ_major_type')

# resume x resume cosine similarity for every feature
degree_similarity = cosine_similarity(degree_matrix)
job_code_seq1_similarity = cosine_similarity(job_code_seq1_matrix)
career_job_code_similarity = cosine_similarity(career_job_code_matrix)
career_month_similarity = cosine_similarity(career_month_matrix)
hischool_type_seq_similarity = cosine_similarity(hischool_type_seq_matrix)
univ_type_seq1_similarity = cosine_similarity(univ_type_seq1_matrix)
univ_major_type_similarity = cosine_similarity(univ_major_type_matrix)

# Propagate applications through each similarity (rows are matched to the
# user-item matrices by position) and L1-normalise every row so the sources
# are on a comparable scale before blending.
pred_degree_predicted_scores = _l1_normalize_rows(degree_similarity.dot(pred_user_item_matrix))
pred_job_predicted_scores = _l1_normalize_rows(job_code_seq1_similarity.dot(pred_user_item_matrix))
pred_career_predicted_scores = _l1_normalize_rows(career_job_code_similarity.dot(pred_user_item_matrix))
pred_month_predicted_scores = _l1_normalize_rows(career_month_similarity.dot(pred_user_item_matrix))
pred_hischool_predicted_scores = _l1_normalize_rows(hischool_type_seq_similarity.dot(pred_user_item_matrix))
pred_univ_predicted_scores = _l1_normalize_rows(univ_type_seq1_similarity.dot(pred_user_item_matrix))
pred_major_predicted_scores = _l1_normalize_rows(univ_major_type_similarity.dot(pred_user_item_matrix))

train_degree_predicted_scores = _l1_normalize_rows(degree_similarity.dot(train_user_item_matrix))
train_job_predicted_scores = _l1_normalize_rows(job_code_seq1_similarity.dot(train_user_item_matrix))
train_career_predicted_scores = _l1_normalize_rows(career_job_code_similarity.dot(train_user_item_matrix))
train_month_predicted_scores = _l1_normalize_rows(career_month_similarity.dot(train_user_item_matrix))
train_hischool_predicted_scores = _l1_normalize_rows(hischool_type_seq_similarity.dot(train_user_item_matrix))
train_univ_predicted_scores = _l1_normalize_rows(univ_type_seq1_similarity.dot(train_user_item_matrix))
train_major_predicted_scores = _l1_normalize_rows(univ_major_type_similarity.dot(train_user_item_matrix))

# The collaborative-filtering scores computed earlier get the same normalisation.
train_item_predicted_scores = _l1_normalize_rows(train_item_predicted_scores)
train_user_predicted_scores = _l1_normalize_rows(train_user_predicted_scores)
pred_item_predicted_scores = _l1_normalize_rows(pred_item_predicted_scores)
pred_user_predicted_scores = _l1_normalize_rows(pred_user_predicted_scores)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [None]:
# Keywords are ';'-separated in the raw column; swap the separator for a
# space so each keyword is treated as a whitespace-delimited token.
text_keyword = list(resume_df['text_keyword'])
text_keyword_new = [keywords.replace(";", " ") for keywords in text_keyword]

In [None]:
# Preview a few cleaned keyword strings instead of dumping the whole list.
print(text_keyword_new[:5])

In [None]:
# Fit a TF-IDF model on the keyword strings (one document per resume).
tf_idf_model = TfidfVectorizer()
tf_idf_model.fit(text_keyword_new)

# Vocabulary terms ordered by their column index in the TF-IDF matrix.
word_id_list = sorted(tf_idf_model.vocabulary_.items(), key=lambda entry: entry[1])
word_list = [word for word, _ in word_id_list]

# Dense resume x term table, labelled by resume_seq (rows) and term (columns).
tf_idf_df = pd.DataFrame(
    tf_idf_model.transform(text_keyword_new).toarray(),
    index=resume_seq,
    columns=word_list,
)

print(tf_idf_df)

In [None]:
# tf_idf_df keeps the row order of resume.csv, while the user-item matrices
# are built with groupby and are therefore ordered by sorted resume_seq.
# The scores below are matched to those matrices by row position, so put the
# TF-IDF rows into the same sorted order first (a no-op if already sorted).
text_keyword_matrix = tf_idf_df.sort_index()
text_keyword_similarity = cosine_similarity(text_keyword_matrix)

# Propagate applications through the keyword similarity.
pred_keyword_predicted_scores = text_keyword_similarity.dot(pred_user_item_matrix)
train_keyword_predicted_scores = text_keyword_similarity.dot(train_user_item_matrix)

# L1-normalise every row so the scores are comparable with the other sources.
train_keyword_predicted_scores = train_keyword_predicted_scores / np.array([np.abs(train_keyword_predicted_scores).sum(axis=1)]).T
pred_keyword_predicted_scores = pred_keyword_predicted_scores / np.array([np.abs(pred_keyword_predicted_scores).sum(axis=1)]).T

In [None]:
# Sanity check: print the largest absolute row sum of every training score matrix.
for scores in (
    train_item_predicted_scores,
    train_user_predicted_scores,
    train_degree_predicted_scores,
    train_job_predicted_scores,
    train_career_predicted_scores,
    train_month_predicted_scores,
    train_hischool_predicted_scores,
    train_univ_predicted_scores,
    train_major_predicted_scores,
    train_keyword_predicted_scores,
):
    print(max(np.array([np.abs(scores).sum(axis=1)]).T))

In [None]:
# Eyeball every training score matrix (pandas prints a truncated view).
for scores in (
    train_item_predicted_scores,
    train_user_predicted_scores,
    train_degree_predicted_scores,
    train_job_predicted_scores,
    train_career_predicted_scores,
    train_month_predicted_scores,
    train_hischool_predicted_scores,
    train_univ_predicted_scores,
    train_major_predicted_scores,
    train_keyword_predicted_scores,
):
    print(pd.DataFrame(scores))

In [None]:
# Blend weights, one per score matrix (a weight of 0 switches that source off).
alpha = 1000    # user-based collaborative filtering
beta = 750      # item-based collaborative filtering
delta = 69.62   # job_code_seq1 similarity
epsilon = 0     # career_job_code similarity
eta = 100       # hischool_type_seq similarity
gamma = 0       # degree similarity
iota = 100      # univ_major_type similarity
kappa = 0       # text_keyword TF-IDF similarity
theta = 46.7    # univ_type_seq1 similarity
zeta = 0        # career_month similarity


train_recommendations = []
for idx, user in tqdm(enumerate(train_user_item_matrix.index),
                      total=len(train_user_item_matrix.index)):
    # Jobs this resume already applied to. `> 0` rather than `== 1` so that a
    # job applied to more than once (count > 1) is excluded as well.
    user_row = train_user_item_matrix.loc[user]
    applied_jobs = set(user_row.index[user_row.to_numpy() > 0])

    # Weighted blend of all score sources for this user, ranked best first.
    sorted_job_indices = (train_user_predicted_scores[idx] * alpha
                          + train_item_predicted_scores.loc[user].values * beta
                          + train_degree_predicted_scores[idx] * gamma
                          + train_job_predicted_scores[idx] * delta
                          + train_career_predicted_scores[idx] * epsilon
                          + train_month_predicted_scores[idx] * zeta
                          + train_hischool_predicted_scores[idx] * eta
                          + train_univ_predicted_scores[idx] * theta
                          + train_major_predicted_scores[idx] * iota
                          + train_keyword_predicted_scores[idx] * kappa).argsort()[::-1]

    # At most len(applied_jobs) of the leading entries can be filtered out, so
    # the first 5 + len(applied_jobs) ranks always contain the final top 5.
    shortlist = train_user_item_matrix.columns[sorted_job_indices[:5 + len(applied_jobs)]]
    recommended_jobs = [job for job in shortlist if job not in applied_jobs][:5]
    train_recommendations.extend([user, job] for job in recommended_jobs)

In [None]:
# Score the recommendations built from the training split against the
# held-out applications in `val`; the recall@5 value is the cell's output.
val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])
recall5(val,val_prediction)

In [None]:
# Search range for every blend weight: each one may vary between 0 and 100.
parameter_bounds = dict.fromkeys(
    ('alpha', 'beta', 'gamma', 'delta', 'epsilon',
     'zeta', 'eta', 'theta', 'iota', 'kappa'),
    (0, 100),
)

def bo(alpha, beta, gamma, delta, epsilon, zeta, eta, theta, iota, kappa):
    """Objective for the weight search: recall@5 on the validation split.

    Each argument is the blend weight of one training score matrix:
    alpha (user-based CF), beta (item-based CF), gamma (degree),
    delta (job_code_seq1), epsilon (career_job_code), zeta (career_month),
    eta (hischool_type_seq), theta (univ_type_seq1), iota (univ_major_type),
    kappa (text_keyword).

    Returns:
        The value of ``recall5(val, val_prediction)`` for the top-5
        recommendations produced with these weights.
    """
    train_recommendations = []
    # leave=False: the objective is evaluated many times, so do not keep a
    # finished progress bar around for every evaluation.
    for idx, user in tqdm(enumerate(train_user_item_matrix.index),
                          total=len(train_user_item_matrix.index),
                          leave=False):
        # Jobs this resume already applied to. `> 0` rather than `== 1` so a
        # job applied to more than once (count > 1) is excluded as well.
        user_row = train_user_item_matrix.loc[user]
        applied_jobs = set(user_row.index[user_row.to_numpy() > 0])

        # Weighted blend of all score sources for this user, ranked best first.
        sorted_job_indices = (train_user_predicted_scores[idx] * alpha
                              + train_item_predicted_scores.loc[user].values * beta
                              + train_degree_predicted_scores[idx] * gamma
                              + train_job_predicted_scores[idx] * delta
                              + train_career_predicted_scores[idx] * epsilon
                              + train_month_predicted_scores[idx] * zeta
                              + train_hischool_predicted_scores[idx] * eta
                              + train_univ_predicted_scores[idx] * theta
                              + train_major_predicted_scores[idx] * iota
                              + train_keyword_predicted_scores[idx] * kappa).argsort()[::-1]

        # At most len(applied_jobs) leading entries can be filtered out, so the
        # first 5 + len(applied_jobs) ranks always contain the final top 5.
        shortlist = train_user_item_matrix.columns[sorted_job_indices[:5 + len(applied_jobs)]]
        recommended_jobs = [job for job in shortlist if job not in applied_jobs][:5]
        train_recommendations.extend([user, job] for job in recommended_jobs)

    val_prediction = pd.DataFrame(train_recommendations, columns=['resume_seq', 'recruitment_seq'])

    score = recall5(val, val_prediction)

    return score

# Search the blend weights inside `parameter_bounds`, maximising the recall@5
# returned by `bo`. random_state=0 fixes the optimiser's seed.
# NOTE: this requests 250 initial points plus 500 iterations, and every
# evaluation of `bo` loops over all resumes, so this cell is slow.
BO = BayesianOptimization(f = bo, pbounds = parameter_bounds, random_state = 0)
BO.maximize(init_points = 250, n_iter = 500)

In [None]:
# Blend weights used for the final submission (a weight of 0 switches that source off).
alpha = 1000    # user-based collaborative filtering
beta = 700      # item-based collaborative filtering
delta = 69.62   # job_code_seq1 similarity
epsilon = 0     # career_job_code similarity
eta = 100       # hischool_type_seq similarity
gamma = 0       # degree similarity
iota = 100      # univ_major_type similarity
kappa = 0       # text_keyword TF-IDF similarity
theta = 46.7    # univ_type_seq1 similarity
zeta = 0        # career_month similarity

pred_recommendations = []
for idx, user in tqdm(enumerate(pred_user_item_matrix.index),
                      total=len(pred_user_item_matrix.index)):
    # Jobs this resume already applied to. `> 0` rather than `== 1` so that a
    # job applied to more than once (count > 1) is excluded as well.
    user_row = pred_user_item_matrix.loc[user]
    applied_jobs = set(user_row.index[user_row.to_numpy() > 0])

    # Weighted blend of all score sources for this user, ranked best first.
    sorted_job_indices = (pred_user_predicted_scores[idx] * alpha
                          + pred_item_predicted_scores.loc[user].values * beta
                          + pred_degree_predicted_scores[idx] * gamma
                          + pred_job_predicted_scores[idx] * delta
                          + pred_career_predicted_scores[idx] * epsilon
                          + pred_month_predicted_scores[idx] * zeta
                          + pred_hischool_predicted_scores[idx] * eta
                          + pred_univ_predicted_scores[idx] * theta
                          + pred_major_predicted_scores[idx] * iota
                          + pred_keyword_predicted_scores[idx] * kappa).argsort()[::-1]

    # At most len(applied_jobs) of the leading entries can be filtered out, so
    # the first 5 + len(applied_jobs) ranks always contain the final top 5.
    shortlist = pred_user_item_matrix.columns[sorted_job_indices[:5 + len(applied_jobs)]]
    recommended_jobs = [job for job in shortlist if job not in applied_jobs][:5]
    pred_recommendations.extend([user, job] for job in recommended_jobs)

In [None]:
# Collect the (resume_seq, recruitment_seq) recommendation pairs and write
# them to a CSV file without the DataFrame index.
top_recommendations = pd.DataFrame(pred_recommendations, columns=['resume_seq', 'recruitment_seq'])
top_recommendations.to_csv('alpha_to_epsilon_last_final.csv', index=False)