## Data check and pre-processing

In [1]:
import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from collections import defaultdict
from sklearn.decomposition import TruncatedSVD, NMF, SparsePCA
from sklearn.metrics.pairwise import cosine_similarity

import os

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
os.chdir('/content/drive/MyDrive/DACON_국민대AI/work/mysong')

In [5]:
recruitment = pd.read_csv("../../data/original/recruitment.csv")
company = pd.read_csv("../../data/original/company.csv")

# Merge recruitment with company
merged_recruitment = pd.merge(recruitment, company, left_on='recruitment_seq', right_on='recruitment_seq', how='left')

# rename column ('text_keyword' --> 'recruitment_text_keyword')
merged_recruitment = merged_recruitment.rename(columns={'text_keyword' : 'recruitment_text_keyword'})

merged_recruitment

Unnamed: 0,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,recruitment_text_keyword,company_type_seq,supply_kind,employee
0,R02264,3.0,,,0,0,2507;2707;2810,4,8,1,,5.0,402.0,800.0
1,R06317,3.0,,,0,0,2204;2205;2707,3,2,1,,,,
2,R04017,3.0,,,0,0,2101;2108;2201;2707,3,2,1,,,,
3,R02865,3.0,,,0,0,2201;2204;2205;2707,2,2,1,,,,
4,R04890,3.0,,,0,0,2201;2204;2205;2707,2,2,2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,R03678,3.0,,,0,0,2101;2108;2201;2204;2205;2707,3,2,1,,,,
6691,R04593,3.0,,,0,0,2201;2204;2205;2707,4,2,1,,,,
6692,R03252,3.0,,,0,0,2109,3,2,1,,4.0,402.0,525.0
6693,R05130,3.0,,,0,0,2201;2204;2205;2707,2,2,2,,2.0,402.0,40.0


In [6]:
merged_recruitment = merged_recruitment.sort_values('recruitment_seq').reset_index(drop=True)
merged_recruitment

Unnamed: 0,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,recruitment_text_keyword,company_type_seq,supply_kind,employee
0,R00001,5.0,,,0,0,2101;2108;2201;2204;2205;2707;2810,2,2,1,,5.0,201.0,631.0
1,R00002,3.0,,,0,0,2507;2703;2707,3,2,1,,2.0,201.0,160.0
2,R00003,3.0,,,0,0,2101;2108;2201;2707,3,2,2,,,,
3,R00004,3.0,,,0,0,2507;2707,3,2,1,,2.0,402.0,500.0
4,R00005,3.0,,,0,0,2507;2707,3,2,1,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,R06691,3.0,,,0,0,2501;2507;2707,3,2,1,,,,
6691,R06692,3.0,,,0,0,2201;2507,3,2,2,,4.0,402.0,150.0
6692,R06693,5.0,,,0,0,2102;2707,4,2,1,,,,
6693,R06694,3.0,,,0,0,2101;2108;2109;2110;2201;2203;2707,4,2,1,,,,


In [7]:
len(merged_recruitment['recruitment_seq'].unique())

6695

In [8]:
merged_recruitment['Content'] = merged_recruitment['check_box_keyword'] + ';' + merged_recruitment['recruitment_text_keyword'].fillna('')
merged_recruitment['Content'] = merged_recruitment['Content'].astype(str).apply(lambda x:x.replace(';', ' '))

merged_recruitment

Unnamed: 0,recruitment_seq,address_seq1,address_seq2,address_seq3,career_end,career_start,check_box_keyword,education,major_task,qualifications,recruitment_text_keyword,company_type_seq,supply_kind,employee,Content
0,R00001,5.0,,,0,0,2101;2108;2201;2204;2205;2707;2810,2,2,1,,5.0,201.0,631.0,2101 2108 2201 2204 2205 2707 2810
1,R00002,3.0,,,0,0,2507;2703;2707,3,2,1,,2.0,201.0,160.0,2507 2703 2707
2,R00003,3.0,,,0,0,2101;2108;2201;2707,3,2,2,,,,,2101 2108 2201 2707
3,R00004,3.0,,,0,0,2507;2707,3,2,1,,2.0,402.0,500.0,2507 2707
4,R00005,3.0,,,0,0,2507;2707,3,2,1,,,,,2507 2707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6690,R06691,3.0,,,0,0,2501;2507;2707,3,2,1,,,,,2501 2507 2707
6691,R06692,3.0,,,0,0,2201;2507,3,2,2,,4.0,402.0,150.0,2201 2507
6692,R06693,5.0,,,0,0,2102;2707,4,2,1,,,,,2102 2707
6693,R06694,3.0,,,0,0,2101;2108;2109;2110;2201;2203;2707,4,2,1,,,,,2101 2108 2109 2110 2201 2203 2707


In [None]:
merged_recruitment.to_csv('content_based_data.csv', index=False, encoding='utf-8-sig')

## Content-based System 구성하기
* merged_recruitment에 저장한 'check_box_keyword', 'recruitment_text_keyword' 컬럼 사용 (= 'Content' 컬럼 사용)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [10]:
# Use TF-IDF vectorizer to convert content into a matrix of TF-IDF features
tfidf_vectorizer = TfidfVectorizer()
content_matrix = tfidf_vectorizer.fit_transform(merged_recruitment['Content'])

content_similarity = linear_kernel(content_matrix, content_matrix)

In [11]:
def get_content_based_recommendations(product_id):
    index = merged_recruitment[merged_recruitment['recruitment_seq'] == product_id].index[0]
    similarity_scores = content_similarity[index]
    similar_indices = similarity_scores.argsort()[::-1][1:]
    recommendations = merged_recruitment.loc[similar_indices, 'recruitment_seq'].values
    return recommendations

In [None]:
get_content_based_recommendations('R01528')

array(['R05314', 'R01029', 'R05454', ..., 'R02220', 'R03767', 'R02657'],
      dtype=object)

## Recall 계산하기

In [12]:
def recall5(answer_df, submission_df):
    """
    Calculate recall@5 for given dataframes.

    Parameters:
    - answer_df: DataFrame containing the ground truth
    - submission_df: DataFrame containing the predictions

    Returns:
    - recall: Recall@5 value
    """

    primary_col = answer_df.columns[0]
    secondary_col = answer_df.columns[1]

    # Check if each primary_col entry has exactly 5 secondary_col predictions
    prediction_counts = submission_df.groupby(primary_col).size()
    if not all(prediction_counts == 5):
        raise ValueError(f"Each {primary_col} should have exactly 5 {secondary_col} predictions.")


    # Check for NULL values in the predicted secondary_col
    if submission_df[secondary_col].isnull().any():
        raise ValueError(f"Predicted {secondary_col} contains NULL values.")

    # Check for duplicates in the predicted secondary_col for each primary_col
    duplicated_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].duplicated().any())
    if duplicated_preds.any():
        raise ValueError(f"Predicted {secondary_col} contains duplicates for some {primary_col}.")


    # Filter the submission dataframe based on the primary_col present in the answer dataframe
    submission_df = submission_df[submission_df[primary_col].isin(answer_df[primary_col])]

    # For each primary_col, get the top 5 predicted secondary_col values
    top_5_preds = submission_df.groupby(primary_col).apply(lambda x: x[secondary_col].head(5).tolist()).to_dict()

    # Convert the answer_df to a dictionary for easier lookup
    true_dict = answer_df.groupby(primary_col).apply(lambda x: x[secondary_col].tolist()).to_dict()


    individual_recalls = []
    for key, val in true_dict.items():
        if key in top_5_preds:
            correct_matches = len(set(true_dict[key]) & set(top_5_preds[key]))
            individual_recall = correct_matches / min(len(val), 5) # 공정한 평가를 가능하게 위하여 분모(k)를 'min(len(val), 5)' 로 설정함
            individual_recalls.append(individual_recall)


    recall = np.mean(individual_recalls)
    return recall

## Collaborative Filtering System 구성하기
* user-based, item-based 모두 포함

In [13]:
data = pd.read_csv("../../data/original/apply_train.csv")
data

Unnamed: 0,resume_seq,recruitment_seq
0,U05833,R03838
1,U06456,R02144
2,U07807,R01877
3,U04842,R02463
4,U08336,R00112
...,...,...
57941,U02270,R03430
57942,U02640,R04987
57943,U08238,R01342
57944,U01296,R06363


In [14]:
#학습, 검증 분리
train, val = [], []
apply_train_groupby = data.groupby('resume_seq')['recruitment_seq'].apply(list)
for uid, iids in zip(apply_train_groupby.index.tolist(), apply_train_groupby.values.tolist()):
    for iid in iids[:-1]:
        train.append([uid,iid])
    val.append([uid, iids[-1]])

In [15]:
train = pd.DataFrame(train, columns=['resume_seq', 'recruitment_seq'])
val = pd.DataFrame(val, columns=['resume_seq', 'recruitment_seq'])
pred = data.copy()

In [16]:
train_user_item_matrix = train.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)
pred_user_item_matrix = pred.groupby(['resume_seq', 'recruitment_seq']).size().unstack(fill_value=0)

In [17]:
train_user_similarity = cosine_similarity(train_user_item_matrix)
train_item_similarity = cosine_similarity(train_user_item_matrix.T)

pred_user_similarity = cosine_similarity(pred_user_item_matrix)
pred_item_similarity = cosine_similarity(pred_user_item_matrix.T)

In [18]:
train_user_predicted_scores = train_user_similarity.dot(train_user_item_matrix)
train_item_predicted_scores = train_user_item_matrix.dot(train_item_similarity)

pred_user_predicted_scores = pred_user_similarity.dot(pred_user_item_matrix)
pred_item_predicted_scores = pred_user_item_matrix.dot(pred_item_similarity)

In [19]:
data = data.sort_values('resume_seq').reset_index(drop=True)

## Hybrid recommendations 최적 조합 찾기 (recall 성능 확인 가능)
* Collaborative Filtering System 포함 (특정 유저와 비슷한 집단이 가장 선호하는 아이템 N개 선정)
* Content-based System 포함 (CF 과정에서 고른 아이템 중 1순위와 가장 유사한 아이템 M개 선정)
* train_recommendations에 N개 + M개 아이템 지정 (단, N + M = 5)

In [20]:
user_info = data['resume_seq'].unique().tolist()

In [48]:
# alpha : '아이템-유저' 추천 점수 가중치 조정
# beta : '유저-아이템' 추천 점수 가중치 조정
alpha = 0.98
beta = 1.0
recommendations_li = []

for idx, user in tqdm(enumerate(user_info)):
  applied_jobs = set(train_user_item_matrix.loc[user][train_user_item_matrix.loc[user] == 1].index)

  # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
  sorted_job_indices = (train_item_predicted_scores.loc[user].values * alpha + train_user_predicted_scores[idx] * beta).argsort()[::-1]
  cf_recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:5]

  # CF 과정에서 얻은 아이템 추가
  for job in cf_recommended_jobs:
    recommendations_li.append([user, job])
    applied_jobs.add(job)

  # ⚠️ content-based 안 쓸려면 하단 코드 모두 주석 처리하기
  all_content_recommended_jobs = get_content_based_recommendations(cf_recommended_jobs[0])
  content_recommended_jobs = [job for job in all_content_recommended_jobs if job not in applied_jobs][0]

  # Content-based System 과정에서 얻은 아이템 추가
  if str(type(content_recommended_jobs)) == "<class 'str'>": # 아이템이 하나일 경우
    recommendations_li.append([user, content_recommended_jobs])
  else: # 아이템이 여럿일 경우
    for job in content_recommended_jobs:
      recommendations_li.append([user, job])

0it [00:00, ?it/s]

In [50]:
val_prediction = pd.DataFrame(recommendations_li, columns=['resume_seq', 'recruitment_seq'])
val_prediction

Unnamed: 0,resume_seq,recruitment_seq
0,U00001,R03811
1,U00001,R05862
2,U00001,R04769
3,U00001,R03777
4,U00001,R03037
...,...,...
42405,U08482,R04602
42406,U08482,R00473
42407,U08482,R05461
42408,U08482,R01186


In [47]:
# 4개는 CF 추천, 1개는 CF 추천 1위 항목과 유사한 Content-based 추천
recall5(val, val_prediction)

0.11046922895543504

In [51]:
# 5개 모두 CF 추천
recall5(val, val_prediction)

0.12909691110587126

## 제출 파일 csv 만들기

In [None]:
# alpha : '아이템-유저' 추천 점수 가중치 조정
# beta : '유저-아이템' 추천 점수 가중치 조정
alpha = 0.98
beta = 1.0
recommendations_li = []

for idx, user in tqdm(enumerate(user_info)):
  applied_jobs = set(pred_user_item_matrix.loc[user][pred_user_item_matrix.loc[user] == 1].index)

  # 해당 사용자의 추천 점수 (높은 점수부터 정렬)
  sorted_job_indices = (pred_item_predicted_scores.loc[user].values * alpha + pred_user_predicted_scores[idx] * beta).argsort()[::-1]
  cf_recommended_jobs = [job for job in train_user_item_matrix.columns[sorted_job_indices] if job not in applied_jobs][:4]

  # CF 과정에서 얻은 아이템 추가
  for job in cf_recommended_jobs:
    recommendations_li.append([user, job])
    applied_jobs.add(job)

  # ⚠️ content-based 안 쓸려면 하단 코드 모두 주석 처리하기
  all_content_recommended_jobs = get_content_based_recommendations(cf_recommended_jobs[0])
  content_recommended_jobs = [job for job in all_content_recommended_jobs if job not in applied_jobs][0]

  # Content-based System 과정에서 얻은 아이템 추가
  if str(type(content_recommended_jobs)) == "<class 'str'>": # 아이템이 하나일 경우
    recommendations_li.append([user, content_recommended_jobs])
  else: # 아이템이 여럿일 경우
    for job in content_recommended_jobs:
      recommendations_li.append([user, job])

In [None]:
top_recommendations = pd.DataFrame(recommendations_li, columns=['resume_seq', 'recruitment_seq'])

top_recommendations.to_csv('./hybrid_231111_mysong_submit.csv', index=False)