## Import

In [1]:
pip install -U ipykernel

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Data Load

In [6]:
# Read the application records from the CSV file in the working directory.
TRAIN_CSV_PATH = 'apply_train.csv'
apply_train_df = pd.read_csv(TRAIN_CSV_PATH)

## User-Item Matrix / Similarity / Score

In [47]:
# Build the user-item matrix: 1 where the applicant (resume_seq) applied to the
# posting (recruitment_seq), 0 otherwise.
# Count rows per (resume, posting) pair, pivot postings into columns with 0 for
# missing pairs, then cap repeated applications at 1 so the matrix is binary.
application_counts = apply_train_df.groupby(['resume_seq', 'recruitment_seq']).size()
user_item_matrix = application_counts.unstack(fill_value=0).clip(upper=1)
print(user_item_matrix)

recruitment_seq  R00001  R00002  R00003  R00004  R00005  R00006  R00007  \
resume_seq                                                                
U00001                0       0       0       0       0       0       0   
U00002                0       0       0       0       0       0       0   
U00003                0       0       0       0       0       0       0   
U00004                0       0       0       1       0       0       0   
U00005                0       0       0       0       0       0       0   
...                 ...     ...     ...     ...     ...     ...     ...   
U08478                0       0       0       0       0       0       0   
U08479                0       0       0       0       0       0       0   
U08480                0       0       0       0       0       0       0   
U08481                0       0       0       0       0       0       0   
U08482                0       0       0       0       0       0       0   

recruitment_seq  R00008 

In [10]:
# Compute the pairwise cosine similarity between users (rows of user_item_matrix).
user_similarity = cosine_similarity(user_item_matrix)
print(user_similarity)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [12]:
# Inspect the element-wise absolute value of the similarity matrix
# (the same quantity that is summed per row to normalise the scores).
np.abs(user_similarity)

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [16]:
# Inspect the per-user sum of absolute similarities as a column vector
# (one row per user); keepdims=True keeps the summed axis with length 1.
np.abs(user_similarity).sum(axis=1, keepdims=True)

array([[ 7.85696514],
       [12.01503866],
       [ 5.6812247 ],
       ...,
       [ 3.76220475],
       [ 5.32436944],
       [ 9.23056886]])

In [21]:
# Compute the recommendation scores: for every user, the similarity-weighted sum
# of all users' applications, divided by that user's total absolute similarity.
similarity_weighted_votes = user_similarity.dot(user_item_matrix)
# Column vector of row sums so the division broadcasts across each user's row.
similarity_row_totals = np.abs(user_similarity).sum(axis=1, keepdims=True)
user_predicted_scores = similarity_weighted_votes / similarity_row_totals
print(user_predicted_scores)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Prediction

In [22]:
# Inspect the row labels of the user-item matrix (the resume_seq values);
# the loops below pair these labels with their row positions.
user_item_matrix.index

Index(['U00001', 'U00002', 'U00003', 'U00004', 'U00005', 'U00006', 'U00007',
       'U00008', 'U00009', 'U00010',
       ...
       'U08473', 'U08474', 'U08475', 'U08476', 'U08477', 'U08478', 'U08479',
       'U08480', 'U08481', 'U08482'],
      dtype='object', name='resume_seq', length=8482)

In [23]:
# Print every row position next to the applicant ID stored at that position,
# showing how positions in the score array map to the matrix index labels.
for row_position, resume_id in enumerate(user_item_matrix.index):
    print(row_position, resume_id)

0 U00001
1 U00002
2 U00003
3 U00004
4 U00005
5 U00006
6 U00007
7 U00008
8 U00009
9 U00010
10 U00011
11 U00012
12 U00013
13 U00014
14 U00015
15 U00016
16 U00017
17 U00018
18 U00019
19 U00020
20 U00021
21 U00022
22 U00023
23 U00024
24 U00025
25 U00026
26 U00027
27 U00028
28 U00029
29 U00030
30 U00031
31 U00032
32 U00033
33 U00034
34 U00035
35 U00036
36 U00037
37 U00038
38 U00039
39 U00040
40 U00041
41 U00042
42 U00043
43 U00044
44 U00045
45 U00046
46 U00047
47 U00048
48 U00049
49 U00050
50 U00051
51 U00052
52 U00053
53 U00054
54 U00055
55 U00056
56 U00057
57 U00058
58 U00059
59 U00060
60 U00061
61 U00062
62 U00063
63 U00064
64 U00065
65 U00066
66 U00067
67 U00068
68 U00069
69 U00070
70 U00071
71 U00072
72 U00073
73 U00074
74 U00075
75 U00076
76 U00077
77 U00078
78 U00079
79 U00080
80 U00081
81 U00082
82 U00083
83 U00084
84 U00085
85 U00086
86 U00087
87 U00088
88 U00089
89 U00090
90 U00091
91 U00092
92 U00093
93 U00094
94 U00095
95 U00096
96 U00097
97 U00098
98 U00099
99 U00100
100 U00101

In [34]:
# Recommend while excluding postings already applied to (draft: this cell only
# prints each user's ranking of posting positions).
recommendations = []
for idx, user in enumerate(user_item_matrix.index):
    # Postings this user has already applied to (cells equal to 1 in their row).
    user_row = user_item_matrix.loc[user]
    applied_jobs = set(user_row[user_row == 1].index)

    # This user's recommendation scores, ordered from highest to lowest.
    # argsort returns the positions that would sort the scores in ascending
    # order; reversing that result puts the highest-scoring positions first.
    sorted_job_indices = np.argsort(user_predicted_scores[idx])[::-1]
    print(sorted_job_indices)

[4535 5287 5209 ... 4407 4408    0]
[1102 5471 6018 ... 4357 4358    0]
[3300 1459 4917 ... 4452 4453    0]
[3924 4708 2354 ... 4207 4208    0]
[ 373  831 3913 ... 4394 4395    0]
[4457 2994 1488 ... 4143 4144    0]
[ 634 2792 3748 ... 4438 4439    0]
[1624 3637 3938 ... 3192 3193 6694]
[2877 5199 5855 ... 4425 4426    0]
[ 276 1430 1793 ... 4188 4189 6694]
[1891 5773 3411 ... 4408 4409    0]
[4419 6445 3905 ... 4179 4180    0]
[2138 1642 4294 ... 4439 4440    0]
[4073 5600 2411 ... 4343 4345    0]
[2592  826 6293 ... 4379 4380    0]
[ 315 5563 4182 ... 4384 4385 3347]
[5713 3417 1457 ... 4365 4366    0]
[ 943  252 4106 ... 4348 4349    0]
[5994 2495  121 ... 4356 4357    0]
[2189   71 4526 ... 4357 4358    0]
[2206 4251 1809 ... 4297 4298 3347]
[4967 3473 3109 ... 4430 4431    0]
[3307 1063 4194 ... 4420 4421    0]
[2324 2400 3045 ... 4454 4455 3347]
[ 638 5423 1278 ... 4424 4425    0]
[3557 5901 6519 ... 4418 4420    0]
[ 913 5795 2965 ... 4370 4371    0]
[1384 2279 2441 ... 4439 444

In [45]:
# Recommend postings while excluding the ones each user has already applied to.
recommendations = []
for idx, user in enumerate(user_item_matrix.index):
    # Postings this user has already applied to (cells equal to 1 in their row).
    user_row = user_item_matrix.loc[user]
    applied_jobs = set(user_row[user_row == 1].index)

    # Posting labels ordered by this user's score, highest first.
    sorted_job_indices = np.argsort(user_predicted_scores[idx])[::-1]
    ranked_jobs = user_item_matrix.columns[sorted_job_indices]

    # Drop already-applied postings, then keep the first five that remain.
    unapplied_jobs = [job for job in ranked_jobs if job not in applied_jobs]
    recommended_jobs = unapplied_jobs[:5]

    # One [user, posting] pair per recommendation.
    recommendations.extend([user, job] for job in recommended_jobs)
print(recommendations[0])

['U00001', 'R01528']


## Submission

In [46]:
# Build a DataFrame laid out like sample_submission.csv: one row per
# (resume_seq, recruitment_seq) recommendation pair.
submission_columns = ['resume_seq', 'recruitment_seq']
top_recommendations = pd.DataFrame(data=recommendations, columns=submission_columns)

# Write the submission file without the DataFrame's row index.
top_recommendations.to_csv(path_or_buf='./baseline_submit.csv', index=False)