In [5]:
import matplotlib.pyplot as plt
import csv
import numpy as np
import torch
from torch import nn, optim

In [6]:
from tqdm.notebook import tqdm

In [7]:
import torch.nn.functional as F

In [8]:
from torch.utils.data import Dataset, DataLoader

In [9]:
if torch.cuda.is_available():
    device = torch.device("cuda")  

In [10]:
dataset_train = list()


# 1. Train Data Pairs
with open('train_job/train.csv', 'r') as f:
    csv_reader = csv.reader(f)
    
    for i, line in enumerate(csv_reader):

        if i == 0:
            pass
        else:
            dataset_train.append([line[0], line[1], int(line[2])])

# 2. Job id to number/ Job tags

job_dict = dict()

with open('train_job/job_tags.csv', 'r') as f:
    csv_reader = csv.reader(f)
    
    for i, line in enumerate(csv_reader):
        
        if i == 0:
            pass
        else:
            try:
                job_dict[line[0]].append(line[1])
            except KeyError:
                job_dict[line[0]] = [line[1]]
                
# 3. Job id to company code and size. Not using right now
                
company_set = set()
jobs_set = set()
company_size_set = set()
job_id_to_com_size_dict = dict()

scales = {'': 0, 
          '1-10': 1, 
          '11-50': 2, 
          '51-100': 3, 
          '101-200': 4, 
          '201-500': 5, 
          '501-1000': 6, 
          '1000 이상': 7}

with open('train_job/job_companies.csv', 'r') as f:
    csv_reader = csv.reader(f)
    
    for i, line in enumerate(csv_reader):
        
        if i == 0:
            pass
#             print(line)
        else:
            company_set.add(line[0])
            jobs_set.add(line[1])
            company_size_set.add(line[2])
            job_id_to_com_size_dict[line[1]] = scales[line[2]]

# 4. User skills
            
user_dict = dict()

with open('train_job/user_tags.csv', 'r') as f:
    csv_reader = csv.reader(f)
    
    for i, line in enumerate(csv_reader):
        
        if i != 0:
    
            try:
                user_dict[line[0]].append(line[1])
            except KeyError:
                user_dict[line[0]] = [line[1]]

# 5. Job required skills
                
tag_dictionary = dict()
tag_idx_to_tag_list = list()

with open('train_job/tags.csv', 'r') as f:
    csv_reader = csv.reader(f)
    
    for i, line in enumerate(csv_reader):
        
        if i != 0:
            tag_dictionary[line[0]] = i - 1
            tag_idx_to_tag_list.append(line)
            assert len(tag_idx_to_tag_list) == i
    
def tagID2Vector(tag_list):
    vector = np.zeros(len(tag_dictionary))
    
    for tag in tag_list:
        vector[tag_dictionary[tag]] = 1
    
    return vector

def jobID2Vector(job_id):
    vector = np.zeros(len(tag_dictionary))
    for tag in job_dict[job_id]:
        vector[tag_dictionary[tag]] = 1
    assert np.sum(vector) == len(job_dict[job_id]), f'{np.sum(vector)} != {len(job_dict[job_id])}'
    return vector

def jobID2CompanyScale(job_id):
    vector = np.zeros(len(tag_dictionary))
    for tag in job_dict[job_id]:
        vector[tag_dictionary[tag]] = 1
    assert np.sum(vector) == len(job_dict[job_id]), f'{np.sum(vector)} != {len(job_dict[job_id])}'
    return vector

In [11]:
for job in job_dict:
    assert len(job_dict[job]) == len(set(job_dict[job]))

In [12]:
def collate_function(data_input):

    global user_dict
    
    person_id_list = list()
    job_id_list = list()
    answer = list()

    for data in data_input:
        person_id_list.append(data[0])
        job_id_list.append(data[1])
        answer.append(data[2])
    
    person_tensor = list()
    for person_id in person_id_list:
        user_skill_set = set(user_dict[person_id])
        vector = tagID2Vector(user_skill_set)
        person_tensor.append(vector)
    person_tensor = torch.tensor(person_tensor)
        
    job_tensor = list()
    for job_id in job_id_list:
        vector = jobID2Vector(job_id)
        job_tensor.append(vector)
    job_tensor = torch.tensor(job_tensor)
        
    answer_tensor = torch.tensor(answer)
      
    com_size_tensor = list()
    for job_id in job_id_list:
        com_size_tensor.append(job_id_to_com_size_dict[job_id])
    com_size_tensor = torch.unsqueeze(torch.tensor(com_size_tensor), 1) / 7
    
        
    return person_tensor, job_tensor, answer_tensor, com_size_tensor

In [13]:
dataloader = DataLoader(dataset_train, 
                        batch_size=12, 
                        shuffle=True, 
                        num_workers=4, 
                        collate_fn=collate_function)

In [53]:
count = [0 for i in range(20)]
count_all = [0 for i in range(20)]

for i, (p_tensor, j_tensor, a_tensor, c_tensor) in tqdm(enumerate(dataloader)):
    match_t = torch.matmul(p_tensor, j_tensor.T)
    match_num = [match_t[i][i] for i in range(len(match_t))]
    
    for m, c in zip(match_num, c_tensor):
        count[int(m)] += c.numpy()[0]
        count_all[int(m)] += 1
    

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [54]:
count_all

[1633, 1926, 1325, 711, 264, 92, 41, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [55]:
count

[22, 44, 46, 32, 16, 10, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [56]:
sum(count) / sum(count_all)

0.029