In [33]:
import pandas as pd
import re

# Locations of the two input CSV files
file_path1 = '/Users/jhkim97/Downloads/translated_category_event_id.csv'
file_path2 = '/Users/jhkim97/Downloads/job_translated.csv'

# Load both CSVs, in the order listed above
df1, df2 = (pd.read_csv(path) for path in (file_path1, file_path2))

# Put the two frames side by side; rows are matched on their index labels
combined_df = pd.concat([df1, df2], axis="columns")

# Write the merged frame to a new CSV, leaving the index out of the file
combined_df.to_csv('/Users/jhkim97/Downloads/job_final.csv', index=False)

# Location of the merged CSV written above
csv_file_path = '/Users/jhkim97/Downloads/job_final.csv'

# Read the merged CSV back from disk
df = pd.read_csv(csv_file_path)

# Clean a separate copy so that `df` keeps the data exactly as loaded
cleaned_df = df.copy(deep=True)

# Strip special characters with a regular expression
def clean_text(text):
    """Remove special characters from a single cell value.

    Args:
        text: The cell value. Only ``str`` values are modified.

    Returns:
        For a string: the same text with every character removed that is not
        an ASCII letter, an ASCII digit, a Hangul syllable (가-힣) or
        whitespace. Any other value is returned unchanged.
    """
    # Non-string values (numbers, NaN, None, ...) pass through untouched
    if not isinstance(text, str):
        return text
    return re.sub(r'[^a-zA-Z0-9가-힣\s]', '', text)

# Run clean_text over every cell, one column at a time
for col_name in list(cleaned_df.columns):
    cleaned_df[col_name] = cleaned_df[col_name].apply(clean_text)


# Show the cleaned frame as the cell output
cleaned_df

Unnamed: 0,event_id,category,preferred,requirement,tasks
0,70,Marketing Advertising,If I find a trend that I didnt know I cant sl...,Content marketing and planning experience,Content marketing work
1,71,Sales,Who has excellent communication skills,More than 2 years of ITSI sales experience an...,Technology Sales for Atlasian Products
2,72,Sales,Has a strong understanding of WAS APMEUMMPM s...,B2B or B2G solution sales or IT solution pres...,It acts as an account manager for public and ...
3,73,Management Business,Those who have effective communication skills...,Those who have at least five years of busines...,Develop a growth strategy market research com...
4,74,Sales,I wish he were like this,Im looking for someone like this,Lets do this together
...,...,...,...,...,...
94,164,Sales,Experienced digital online marketing person,Academic background College graduation fourye...,overseas salesmarketing
95,165,Sales,,More than 3 years of experience in sales pref...,Find new customers
96,166,Management Business,1 a person with foreign language English Japan...,1 IT business and solution strategic planning ...,1 Ecommerce solution strategy and business pla...
97,167,Management Business,If you have experience working on publicpriva...,More than 34 years of relevant work experience,Planning and drafting proposals for governmen...


In [40]:
import numpy as np
import pandas as pd
import torch
from scipy.spatial.distance import cosine
from transformers import AutoTokenizer, AutoModel

# One (portfolio_text, [(job_index, similarity), ...]) entry is appended per portfolio row.
# job_index is the positional row number of the job within `data`.
top_10_similarities = []

# Job postings: the cleaned frame built earlier in the notebook.
# `data` is the same object as `cleaned_df` (no copy), so the fills below show up under both names.
data = cleaned_df
# Missing free-text cells are replaced with the placeholder "hello" before tokenizing.
# The filled column is assigned back instead of calling fillna(..., inplace=True) on
# data['preferred']: that chained in-place form no longer updates the frame under
# pandas Copy-on-Write.
for text_column in ('requirement', 'preferred', 'tasks'):
    data[text_column] = data[text_column].fillna("hello")

# IT portfolio rows to match against the job postings
it_portfolio_df = pd.read_csv('/Users/jhkim97/Downloads/userid_1_filtered_correct_data.csv')

# Load the tokenizer and the model
model_name = "jjzha/jobspanbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def embed_text(text):
    """Embed one text with the module-level `tokenizer` and `model`.

    Args:
        text: The string to embed.

    Returns:
        The slice ``last_hidden_state[:, 0, :]`` of the model output, i.e. the
        hidden state at the first token position, as a torch tensor.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip autograd bookkeeping so no graph is kept alive
    with torch.no_grad():
        return model(**inputs).last_hidden_state[:, 0, :]


# The job embeddings do not depend on the portfolio, so they are computed once here
# rather than once per portfolio row.
job_vectors = []
for _, job_row in data.iterrows():
    # Weighted combination of the four field embeddings. The weights sum to 2.0 and the
    # result is divided by 5; cosine similarity ignores this overall scale.
    weighted_combined_embedding = (
        0.4 * embed_text(job_row['requirement'])
        + 0.3 * embed_text(job_row['preferred'])
        + 0.6 * embed_text(job_row['tasks'])
        # category is passed through str() before tokenizing
        + 0.7 * embed_text(str(job_row['category']))
    ) / 5
    job_vectors.append(weighted_combined_embedding.numpy().flatten())

# event_id of each job, in the same row order as job_vectors; used only for reporting
event_ids = data['event_id'].tolist()

# Score every job against each portfolio row
for portfolio_number, (_, row) in enumerate(it_portfolio_df.iterrows(), start=1):
    role_text = row['Role']
    project_title_text = row['ProjectTitle']
    technology_text = row['Technology']

    # Merge the three portfolio fields into a single text
    it_portfolio_text = f"{role_text}. {project_title_text}. {technology_text}"
    it_portfolio_vector = embed_text(it_portfolio_text).numpy().flatten()

    # scipy's `cosine` is a distance, so similarity = 1 - distance
    similarities = [1 - cosine(it_portfolio_vector, job_vector) for job_vector in job_vectors]

    # Keep the 10 most similar jobs
    top_10_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:10]
    top_10_jobs = [(job_index, similarities[job_index]) for job_index in top_10_indices]
    top_10_similarities.append((it_portfolio_text, top_10_jobs))

    # Report this portfolio's matches, labelled with each job's event_id
    print(f"IT Portfolio {portfolio_number}:")
    for job_index, similarity in top_10_jobs:
        print(f"  - Job {event_ids[job_index]}, Similarity: {similarity:.4f}")


Some weights of the model checkpoint at jjzha/jobspanbert-base-cased were not used when initializing BertModel: ['cls.span_predictions.query_end_transform.dense.bias', 'cls.span_predictions.start_transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.span_predictions.end_classifier', 'cls.span_predictions.start_transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.span_predictions.end_transform.LayerNorm.weight', 'cls.span_predictions.end_transform.LayerNorm.bias', 'cls.span_predictions.end_transform.dense.weight', 'cls.span_predictions.query_end_transform.LayerNorm.weight', 'cls.span_predictions.query_start_transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.span_predictions.query_end_transform.dense.weight', 'cls.span_predictions.query_start_transform.dense.weight', 'cls.span_predictions.start_transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.span_predictions.start_classifier', 'cls.predictions.transform.dense.weight', 'cls.s

IT Portfolio 1:
  - Job 98, Similarity: 0.7451
  - Job 125, Similarity: 0.7379
  - Job 86, Similarity: 0.7350
  - Job 78, Similarity: 0.7327
  - Job 95, Similarity: 0.7316
  - Job 154, Similarity: 0.7303
  - Job 138, Similarity: 0.7302
  - Job 101, Similarity: 0.7276
  - Job 133, Similarity: 0.7271
  - Job 139, Similarity: 0.7252
