In [None]:
import requests
import json
from pprint import pp
# Import necessary libraries
## 設定 OpenAI API Key 變數
from dotenv import load_dotenv
import os
from tqdm import tqdm
import pandas as pd

# Populate the process environment from a local .env file, if one is present.
load_dotenv()

# Look up the OpenAI API key; the result is None when the variable is not set.
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [13]:
# Request an embedding vector for one piece of text from the OpenAI embeddings endpoint.
def get_embeddings(text, dimensions = 256, model="text-embedding-3-small", timeout=30):
    """Fetch the embedding of ``text`` from the OpenAI embeddings API.

    Args:
        text: The input string sent as the ``input`` field of the request.
        dimensions: Value sent as the ``dimensions`` field of the request.
        model: Name of the embedding model sent as the ``model`` field.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled connection cannot block the notebook forever.

    Returns:
        The ``data[0].embedding`` value of the JSON response on HTTP 200,
        otherwise ``None`` (non-200 status or a network-level failure). In
        both failure cases a diagnostic line is printed.
    """
    url = "https://api.openai.com/v1/embeddings"
    payload = {
        "input": text,
        "model": model,
        "dimensions": dimensions,
    }
    headers = {
        # Uses the module-level key loaded from the environment.
        "Authorization": f'Bearer {openai_api_key}',
        "Content-Type": "application/json",
    }

    try:
        response = requests.post(
            url, headers=headers, data=json.dumps(payload), timeout=timeout
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as HTTP
        # errors, so one bad request does not abort a long batch run.
        print(f"錯誤（連線失敗）: {exc}")
        return None

    if response.status_code == 200:
        return json.loads(response.text)["data"][0]["embedding"]

    print(f"錯誤（status {response.status_code}）: {response.text}")
    return None

In [11]:
# Load the CSV, drop the "工作技能" column, then print a summary of the frame.
# NOTE(review): the path is an absolute Windows path, so it presumably only
# resolves on the original author's machine - confirm before sharing.
df = (
    pd.read_csv(r"C:\Users\user\Desktop\期末專題\prompt提取技能\104_skills_1140721.csv")
    .drop(columns="工作技能")
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 65 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   job_id  5 non-null      object 
 1   更新日期    5 non-null      object 
 2   查詢職類    5 non-null      int64  
 3   查詢關鍵字   5 non-null      object 
 4   職務類別    5 non-null      object 
 5   職缺名稱    5 non-null      object 
 6   公司名稱    5 non-null      object 
 7   公司連結    5 non-null      object 
 8   產業別     5 non-null      object 
 9   上班地點    5 non-null      object 
 10  地區      5 non-null      object 
 11  城市      5 non-null      object 
 12  國家      5 non-null      object 
 13  薪資      5 non-null      object 
 14  薪資下限    5 non-null      int64  
 15  薪資上限    5 non-null      int64  
 16  職缺描述    5 non-null      object 
 17  職務需求    0 non-null      float64
 18  工作經歷要求  5 non-null      object 
 19  學歷要求    5 non-null      object 
 20  科系要求    1 non-null      object 
 21  擅長工具    4 non-null      object 
 22  中文聽力  

In [14]:
# Columns to embed: names containing "技能", skipping the "...向量" columns
# that this cell itself creates (so re-running does not embed vectors).
skill_cols = [col for col in df.columns if "技能" in col and "向量" not in col]

# Successful embeddings keyed by skill text. The same skill text can appear in
# many rows and columns; caching avoids paying for a repeated API call.
_embedding_cache = {}


def _embed_skill(value):
    """Return the embedding for one skill cell, or None if blank or the request failed."""
    # str() guards against non-string cells, which would otherwise raise on .strip().
    text = str(value).strip()
    if not text:
        return None
    if text not in _embedding_cache:
        vector = get_embeddings(text)
        if vector is None:
            # Failures are not cached, so a later occurrence retries the request.
            return None
        _embedding_cache[text] = vector
    # Return a copy so cells never share one mutable list object.
    return list(_embedding_cache[text])


for col in tqdm(skill_cols):
    print(f"▶️ 處理欄位：{col}")
    vec_col = f"{col}向量"  # name of the matching vector column
    df[vec_col] = df[col].fillna("").apply(_embed_skill)

  0%|          | 0/20 [00:00<?, ?it/s]

▶️ 處理欄位：技能1


  5%|▌         | 1/20 [00:04<01:32,  4.89s/it]

▶️ 處理欄位：技能2


 10%|█         | 2/20 [00:12<01:51,  6.21s/it]

▶️ 處理欄位：技能3


 15%|█▌        | 3/20 [00:15<01:21,  4.82s/it]

▶️ 處理欄位：技能4


 20%|██        | 4/20 [00:19<01:11,  4.47s/it]

▶️ 處理欄位：技能5


 25%|██▌       | 5/20 [00:23<01:04,  4.31s/it]

▶️ 處理欄位：技能6


 30%|███       | 6/20 [00:25<00:51,  3.71s/it]

▶️ 處理欄位：技能7


 35%|███▌      | 7/20 [00:29<00:47,  3.64s/it]

▶️ 處理欄位：技能8


 40%|████      | 8/20 [00:32<00:42,  3.51s/it]

▶️ 處理欄位：技能9


 45%|████▌     | 9/20 [00:34<00:33,  3.06s/it]

▶️ 處理欄位：技能10


 50%|█████     | 10/20 [00:37<00:28,  2.89s/it]

▶️ 處理欄位：技能11


 55%|█████▌    | 11/20 [00:38<00:22,  2.47s/it]

▶️ 處理欄位：技能12


 60%|██████    | 12/20 [00:40<00:19,  2.43s/it]

▶️ 處理欄位：技能13


 65%|██████▌   | 13/20 [00:41<00:13,  1.92s/it]

▶️ 處理欄位：技能14


 70%|███████   | 14/20 [00:42<00:09,  1.57s/it]

▶️ 處理欄位：技能15


 75%|███████▌  | 15/20 [00:44<00:08,  1.61s/it]

▶️ 處理欄位：技能16


 80%|████████  | 16/20 [00:44<00:04,  1.25s/it]

▶️ 處理欄位：技能17


 85%|████████▌ | 17/20 [00:44<00:03,  1.01s/it]

▶️ 處理欄位：技能18


 90%|█████████ | 18/20 [00:45<00:01,  1.16it/s]

▶️ 處理欄位：技能19


 95%|█████████▌| 19/20 [00:45<00:00,  1.33it/s]

▶️ 處理欄位：技能20


100%|██████████| 20/20 [00:46<00:00,  2.32s/it]


In [18]:
# Print the frame summary again to inspect the columns after the embedding step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 85 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   job_id  5 non-null      object 
 1   更新日期    5 non-null      object 
 2   查詢職類    5 non-null      int64  
 3   查詢關鍵字   5 non-null      object 
 4   職務類別    5 non-null      object 
 5   職缺名稱    5 non-null      object 
 6   公司名稱    5 non-null      object 
 7   公司連結    5 non-null      object 
 8   產業別     5 non-null      object 
 9   上班地點    5 non-null      object 
 10  地區      5 non-null      object 
 11  城市      5 non-null      object 
 12  國家      5 non-null      object 
 13  薪資      5 non-null      object 
 14  薪資下限    5 non-null      int64  
 15  薪資上限    5 non-null      int64  
 16  職缺描述    5 non-null      object 
 17  職務需求    0 non-null      float64
 18  工作經歷要求  5 non-null      object 
 19  學歷要求    5 non-null      object 
 20  科系要求    1 non-null      object 
 21  擅長工具    4 non-null      object 
 22  中文聽力  

In [22]:

# Names of the per-skill vector columns: they start with "技能" and end with "向量".
skill_vector_cols = [
    name
    for name in df.columns
    if name.startswith("技能") and name.endswith("向量")
]

# Build, for a single row, the list of its skills that have an embedding.
def collect_course_skill_vectors(row):
    """Collect the (skill name, vector) pairs of one row.

    For every column name in the module-level ``skill_vector_cols``, the
    matching skill-name column is found by removing "向量" from the name.
    A pair is kept only when the vector is a ``list`` and the skill name is
    truthy.

    Args:
        row: A mapping-like object supporting ``.get`` (e.g. a DataFrame row).

    Returns:
        A list of ``{"技能名": <name>, "向量": <vector>}`` dicts, in the order
        of ``skill_vector_cols``.
    """
    pairs = (
        (row.get(vector_key.replace("向量", "")), row.get(vector_key))
        for vector_key in skill_vector_cols
    )
    return [
        {"技能名": label, "向量": embedding}
        for label, embedding in pairs
        if isinstance(embedding, list) and label
    ]

# Run the collector on every row (axis=1) and store each row's list in a new column.
df["職缺技能向量清單"] = df.apply(collect_course_skill_vectors, axis=1)
# Print the frame summary to check the newly added column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 86 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   job_id    5 non-null      object 
 1   更新日期      5 non-null      object 
 2   查詢職類      5 non-null      int64  
 3   查詢關鍵字     5 non-null      object 
 4   職務類別      5 non-null      object 
 5   職缺名稱      5 non-null      object 
 6   公司名稱      5 non-null      object 
 7   公司連結      5 non-null      object 
 8   產業別       5 non-null      object 
 9   上班地點      5 non-null      object 
 10  地區        5 non-null      object 
 11  城市        5 non-null      object 
 12  國家        5 non-null      object 
 13  薪資        5 non-null      object 
 14  薪資下限      5 non-null      int64  
 15  薪資上限      5 non-null      int64  
 16  職缺描述      5 non-null      object 
 17  職務需求      0 non-null      float64
 18  工作經歷要求    5 non-null      object 
 19  學歷要求      5 non-null      object 
 20  科系要求      1 non-null      object 
 2

In [24]:
# Write the frame to a CSV in the current working directory, without the index.
# NOTE(review): list-valued vector columns are presumably written as text -
# confirm how downstream readers parse them back.
df.to_csv("104_skills_embedding_1140721.csv", index=False, encoding="utf-8-sig")