In [2]:
import requests
import json
from pprint import pp
# Import necessary libraries
# Set up the OpenAI API key variable (dotenv / os are used below to read it from .env)
from dotenv import load_dotenv
import os
from tqdm import tqdm
import pandas as pd

# Load the environment variables from .env file
# (load_dotenv comes from python-dotenv; it is called with its default arguments.)
load_dotenv()

# Access the API key
# os.getenv returns None when OPENAI_API_KEY is not set; get_embeddings would then
# send the literal header "Bearer None" — verify the key is present before running
# the embedding cells.
openai_api_key = os.getenv('OPENAI_API_KEY')

In [3]:
# Embedding helper: thin wrapper around the OpenAI embeddings REST endpoint.
def get_embeddings(text, dimensions=256, model="text-embedding-3-small", timeout=30):
    """Request an embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: Input string sent as the ``input`` field of the request.
        dimensions: Value sent as the ``dimensions`` field of the request.
        model: Value sent as the ``model`` field of the request.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a stalled
            connection cannot block the notebook indefinitely.

    Returns:
        The ``data[0].embedding`` value from the JSON response on HTTP 200,
        otherwise ``None`` (after printing the error).
    """
    url = "https://api.openai.com/v1/embeddings"
    payload = {
        "input": text,
        "model": model,
        "dimensions": dimensions,
    }
    headers = {
        "Authorization": f'Bearer {openai_api_key}',
        "Content-Type": "application/json",
    }

    try:
        response = requests.post(
            url, headers=headers, data=json.dumps(payload), timeout=timeout
        )
    except requests.RequestException as exc:
        # Network failures (timeout, DNS, connection reset) follow the same
        # "print and return None" contract as a non-200 response, so one bad
        # request does not abort a long-running embedding loop.
        print(f"錯誤（request failed）: {exc}")
        return None

    if response.status_code != 200:
        print(f"錯誤（status {response.status_code}）: {response.text}")
        return None

    return json.loads(response.text)["data"][0]["embedding"]

In [10]:
# Load the Coursera skills CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where that file exists — consider a configurable data directory.
df = pd.read_csv(r"C:\Users\user\Desktop\期末專題\prompt提取技能\coursera_skills_1140721.csv")
# df = df .drop(columns="技能")
# Print column names, non-null counts and dtypes as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 56 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   課程名稱      100 non-null    object
 1   評分        68 non-null     object
 2   評論數       68 non-null     object
 3   Metadata  100 non-null    object
 4   課程網址      100 non-null    object
 5   課程        72 non-null     object
 6   技能        98 non-null     object
 7   課程資訊      89 non-null     object
 8   師資        92 non-null     object
 9   開課時間      89 non-null     object
 10  建議學習時間    94 non-null     object
 11  學習時長      76 non-null     object
 12  課程相關總描述   100 non-null    object
 13  技能1       100 non-null    object
 14  技能2       99 non-null     object
 15  技能3       99 non-null     object
 16  技能4       99 non-null     object
 17  技能5       99 non-null     object
 18  技能6       96 non-null     object
 19  技能7       91 non-null     object
 20  技能8       87 non-null     object
 21  技能9       84 non-

In [11]:
# Source skill columns: every column whose name contains "技能", excluding any
# column whose name contains "向量" (the generated vector columns), so re-running
# this cell does not try to embed its own output.
skill_cols = [col for col in df.columns if "技能" in col and "向量" not in col]

# Successful embeddings keyed by the stripped skill text. The same text can
# appear in many rows and columns; caching avoids sending one HTTP request per
# repeated cell. Failures are not cached, so a later occurrence is retried.
_skill_embedding_cache = {}


def _embed_skill(value):
    """Return the embedding for one skill cell, or None for blank/non-text cells."""
    # Non-string cells are treated as blank instead of raising on .strip().
    text = value.strip() if isinstance(value, str) else ""
    if not text:
        return None
    if text not in _skill_embedding_cache:
        embedding = get_embeddings(text)
        if embedding is None:
            return None
        _skill_embedding_cache[text] = embedding
    # Return a copy so DataFrame cells never share one mutable list.
    return list(_skill_embedding_cache[text])


for col in tqdm(skill_cols):
    print(f"▶️ 處理欄位：{col}")
    vec_col = f"{col}向量"  # name of the matching vector column
    df[vec_col] = df[col].fillna("").apply(_embed_skill)

  0%|          | 0/44 [00:00<?, ?it/s]

▶️ 處理欄位：技能


  2%|▏         | 1/44 [00:55<40:07, 55.99s/it]

▶️ 處理欄位：技能1


  5%|▍         | 2/44 [01:41<35:03, 50.07s/it]

▶️ 處理欄位：技能2


  7%|▋         | 3/44 [02:34<34:56, 51.14s/it]

▶️ 處理欄位：技能3


  9%|▉         | 4/44 [03:21<33:06, 49.67s/it]

▶️ 處理欄位：技能4


 11%|█▏        | 5/44 [04:10<32:07, 49.41s/it]

▶️ 處理欄位：技能5


 14%|█▎        | 6/44 [04:57<30:44, 48.55s/it]

▶️ 處理欄位：技能6


 16%|█▌        | 7/44 [05:43<29:24, 47.68s/it]

▶️ 處理欄位：技能7


 18%|█▊        | 8/44 [06:36<29:39, 49.42s/it]

▶️ 處理欄位：技能8


 20%|██        | 9/44 [07:23<28:16, 48.48s/it]

▶️ 處理欄位：技能9


 23%|██▎       | 10/44 [08:05<26:21, 46.51s/it]

▶️ 處理欄位：技能10


 25%|██▌       | 11/44 [10:59<47:04, 85.60s/it]

▶️ 處理欄位：技能11


 27%|██▋       | 12/44 [11:41<38:35, 72.35s/it]

▶️ 處理欄位：技能12


 30%|██▉       | 13/44 [12:16<31:29, 60.95s/it]

▶️ 處理欄位：技能13


 32%|███▏      | 14/44 [12:42<25:16, 50.54s/it]

▶️ 處理欄位：技能14


 34%|███▍      | 15/44 [13:06<20:33, 42.55s/it]

▶️ 處理欄位：技能15


 36%|███▋      | 16/44 [13:25<16:32, 35.43s/it]

▶️ 處理欄位：技能16


 39%|███▊      | 17/44 [13:38<12:50, 28.55s/it]

▶️ 處理欄位：技能17


 41%|████      | 18/44 [13:49<10:11, 23.51s/it]

▶️ 處理欄位：技能18


 43%|████▎     | 19/44 [14:01<08:21, 20.06s/it]

▶️ 處理欄位：技能19


 45%|████▌     | 20/44 [14:11<06:46, 16.93s/it]

▶️ 處理欄位：技能20


 48%|████▊     | 21/44 [14:19<05:24, 14.12s/it]

▶️ 處理欄位：技能21


 50%|█████     | 22/44 [14:25<04:17, 11.69s/it]

▶️ 處理欄位：技能22


 52%|█████▏    | 23/44 [14:30<03:28,  9.94s/it]

▶️ 處理欄位：技能23


 55%|█████▍    | 24/44 [14:37<02:56,  8.81s/it]

▶️ 處理欄位：技能24


 57%|█████▋    | 25/44 [14:41<02:24,  7.62s/it]

▶️ 處理欄位：技能25


 59%|█████▉    | 26/44 [14:49<02:14,  7.49s/it]

▶️ 處理欄位：技能26


 61%|██████▏   | 27/44 [14:53<01:51,  6.56s/it]

▶️ 處理欄位：技能27


 64%|██████▎   | 28/44 [14:57<01:32,  5.80s/it]

▶️ 處理欄位：技能28


 66%|██████▌   | 29/44 [15:02<01:21,  5.46s/it]

▶️ 處理欄位：技能29


 68%|██████▊   | 30/44 [15:08<01:18,  5.64s/it]

▶️ 處理欄位：技能30


 70%|███████   | 31/44 [15:13<01:10,  5.40s/it]

▶️ 處理欄位：技能31


 73%|███████▎  | 32/44 [15:15<00:55,  4.59s/it]

▶️ 處理欄位：技能32


 75%|███████▌  | 33/44 [15:18<00:44,  4.00s/it]

▶️ 處理欄位：技能33


 77%|███████▋  | 34/44 [15:20<00:34,  3.47s/it]

▶️ 處理欄位：技能34


 80%|███████▉  | 35/44 [15:22<00:25,  2.88s/it]

▶️ 處理欄位：技能35


 82%|████████▏ | 36/44 [15:23<00:19,  2.50s/it]

▶️ 處理欄位：技能36


 84%|████████▍ | 37/44 [15:25<00:15,  2.24s/it]

▶️ 處理欄位：技能37


 86%|████████▋ | 38/44 [15:27<00:12,  2.14s/it]

▶️ 處理欄位：技能38


 89%|████████▊ | 39/44 [15:32<00:14,  2.95s/it]

▶️ 處理欄位：技能39


 91%|█████████ | 40/44 [15:33<00:09,  2.32s/it]

▶️ 處理欄位：技能40


 93%|█████████▎| 41/44 [15:33<00:05,  1.84s/it]

▶️ 處理欄位：技能41


 95%|█████████▌| 42/44 [15:35<00:03,  1.70s/it]

▶️ 處理欄位：技能42


 98%|█████████▊| 43/44 [15:35<00:01,  1.40s/it]

▶️ 處理欄位：技能43


100%|██████████| 44/44 [15:36<00:00, 21.28s/it]


In [13]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 100 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   課程名稱      100 non-null    object
 1   評分        68 non-null     object
 2   評論數       68 non-null     object
 3   Metadata  100 non-null    object
 4   課程網址      100 non-null    object
 5   課程        72 non-null     object
 6   技能        98 non-null     object
 7   課程資訊      89 non-null     object
 8   師資        92 non-null     object
 9   開課時間      89 non-null     object
 10  建議學習時間    94 non-null     object
 11  學習時長      76 non-null     object
 12  課程相關總描述   100 non-null    object
 13  技能1       100 non-null    object
 14  技能2       99 non-null     object
 15  技能3       99 non-null     object
 16  技能4       99 non-null     object
 17  技能5       99 non-null     object
 18  技能6       96 non-null     object
 19  技能7       91 non-null     object
 20  技能8       87 non-null     object
 21  技能9       84 non

In [None]:
# Gather each course's skill vectors into a single list-valued column.

# Vector columns are the ones named "技能…向量".
skill_vector_cols = [
    name
    for name in df.columns
    if name.startswith("技能") and name.endswith("向量")
]


# Build the per-course list.
def collect_course_skill_vectors(row):
    """Return a list of {"技能名": ..., "向量": ...} dicts for one course row.

    For every column in ``skill_vector_cols`` the matching skill-name column is
    found by removing "向量" from the column name. An entry is added only when
    the vector cell is a ``list`` and the skill name is truthy.
    """
    collected = []
    for vector_column in skill_vector_cols:
        embedding = row.get(vector_column)
        if not isinstance(embedding, list):
            continue  # no valid vector stored for this skill slot
        label = row.get(vector_column.replace("向量", ""))
        if not label:
            continue
        collected.append({"技能名": label, "向量": embedding})
    return collected


df["課程技能向量清單"] = df.apply(collect_course_skill_vectors, axis=1)

['課程名稱', '評分', '評論數', 'Metadata', '課程網址', '課程', '技能', '課程資訊', '師資', '開課時間', '建議學習時間', '學習時長', '課程相關總描述', '技能1', '技能2', '技能3', '技能4', '技能5', '技能6', '技能7', '技能8', '技能9', '技能10', '技能11', '技能12', '技能13', '技能14', '技能15', '技能16', '技能17', '技能18', '技能19', '技能20', '技能21', '技能22', '技能23', '技能24', '技能25', '技能26', '技能27', '技能28', '技能29', '技能30', '技能31', '技能32', '技能33', '技能34', '技能35', '技能36', '技能37', '技能38', '技能39', '技能40', '技能41', '技能42', '技能43', '技能向量', '技能1向量', '技能2向量', '技能3向量', '技能4向量', '技能5向量', '技能6向量', '技能7向量', '技能8向量', '技能9向量', '技能10向量', '技能11向量', '技能12向量', '技能13向量', '技能14向量', '技能15向量', '技能16向量', '技能17向量', '技能18向量', '技能19向量', '技能20向量', '技能21向量', '技能22向量', '技能23向量', '技能24向量', '技能25向量', '技能26向量', '技能27向量', '技能28向量', '技能29向量', '技能30向量', '技能31向量', '技能32向量', '技能33向量', '技能34向量', '技能35向量', '技能36向量', '技能37向量', '技能38向量', '技能39向量', '技能40向量', '技能41向量', '技能42向量', '技能43向量', '課程技能向量清單']


In [21]:
# Write the current DataFrame to a CSV file in the working directory, without the
# index column. "utf-8-sig" prefixes the file with a UTF-8 byte-order mark.
# NOTE(review): cells holding Python lists (the embedding vectors) are written as
# text, so reading this file back with pd.read_csv yields strings, not lists —
# parse them on reload if the vectors are needed again.
df.to_csv("coursera_skills_embedding_1140721.csv", index=False, encoding="utf-8-sig")

In [15]:
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    Args:
        a: Array-like numeric vector.
        b: Array-like numeric vector with the same length as ``a``.

    Returns:
        ``dot(a, b) / (‖a‖ · ‖b‖)``, or ``0.0`` when either vector has zero
        norm (the ratio is undefined there and would otherwise evaluate to NaN
        with a RuntimeWarning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [None]:
def search_similar(df, query, n=3, column="embeddings"):
    """Return the ``n`` rows of ``df`` whose embedding is closest to ``query``.

    Args:
        df: DataFrame with one embedding vector per row in ``column``.
        query: Text to embed with ``get_embeddings`` and compare against.
        n: Number of rows to return.
        column: Name of the column holding the row embeddings. Defaults to
            "embeddings", the name the original implementation hard-coded.

    Returns:
        A new DataFrame with an added "similarity" column, sorted by descending
        similarity and truncated to ``n`` rows. ``df`` itself is not modified.
        Rows without an embedding get NaN similarity and therefore sort last.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
        RuntimeError: If the query embedding could not be obtained.
    """
    if column not in df.columns:
        raise KeyError(f"embedding column {column!r} not found in DataFrame")

    query_embedding = get_embeddings(query)
    if query_embedding is None:
        # get_embeddings already printed the API error; stop here instead of
        # failing later inside the vector arithmetic.
        raise RuntimeError("could not obtain an embedding for the query")

    def _score(vector):
        # Missing embeddings (None or NaN cells) cannot be compared.
        if vector is None or (isinstance(vector, float) and vector != vector):
            return float("nan")
        return cosine_similarity(vector, query_embedding)

    # assign() builds a new frame, so the caller's DataFrame is left untouched.
    scored = df.assign(similarity=df[column].apply(_score))
    return scored.sort_values("similarity", ascending=False).head(n)


# Find the courses most similar to the user's question.
# NOTE(review): the frame built in this notebook has no "embeddings" column;
# "技能向量" (the embedding of the "技能" column) is used here as the per-course
# vector — confirm this is the intended column.
results = search_similar(df, "怎樣用 Python 做資料分析", n=5, column="技能向量")
results
