In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", None)
pd.set_option("display.width", 120)


In [4]:
DATA_DIR = Path("") 

occupation_df = pd.read_csv(DATA_DIR / "Occupation Data.txt", sep="\t")
skills_df = pd.read_csv(DATA_DIR / "Skills.txt", sep="\t")
knowledge_df = pd.read_csv(DATA_DIR / "Knowledge.txt", sep="\t")
tech_df = pd.read_csv(DATA_DIR / "Technology Skills.txt", sep="\t")


In [5]:
occupation_df.head()


Unnamed: 0,O*NET-SOC Code,Title,Description
0,11-1011.00,Chief Executives,Determine and formulate policies and provide o...
1,11-1011.03,Chief Sustainability Officers,"Communicate and coordinate with management, sh..."
2,11-1021.00,General and Operations Managers,"Plan, direct, or coordinate the operations of ..."
3,11-1031.00,Legislators,"Develop, introduce, or enact laws and statutes..."
4,11-2011.00,Advertising and Promotions Managers,"Plan, direct, or coordinate advertising polici..."


In [6]:
skills_df.head()


Unnamed: 0,O*NET-SOC Code,Element ID,Element Name,Scale ID,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Not Relevant,Date,Domain Source
0,11-1011.00,2.A.1.a,Reading Comprehension,IM,4.12,8,0.125,3.88,4.37,N,,08/2023,Analyst
1,11-1011.00,2.A.1.a,Reading Comprehension,LV,4.62,8,0.183,4.2664,4.9836,N,N,08/2023,Analyst
2,11-1011.00,2.A.1.b,Active Listening,IM,4.0,8,0.0,4.0,4.0,N,,08/2023,Analyst
3,11-1011.00,2.A.1.b,Active Listening,LV,4.75,8,0.1637,4.4292,5.0708,N,N,08/2023,Analyst
4,11-1011.00,2.A.1.c,Writing,IM,4.12,8,0.125,3.88,4.37,N,,08/2023,Analyst


In [7]:
knowledge_df.head()


Unnamed: 0,O*NET-SOC Code,Element ID,Element Name,Scale ID,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Not Relevant,Date,Domain Source
0,11-1011.00,2.C.1.a,Administration and Management,IM,4.78,28.0,0.1102,4.5564,5.0,N,,08/2023,Incumbent
1,11-1011.00,2.C.1.a,Administration and Management,LV,6.5,28.0,0.213,6.0666,6.9409,N,N,08/2023,Incumbent
2,11-1011.00,2.C.1.b,Administrative,IM,2.42,28.0,0.4651,1.4662,3.3749,N,,08/2023,Incumbent
3,11-1011.00,2.C.1.b,Administrative,LV,2.69,28.0,0.8678,0.9078,4.469,N,N,08/2023,Incumbent
4,11-1011.00,2.C.1.c,Economics and Accounting,IM,4.04,28.0,0.348,3.3246,4.7526,N,,08/2023,Incumbent


In [8]:
tech_df.head()


Unnamed: 0,O*NET-SOC Code,Example,Commodity Code,Commodity Title,Hot Technology,In Demand
0,11-1011.00,Adobe Acrobat,43232202,Document management software,Y,N
1,11-1011.00,AdSense Tracker,43232306,Data base user interface and query software,N,N
2,11-1011.00,Atlassian JIRA,43232201,Content workflow software,Y,N
3,11-1011.00,Blackbaud The Raiser's Edge,43232303,Customer relationship management CRM software,N,N
4,11-1011.00,ComputerEase construction accounting software,43231601,Accounting software,N,N


In [9]:
def clean_columns(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace("*", "", regex=False)
        .str.replace(" ", "_")
    )
    return df

occupation_df = clean_columns(occupation_df)
skills_df = clean_columns(skills_df)
knowledge_df = clean_columns(knowledge_df)
tech_df = clean_columns(tech_df)


In [11]:
occupation_df = occupation_df[
    ["onet-soc_code", "title", "description"]
]

skills_df = skills_df[
    ["onet-soc_code", "element_name", "scale_id", "data_value"]
]

knowledge_df = knowledge_df[
    ["onet-soc_code", "element_name", "scale_id", "data_value"]
]

tech_df = tech_df[
    ["onet-soc_code", "example", "commodity_title"]
]


In [12]:
skills_im = skills_df[skills_df["scale_id"] == "IM"]
knowledge_im = knowledge_df[knowledge_df["scale_id"] == "IM"]


In [13]:
TOP_N = 8

top_skills = (
    skills_im
    .sort_values("data_value", ascending=False)
    .groupby("onet-soc_code")
    .head(TOP_N)
)

top_knowledge = (
    knowledge_im
    .sort_values("data_value", ascending=False)
    .groupby("onet-soc_code")
    .head(TOP_N)
)


In [15]:
skills_text = (
    top_skills
    .groupby("onet-soc_code")["element_name"]
    .apply(lambda x: ", ".join(x))
    .reset_index(name="skills_text")
)

knowledge_text = (
    top_knowledge
    .groupby("onet-soc_code")["element_name"]
    .apply(lambda x: ", ".join(x))
    .reset_index(name="knowledge_text")
)

tech_text = (
    tech_df
    .groupby("onet-soc_code")["example"]
    .apply(lambda x: ", ".join(x.dropna().unique()))
    .reset_index(name="tech_text")
)


In [16]:
final_df = (
    occupation_df
    .merge(skills_text, on="onet-soc_code", how="left")
    .merge(knowledge_text, on="onet-soc_code", how="left")
    .merge(tech_text, on="onet-soc_code", how="left")
)


In [17]:
def build_text(row):
    return f"""
    {row['description']}
    Key skills include: {row['skills_text']}.
    Knowledge areas include: {row['knowledge_text']}.
    Technologies include: {row['tech_text']}.
    """

final_df["text"] = final_df.apply(build_text, axis=1)


In [18]:
final_df[["title", "text"]].head()


Unnamed: 0,title,text
0,Chief Executives,\n Determine and formulate policies and pro...
1,Chief Sustainability Officers,\n Communicate and coordinate with manageme...
2,General and Operations Managers,"\n Plan, direct, or coordinate the operatio..."
3,Legislators,"\n Develop, introduce, or enact laws and st..."
4,Advertising and Promotions Managers,"\n Plan, direct, or coordinate advertising ..."


In [19]:
# Define mapping rules (extend as needed)
domain_mapping = {
    "Data Scientist": "Data Science",
    "Software Developers": "Software Engineering",
    "Machine Learning Engineers": "AI / ML Engineering",
    "Information Security Analysts": "Cybersecurity",
    "Marketing Managers": "Digital Marketing",
    "UX Designers": "UI/UX Design",
    "Product Managers": "Product Management",
    "Business Analysts": "Business Analytics",
    "DevOps Engineers": "Cloud / DevOps",
}

def map_domain(title):
    for key, val in domain_mapping.items():
        if key.lower() in str(title).lower():
            return val
    return None

final_df["career_domain"] = final_df["title"].apply(map_domain)
final_df = final_df.dropna(subset=["career_domain"])
print(final_df["career_domain"].value_counts())


career_domain
Digital Marketing       1
Cybersecurity           1
Software Engineering    1
Data Science            1
Name: count, dtype: int64


In [21]:
!pip install sentence-transformers scikit-learn




In [23]:
pip install tf-keras


Collecting tf-keras
  Downloading tf_keras-2.20.1-py3-none-any.whl.metadata (1.8 kB)
Downloading tf_keras-2.20.1-py3-none-any.whl (1.7 MB)
   ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
   ------------------------------------- -- 1.6/1.7 MB 10.9 MB/s eta 0:00:01
   ---------------------------------------- 1.7/1.7 MB 9.6 MB/s eta 0:00:00
Installing collected packages: tf-keras
Successfully installed tf-keras-2.20.1
Note: you may need to restart the kernel to use updated packages.


In [27]:
import sys, pkgutil, importlib, traceback
from importlib import metadata

def get_version(pkg):
    try:
        return metadata.version(pkg)
    except Exception:
        try:
            m = importlib.import_module(pkg)
            return getattr(m, "__version__", str(type(m)))
        except Exception:
            return None

pkgs = ["torch", "tensorflow", "keras", "tf_keras", "transformers", "sentence_transformers", "sentence-transformers", "huggingface_hub"]
info = {p: get_version(p) for p in pkgs}

print("Python:", sys.version.replace("\n"," "))
print()
print("Package versions (may be None if not installed):")
for k,v in info.items():
    print(f"  {k}: {v}")

for name in ("torch", "tensorflow"):
    try:
        m = importlib.import_module(name)
        if name == "torch":
            try:
                print("Torch device count:", m.cuda.device_count(), "cuda available:", m.cuda.is_available())
            except Exception as e:
                print("Torch import OK; cuda query failed:", e)
        else:
            print("TensorFlow version:", getattr(m, "__version__", "unknown"))
    except Exception as e:
        print(f"Could not import {name}: {e.__class__.__name__}: {e}")


Python: 3.13.5 | packaged by Anaconda, Inc. | (main, Jun 12 2025, 16:37:03) [MSC v.1929 64 bit (AMD64)]

Package versions (may be None if not installed):
  torch: 2.9.1
  tensorflow: 2.20.0
  keras: 3.12.0
  tf_keras: 2.20.1
  transformers: 4.57.6
  sentence_transformers: 5.2.0
  sentence-transformers: 5.2.0
  huggingface_hub: 0.36.0
Torch device count: 0 cuda available: False
TensorFlow version: 2.20.0


In [28]:
from sentence_transformers import SentenceTransformer
import traceback, time
model_name = "all-MiniLM-L6-v2"

try:
    t0 = time.time()
    model = SentenceTransformer(model_name)
    t1 = time.time()
    print(f"Loaded model {model_name} in {t1-t0:.2f}s")
    out = model.encode(["test sentence"], show_progress_bar=False)
    print("Encode test output shape / type:", type(out), getattr(out, "shape", None))
except Exception as e:
    print("Error while loading or encoding with SentenceTransformer:")
    traceback.print_exc()


Loaded model all-MiniLM-L6-v2 in 4.85s
Encode test output shape / type: <class 'numpy.ndarray'> (1, 384)


In [29]:
from math import ceil
import numpy as np
texts = final_df["text"].astype(str).tolist()  
batch_size = 32   

print("Total texts:", len(texts))
try:
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        emb = model.encode(batch, show_progress_bar=False, convert_to_numpy=True)
        embeddings.append(emb)
        print(f"Encoded {i}..{i+len(batch)-1} -> shape {emb.shape}")
    embeddings = np.vstack(embeddings)
    print("All embeddings stacked shape:", embeddings.shape)
except Exception as e:
    import traceback
    print("Error during batch encoding:")
    traceback.print_exc()


Total texts: 4
Encoded 0..3 -> shape (4, 384)
All embeddings stacked shape: (4, 384)


In [31]:
from sklearn.preprocessing import LabelEncoder

print("Columns available:", final_df.columns.tolist())

le = LabelEncoder()
y = le.fit_transform(final_df["career_domain"])

print("Encoded classes:", le.classes_)
print("Target array shape:", y.shape)

Columns available: ['onet-soc_code', 'title', 'description', 'skills_text', 'knowledge_text', 'tech_text', 'text', 'career_domain']
Encoded classes: ['Cybersecurity' 'Data Science' 'Digital Marketing' 'Software Engineering']
Target array shape: (4,)


In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression(max_iter=2000, multi_class='multinomial')
lr.fit(embeddings, y)
print(classification_report(y, lr.predict(embeddings), target_names=le.classes_))


                      precision    recall  f1-score   support

       Cybersecurity       1.00      1.00      1.00         1
        Data Science       1.00      1.00      1.00         1
   Digital Marketing       1.00      1.00      1.00         1
Software Engineering       1.00      1.00      1.00         1

            accuracy                           1.00         4
           macro avg       1.00      1.00      1.00         4
        weighted avg       1.00      1.00      1.00         4





In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

models = {
    "LogisticRegression": LogisticRegression(max_iter=2000, multi_class='multinomial'),
    "SVM (RBF Kernel)": SVC(kernel='rbf', probability=True),
    "RandomForest": RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=42)
}

results = []

for name, clf in models.items():
    clf.fit(embeddings, y)
    preds = clf.predict(embeddings)
    acc = accuracy_score(y, preds)
    f1 = f1_score(y, preds, average='macro')
    results.append((name, acc, f1))

import pandas as pd
pd.DataFrame(results, columns=["Model", "Accuracy", "F1_macro"])




Unnamed: 0,Model,Accuracy,F1_macro
0,LogisticRegression,1.0,1.0
1,SVM (RBF Kernel),1.0,1.0
2,RandomForest,1.0,1.0


In [34]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

ensemble = VotingClassifier(
    estimators=[
        ("lr", LogisticRegression(max_iter=2000, multi_class="multinomial")),
        ("svm", SVC(kernel="rbf", probability=True)),
        ("rf", RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42))
    ],
    voting="soft" 
)

ensemble.fit(embeddings, y)
preds = ensemble.predict(embeddings)

print("Ensemble Performance:\n")
print(classification_report(y, preds, target_names=le.classes_))




Ensemble Performance:

                      precision    recall  f1-score   support

       Cybersecurity       1.00      1.00      1.00         1
        Data Science       1.00      1.00      1.00         1
   Digital Marketing       1.00      1.00      1.00         1
Software Engineering       1.00      1.00      1.00         1

            accuracy                           1.00         4
           macro avg       1.00      1.00      1.00         4
        weighted avg       1.00      1.00      1.00         4



In [35]:
import numpy as np

def predict_top_k(text, k=3):
    emb = model.encode([text], convert_to_numpy=True)
    probs = ensemble.predict_proba(emb)[0]
    top_idx = np.argsort(probs)[-k:][::-1]
    
    results = []
    for i in top_idx:
        results.append({
            "career_domain": le.inverse_transform([i])[0],
            "confidence": round(float(probs[i]), 3)
        })
    
    return results

test_text = "I work with Python, machine learning models, and large datasets."
predict_top_k(test_text)


[{'career_domain': 'Software Engineering', 'confidence': 0.261},
 {'career_domain': 'Digital Marketing', 'confidence': 0.253},
 {'career_domain': 'Cybersecurity', 'confidence': 0.245}]

In [37]:
import joblib

pipeline = {
    "sbert_model_name": "all-MiniLM-L6-v2",
    "ensemble_model": ensemble,
    "label_encoder": le
}

joblib.dump(pipeline, "career_prediction_pipeline.pkl")

print("✅ Pipeline saved as career_prediction_pipeline.pkl")


✅ Pipeline saved as career_prediction_pipeline.pkl


In [38]:
joblib.dump(ensemble, "ensemble_model.pkl")
joblib.dump(le, "label_encoder.pkl")


['label_encoder.pkl']

In [39]:
def predict_career(text, model, label_encoder, embedder, top_k=3):
    emb = embedder.encode([text])
    probs = model.predict_proba(emb)[0]

    top_idx = probs.argsort()[-top_k:][::-1]
    
    return [
        {
            "career_domain": label_encoder.inverse_transform([i])[0],
            "confidence": float(probs[i])
        }
        for i in top_idx
    ]


In [40]:
def clean_text(t):
    return " ".join(t.split())
