In [None]:
# Upgrade pip and install the dev build of LAiSER
!pip install -U pip -q
!pip install dev-laiser -q          

# Verify that Torch sees the GPU
import torch, platform, subprocess, os, sys
print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
!nvidia-smi -L || echo "⚠️  No GPU detected – Runtime ▸ Change runtime type ▸ GPU"


torch 2.6.0+cu124 | CUDA available: False
/bin/bash: line 1: nvidia-smi: command not found
⚠️  No GPU detected – Runtime ▸ Change runtime type ▸ GPU


In [None]:
from google.colab import files
uploaded = files.upload()           # If you're reading this you'll need to upload and choose DAFECD‑31‑Oct‑24.csv & DAFOCD‑31‑Oct‑24.csv in your google colab session
print("Uploaded:", list(uploaded.keys()))


Saving afsc_clean.csv to afsc_clean.csv
Uploaded: ['afsc_clean.csv']


In [9]:
import os, glob, re, pandas as pd

# Ensure filenames end with ".csv"
for base in ["DAFOCD-31-Oct-24", "DAFECD-31-Oct-24"]:
    if os.path.exists(base) and not base.endswith(".csv"):
        os.rename(base, base + ".csv")

print("Current CSVs in /content:")
!ls -1 *.csv


Current CSVs in /content:
afsc_clean.csv
DAFECD-31-Oct-24.csv
DAFOCD-31-Oct-24.csv


In [10]:
def parse_afsc_file(path):
    df = pd.read_csv(path, header=None, dtype=str).fillna("")
    lines = df[0].tolist()

    pat = re.compile(r"^(?:CEM Code|AFSC)\s", re.I)
    chunks, code, buf = {}, None, []

    for line in lines:
        s = line.strip()
        if pat.match(s):            # new AFSC section
            if code:
                chunks[code] = "\n".join(buf)
            code, buf = s, []
        else:
            buf.append(s)
    if code:
        chunks[code] = "\n".join(buf)

    return pd.DataFrame({"AFSC_Code": chunks.keys(),
                         "description": chunks.values()})

# Parse every uploaded CSV that starts with DAF/DAO
dfs = [parse_afsc_file(p) for p in glob.glob("DA*OCD*.csv")]
df_afsc = pd.concat(dfs, ignore_index=True)

print(f"Parsed rows: {len(df_afsc)}")
df_afsc.head()


Parsed rows: 406


Unnamed: 0,AFSC_Code,description
0,AFSC 10C0,OPERATIONS COMMANDER\n(Changed 31 Oct 08)\n1. ...
1,"AFSC 10C0, completion of a current T5 Investig...","\n\n\n\n32\nDAFOCD, 31 Oct 24\nPILOT UTILIZATI..."
2,"AFSC 11B4*, Staff",
3,"AFSC 11B3*, Aircraft Commander",
4,"AFSC 11B2*, Qualified Pilot/Copilot",


In [None]:
# Modified cell 5 to force CPU mode explicitly
from laiser.skill_extractor import Skill_Extractor
se = Skill_Extractor(
    AI_MODEL_ID=None,        # default model
    HF_TOKEN="",  # Add your hugging face token here if you have one, I deleted mine for security reasons
    use_gpu=False            # Explicitly force CPU mode
)
results = se.extractor(
    data=df_afsc,
    id_column="AFSC_Code",
    text_columns=["description"],
    input_type="job_desc",
    batch_size=4             # reduced batch size for safety
)
print("✅ Extraction finished – total rows:", len(results))
display(results.head())

Found 'en_core_web_lg' model. Loading...
GPU is not available. Using CPU for SkillNer model initialization...
loading full_matcher ...
loading abv_matcher ...
loading full_uni_matcher ...
loading low_form_matcher ...
loading token_matcher ...


  input_text = input_text[text_columns][0]
  extracted = extracted._append(aligned_skills, ignore_index=True)
  input_text = input_text[text_columns][0]
  vec_similarity = token1.similarity(token2)
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  vec_similarity = token1.similarity(token2)
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  vec_similarity = token1.similarity(token2)
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  vec_similarity = token1.similarity(token2)
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  input_text = input_text[text_columns][0]
  input_text = input_t

✅ Extraction finished – total rows: 2352


Unnamed: 0,Research ID,Raw Skill,Skill Tag,Correlation Coefficient
0,AFSC 10C0,management,ESCO.565,0.869502
1,AFSC 10C0,management,ESCO.888,0.875503
2,AFSC 10C0,operations training,OSN.739,0.853683
3,AFSC 10C0,information system,ESCO.265,0.858631
4,AFSC 10C0,information system,ESCO.327,0.86652


In [12]:
results.to_csv("afsc_skills_laiser.csv", index=False)
from google.colab import files
files.download("afsc_skills_laiser.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>