In [None]:
import json
import os
import time

import pandas as pd
import requests

In [None]:
from openai import OpenAI


In [None]:
# Read the API key from the KLUSTER_API_KEY environment variable instead of
# typing it into the notebook, so the secret is never saved or committed with
# the file. Falls back to the original empty string when the variable is unset.
client = OpenAI(
    api_key=os.environ.get("KLUSTER_API_KEY", ""),
    base_url="https://api.kluster.ai/v1",
)

In [None]:
# Load the job descriptions to annotate. The generation loop further down reads
# each row's "input" column, so verify it exists here and fail fast instead of
# erroring part-way into the API run.
df = pd.read_csv("R1_input_data.csv")
assert "input" in df.columns, 'R1_input_data.csv must contain an "input" column'
df

In [None]:
# df = df[:4]  # uncomment to smoke-test the pipeline on only the first 4 rows


In [None]:
# Display the frame that the generation loop will iterate over.
df

In [None]:
def reasoning_prompt(job_description):
    """Build the 4-step skill-extraction prompt for a single job description.

    The fixed instruction text is followed by the job description wrapped in
    triple double-quotes on their own lines.

    Args:
        job_description: Text of the job posting; it is formatted into the
            prompt exactly as an f-string placeholder would render it.

    Returns:
        The complete prompt string (starts with a newline, ends with a newline).
    """
    # Static part of the prompt: contains no placeholders, so a plain literal is enough.
    instructions = """
You are an experienced skill extraction model. Your task is to extract skills from the given job description using a 4-step reasoning process. Be concise and precise in your reasoning.

Step 1: Understand the Role and Context
Describe how the job title fits into the company’s structure or industry. Consider the function, domain, and purpose of the role.

Step 2: Extract Explicit Skills
List all tools, technologies, certifications, and named skills that are clearly and directly mentioned in the job description.
For each skill, provide a short reason explaining why it was extracted.

Step 3: Infer Implicit Skills
Using the context from Step 1 and the explicit skills from Step 2, infer additional skills that are not directly stated but are clearly implied by the responsibilities or expectations.
For each inferred skill, provide a short reason tied to the job description or context.

Step 4: Thinking Log
show your detailed reasoning process for each step above. This will be used to help train smaller models to mimic your thinking.

Output format:
## Thinking
step 1: ...
step 2: ...
step 3: ...

## Skills
skill 1(implicit): reason 1
skill 2(explicit): reason 2
...

Begin analysis using the job description below:

"""
    # Dynamic part: the description fenced by triple double-quotes.
    fenced_description = f'"""\n{job_description}\n"""\n'
    return instructions + fenced_description

In [None]:
import re

def clean_output(output_text):
    """
    Remove reasoning wrapped in <think>...</think> from a model response.

    Besides well-formed blocks, two malformed shapes are handled so reasoning
    text does not leak into the saved output:
      * only a closing </think> is present (opening tag missing): everything
        up to and including the last </think> is dropped;
      * an opening <think> is never closed (e.g. the response was cut off):
        everything from that tag onward is dropped.

    Args:
        output_text: Raw response text, or None when the response has no content.

    Returns:
        The remaining text with surrounding whitespace stripped; "" for None.
    """
    if output_text is None:
        return ""

    # DOTALL lets "." span newlines; the non-greedy ".*?" keeps separate think
    # blocks from being merged into one match.
    cleaned_text = re.sub(r"<think>.*?</think>", "", output_text, flags=re.DOTALL)

    # Orphan closing tag: the reasoning precedes it, the answer follows it.
    if "</think>" in cleaned_text:
        cleaned_text = cleaned_text.rsplit("</think>", 1)[1]

    # Unclosed opening tag: whatever follows is unfinished reasoning.
    if "<think>" in cleaned_text:
        cleaned_text = cleaned_text.split("<think>", 1)[0]

    return cleaned_text.strip()

In [None]:
import os
import time
import re

In [None]:
# Generate one reasoning trace per job description and append it to a CSV as we
# go, so an interrupted run loses at most the row in flight.
output_csv = "reasoning_outputs.csv"

# Row position to resume from; rows before it are skipped. Use 0 for a fresh run.
start_idx = 592
# Number of API calls attempted in this run (reported in the summary at the end).
calls = 0
# Row positions whose request or save failed, kept so they can be retried later.
failed_indices = []

CHECKPOINT_EVERY = 25    # snapshot the output file and pause every N rows
RATE_LIMIT_SLEEP_S = 60  # length of that pause, in seconds

# Write the header once; every later row is appended without a header.
if not os.path.isfile(output_csv):
    pd.DataFrame(columns=["input", "output"]).to_csv(output_csv, index=False)

for idx in range(start_idx, len(df)):
    # idx is a position (it comes from range/len), so index positionally; this
    # is identical to label lookup on the default RangeIndex.
    job_description = df["input"].iloc[idx]
    prompt = reasoning_prompt(job_description)
    calls += 1

    try:
        completion = client.chat.completions.create(
            model="deepseek-ai/DeepSeek-R1",
            max_completion_tokens=4000,
            temperature=0.6,
            top_p=1,
            messages=[
                {
                    "role": "system",
                    "content": "You are an experienced skill extraction model.",
                },
                {"role": "user", "content": prompt},
            ],
        )

        raw_output = completion.choices[0].message.content
        if raw_output is None:
            # Treat an empty response as a failure instead of saving a blank row.
            raise ValueError("model returned no content")
        cleaned_output = clean_output(raw_output)

        current_row = pd.DataFrame({
            "input": [job_description],
            "output": [cleaned_output]
        })
        current_row.to_csv(output_csv, mode='a', index=False, header=False)

        print(f"Saved output for job description {idx+1}")
    except Exception as e:
        # Best effort: log, remember the row for a retry, and move on.
        failed_indices.append(idx)
        print(f"Error at index {idx}: {e}")

    # Checkpoint and pause outside the try block so they still happen when the
    # request for this particular row failed (e.g. because of a rate limit).
    if (idx + 1) % CHECKPOINT_EVERY == 0:
        try:
            checkpoint_df = pd.read_csv(output_csv)
            checkpoint_filename = f"checkpoint_{idx + 1}.csv"
            checkpoint_df.to_csv(checkpoint_filename, index=False)
            print(f"Checkpoint saved: {checkpoint_filename}")
        except Exception as e:
            print(f"Checkpoint failed at index {idx}: {e}")
        print(f"Sleeping for {RATE_LIMIT_SLEEP_S}s to respect rate limits")
        time.sleep(RATE_LIMIT_SLEEP_S)

print(f"Finished: {calls} calls attempted, {len(failed_indices)} failed")
if failed_indices:
    print(f"Failed row positions (retry these): {failed_indices}")


In [None]:
# Reload the rows accumulated in the output CSV from disk for inspection.
results = pd.read_csv("reasoning_outputs.csv")

In [None]:
# Display the reloaded results table.
results