In [None]:
!pip install -q git+https://github.com/huggingface/accelerate.git
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q bitsandbytes
!pip install -q langchain 

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Single source of truth for the checkpoint so model and tokenizer cannot drift.
MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

device = "cuda"

# Load the weights 4-bit quantized via bitsandbytes. The quantization settings
# are passed through `quantization_config`; the bare `load_in_4bit=True` kwarg
# on `from_pretrained` is deprecated in transformers.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)


In [None]:
import json
import os

import numpy as np
import pandas as pd

# Source dataset: https://www.kaggle.com/datasets/arshkon/linkedin-job-postings
jobs_posting_file_path = r'/kaggle/input/linkedin-job-postings/job_postings.csv'
df = pd.read_csv(jobs_posting_file_path)

# Checkpoint of already-extracted records; lets an interrupted run resume.
save_filename = "/kaggle/working/data.json"

if os.path.exists(save_filename):
    # Read from `save_filename` itself (not a repeated literal) so changing the
    # path above changes both the existence check and the load.
    with open(save_filename, "r", encoding="utf-8") as file:
        data = json.load(file)
    # Later cells append to `data` and iterate it as a list of dicts; fail
    # early with a clear message if the checkpoint has a different shape.
    if not isinstance(data, list):
        raise ValueError(
            f"Expected a JSON list in {save_filename}, got {type(data).__name__}"
        )
else:
    data = []

In [None]:
import json
import os

from tqdm.auto import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Tunables for chunking, generation and checkpointing.
CHUNK_SIZE = 2500
CHUNK_OVERLAP = 250
MAX_NEW_TOKENS = 1024
TEMPERATURE = 0.2
SAVE_EVERY = 50  # write a checkpoint once at least this many new records exist

# Instruction text appended after each job-description chunk. Kept
# character-for-character identical to the prompt used for earlier records so
# outputs stay comparable across runs.
EXTRACTION_INSTRUCTIONS = (
    "\n\nFrom the job advert above, extract all information and additional context that fits under the labels stated below, \n"
    "Then output the information in a JSON format using the labels as the key and all revelevant the information in a array of strings.\n"
    "Inside each key want you to create a string array and extract relevant lines from the text to that key.\n"
    "Do not create more keys under the listed labels below, for each label all extracted information should be within one single string array.\n"
    "Labels:\n"
    "-Required Education\n"
    "-Required Certification\n"
    "-Required Qualifications\n"
    "-Required work experience\n"
    "-Required Hard Skills\n"
    "-Required Soft skills\n"
    "-Benefits \n"
    "-Company culture/values.\n"
    "-Job duties"
)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)

# NOTE: no `model.to("cuda")` here. The model is loaded 4-bit quantized with
# bitsandbytes, which places it on the GPU at load time; transformers documents
# `.to()` as unsupported on such models (it raises in many releases).


def build_messages(description):
    """Return the single-turn chat message list for one description chunk."""
    return [{"role": "user", "content": f"{description}{EXTRACTION_INSTRUCTIONS}"}]


def generate_extraction(description):
    """Run the model on one description chunk and return only its reply text.

    The prompt is tokenized with the tokenizer's chat template and the reply
    is recovered by slicing off the prompt tokens, so the result does not
    depend on any model-specific role marker appearing in the decoded text.
    """
    inputs = tokenizer.apply_chat_template(
        build_messages(description),
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,  # also yields the attention mask for generate()
    ).to(model.device)
    tokens = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns prompt + completion; keep only the completion.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(tokens[0][prompt_length:], skip_special_tokens=True)


def save_checkpoint(records, path):
    """Write `records` to `path` as JSON via a temp file and atomic rename.

    Writing to a sibling temp file first means an interruption mid-write
    cannot leave a truncated checkpoint behind.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as file:
        json.dump(records, file)
    os.replace(tmp_path, path)


# Set of finished job ids gives O(1) resume checks instead of rescanning `data`.
processed_job_ids = {item["job_id"] for item in data}
last_saved_count = len(data)

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    job_id = row["job_id"]
    if hasattr(job_id, "item"):
        # Unwrap numpy scalars so the id is JSON-serializable.
        job_id = job_id.item()
    if job_id in processed_job_ids:
        continue

    full_description = row["description"]
    # Missing descriptions are read as NaN (a float); there is nothing to split.
    if not isinstance(full_description, str) or not full_description.strip():
        continue

    for description in text_splitter.split_text(full_description):
        output = generate_extraction(description)
        data.append({"job_id": job_id, "description": description, "output": output})
    processed_job_ids.add(job_id)

    # Checkpoint only between jobs: a job whose chunks were partly saved would
    # be skipped entirely on resume, silently losing the remaining chunks.
    if len(data) - last_saved_count >= SAVE_EVERY:
        save_checkpoint(data, save_filename)
        last_saved_count = len(data)
        print(f"Saved {len(data)} items to {save_filename}")

# Flush whatever was produced since the last periodic checkpoint.
if len(data) != last_saved_count:
    save_checkpoint(data, save_filename)
    print(f"Saved {len(data)} items to {save_filename}")
