## Installing required packages

In [1]:
!pip3 install numpy==1.26.4 pandas==2.2.2 psutil==5.9.8 scikit_learn==1.5.1 skillNer==1.0.3 spacy==3.7.4 transformers==4.41.2 accelerate==0.27.1 bitsandbytes==0.43.1 datasets==2.20.0 huggingface_hub==0.23.4 peft==0.11.1 torch==2.3.1 trl==0.9.4 skillNer==1.0.3



## Import essential packages and libraries

In [2]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# native packages
import sys
import os

# installed packages
import time
import spacy
import torch
import pandas as pd
from spacy.matcher import PhraseMatcher
from skillNer.general_params import SKILL_DB
from sklearn.metrics.pairwise import cosine_similarity
from skillNer.skill_extractor_class import SkillExtractor
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

## Global variables and Function Definitions from internal packages

#### Utils

- The content in the code cell below are copied from the `../laiser/utils.py` file

In [29]:
#utils
import numpy as np
import psutil
import logging


def cosine_similarity(vec1, vec2):
    """
    Calculates cosine similarity between 2 vectors

    Parameters
    ----------
    vec1, vec2 : numpy array of vectorized text

    Returns
    -------
    numeric value
    """
    product_of_magnitude = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if product_of_magnitude == 0.0:
        return 0.0
    return np.dot(vec1, vec2) / product_of_magnitude


def get_embedding(nlp, input_text):
    """
    Creates vector embeddings for input text based on nlp object

    Parameters
    ----------
    nlp : object of spacy nlp model
    input_text : text
        Provide text to be vectorized, usually skill, extracted of referenced

    Returns
    -------
    numpy array of vectorized text


    """
    doc = nlp(input_text)
    if len(doc) == 0:
        return np.zeros(300)  # Return zeros for empty texts
    return np.mean([word.vector for word in doc], axis=0)


def log_performance(function_name, start_time, end_time):
    """
    Utility function to log performance in unit of time for a function

    Parameters
    ----------
    function_name : text
        Name of the function
    start_time : time
        execution start time of the function
    end_time : time
        execution end time of the function

    """
    execution_time = end_time - start_time
    process = psutil.Process()
    cpu_percent = process.cpu_percent()
    memory_info = process.memory_info()
    memory_usage = memory_info.rss / (1024 ** 2)  # Convert to MB

    log_message = (
        f"Function: {function_name}\n"
        f"Execution time: {execution_time:.2f} seconds\n"
        f"CPU usage: {cpu_percent:.2f}%\n"
        f"Memory usage: {memory_usage:.2f} MB\n"
        "-------------------------------"
    )
    logging.info(log_message)
    print(log_message)


#### Params
- The content in the code cell below are copied from the `../laiser/params.py` file

In [32]:
import os

# Use a placeholder if __file__ is not available
if '__file__' in locals():
    ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
else:
    ROOT_DIR = os.getcwd()  # Use current working directory as a fallback

INPUT_PATH = os.path.join(ROOT_DIR, 'input')
OUTPUT_PATH = os.path.join(ROOT_DIR, 'output')

# SKILL_DB_PATH = os.path.join(INPUT_PATH, 'combined.csv')
SKILL_DB_PATH = 'https://raw.githubusercontent.com/phanindra-max/LAiSER-datasets/master/combined.csv'

SIMILARITY_THRESHOLD = 0.85

AI_MODEL_ID = 'google/gemma-2b-it'

#### LLM Methods
- The content in the code cell below are copied from the `../laiser/llm_methods.py` file

In [6]:
# llm_methods
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

torch.cuda.empty_cache()

def fetch_model_output(response):
    # Find the content between the model start tag and the last <eos> tag
    pattern = r'<start_of_turn>model\s*<eos>(.*?)<eos>\s*$'
    match = re.search(pattern, response, re.DOTALL)

    if match:
        content = match.group(1).strip()

        # Split the content by lines and filter out empty lines
        lines = [line.strip() for line in content.split('\n') if line.strip()]

        # Extract skills (lines starting with '-')
        skills = [line[1:].strip() for line in lines if line.startswith('-')]

        return skills

def get_completion_batch(queries: list, model, tokenizer, batch_size=2) -> list:
    device = "cuda:0"
    results = []

    prompt_template = """
    <start_of_turn>user
    Name all the skills present in the following description in a single list. Response should be in English and have only the skills, no other information or words. Skills should be keywords, each being no more than 3 words.
    Below text is the Description:

    {query}
    <end_of_turn>\n<start_of_turn>model
    """

    for i in range(0, len(queries), batch_size):
        batch = queries[i:i+batch_size]
        prompts = [prompt_template.format(query=query) for query in batch]

        encodeds = tokenizer(prompts, return_tensors="pt", add_special_tokens=True, padding=True, truncation=True)
        model_inputs = encodeds.to(device)

        with torch.no_grad():
            generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)

        decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)

        for full_output in decoded:
            # Extract only the model's response
            response = full_output.split("<start_of_turn>model<eos>")[-1].strip()
            processed_response = fetch_model_output(response)
            results.append(processed_response)

        # Clear CUDA cache after each batch
        torch.cuda.empty_cache()

        print(f"Processed batch {i//batch_size + 1}/{(len(queries)-1)//batch_size + 1}")

    return results

def get_completion(query: str, model, tokenizer) -> str:
    device = "cuda:0"

    prompt_template = """
    <start_of_turn>user
    Name all the skills present in the following description in a single list. Response should be in English and have only the skills, no other information or words. Skills should be keywords, each being no more than 3 words.
    Below text is the Description:

    {query}
    <end_of_turn>\n<start_of_turn>model
    """
    prompt = prompt_template.format(query=query)

    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

    model_inputs = encodeds.to(device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True, pad_token_id=tokenizer.eos_token_id)
    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
    response = decoded.strip()
    processed_response = fetch_model_output(response)
    return (processed_response)

## Skill Extractor Class

- Copied from `../laiser/skill_extractor.py` file



In [57]:
import numpy as np
from scipy.spatial.distance import cdist

class Skill_Extractor:
    """
    Class to extract skills from text and align them to existing taxonomy
    ...

    Attributes
    ----------
    client : HuggingFace API client
    nlp : spacy nlp model
        Short description

    Parameters
    ----------


    Methods
    -------
    extract_raw(input_text: text)
        The function extracts skills from text using NER model

    align_skills(raw_skills: list, document_id='0': string):
        This function aligns the skills provided to the desired taxonomy

    extractor(data: pandas dataframe, id_column='Research ID', text_column='Text'):
        Function takes text dataset to extract and aligns skills based on available taxonomies
    ....

    """

    def __init__(self):
        self.model_id = AI_MODEL_ID
        if torch.cuda.is_available():
            print("GPU is available. Using GPU for Fine-tuned Language model initialization.")
            torch.cuda.empty_cache()
            self.bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_id,
                quantization_config=self.bnb_config,
                device_map={"": 0},
                token="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"
            )
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, add_eos_token=True, padding_side='left', token="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")
            self.nlp = spacy.load("en_core_web_lg")
            # expo
            self.skill_db_df = pd.read_csv(SKILL_DB_PATH)
            self.skill_db_embeddings = np.array([get_embedding(self.nlp, label) for label in self.skill_db_df['SkillLabel']])
        else:
            print("GPU is not available. Using CPU for SkillNer model initialization.")
            self.nlp = spacy.load("en_core_web_lg")
            self.ner_extractor = SkillExtractor(self.nlp, SKILL_DB, PhraseMatcher)
        return

    # Declaring a private method for extracting raw skills from input text
    def extract_raw(self, input_text):
        """
        The function extracts skills from text using Fine-Tuned Language Model's API

        Parameters
        ----------
        input_text : text
            Job advertisement / Job Description / Syllabus Description / Course Outcomes etc.

        Returns
        -------
        list: List of extracted skills from text

        Notes
        -----
        More details on which (pre-trained) language model is fine-tuned can be found in llm_methods.py
        The Function is designed only to return list of skills based on prompt passed to OpenAI's Fine-tuned model.

        """

        if torch.cuda.is_available():
            # GPU is available. Using Language model for extraction.
            extracted_skills = get_completion(input_text, self.model, self.tokenizer)
            print("Extracted_skills: ", extracted_skills)
            extracted_skills_set = set(extracted_skills)
            torch.cuda.empty_cache()
        else:
            # GPU is not available. Using SkillNer model for extraction.
            ner_extractor = self.ner_extractor
            extracted_skills_set = set()
            annotations = None
            try:
                annotations = ner_extractor.annotate(input_text)
            except ValueError as e:
                print(f"Skipping example, ValueError encountered: {e}")
            except Exception as e:
                print(f"Skipping example, An unexpected error occurred: {e}")

            for item in annotations['results']['full_matches']:
                extracted_skills_set.add(item['doc_node_value'])

            # get ngram_scored
            for item in annotations['results']['ngram_scored']:
                extracted_skills_set.add(item['doc_node_value'])

        return list(extracted_skills_set)
        # return extracted_skills_set

    def align_skills(self, raw_skills, document_id='0'):
        """
        This function aligns the skills provided to the available taxonomy

        Parameters
        ----------
        raw_skills : list
            Provide list of skill extracted from Job Descriptions / Syllabus.

        Returns
        -------
        list: List of taxonomy skills from text in JSON format
            [
                {
                    "Research ID": text_id,
                    "Skill Name": Raw skill extracted,
                    "Skill Tag": taxonomy skill tag,
                    "Correlation Coefficient": similarity_score
                },
                ...
            ]

        """
        raw_skill_embeddings = np.array([get_embedding(self.nlp, skill) for skill in raw_skills])

        # Calculate cosine similarities in bulk
        similarities = 1 - cdist(raw_skill_embeddings, self.skill_db_embeddings, metric='cosine')

        matches = []
        for i, raw_skill in enumerate(raw_skills):
            skill_matches = np.where(similarities[i] > SIMILARITY_THRESHOLD)[0]
            for match in skill_matches:
                matches.append({
                    "Research ID": document_id,
                    "Raw Skill": raw_skill,
                    "Skill Tag": self.skill_db_df.iloc[match]['SkillTag'],
                    "Correlation Coefficient": similarities[i, match]
                })

        return matches

    def extractor(self, data, id_column='Research ID', text_column='Text'):
        """
        Function takes text dataset to extract and aligns skills based on available taxonomies

        Parameters
        ----------
        data : pandas dataframe
            Dataset containing text id and actual text to extract skills.
        id_column: string
            Name of id column in the dataset. Defaults to 'Research ID'
        text_column: string
            Name of the text column in the dataset. Defaults to 'Text'

        Returns
        -------
        list: List of skill tags and similarity_score for all texts in  from text in JSON format
            [
                {
                    "Research ID": text_id
                    "Skill Name": Raw skill extracted,
                    "Skill Tag": taxonomy skill tag,
                    "Correlation Coefficient": similarity_score
                },
                ...
            ]

        """
        start_time = time.time()
        extracted = pd.DataFrame(columns=['Research ID', 'Raw Skill', 'Skill Tag', 'Correlation Coefficient'])
        for index, row in data.iterrows():
            research_id = row[id_column]
            input_text = row[text_column]
            raw_skills = self.extract_raw(input_text)
            if(len(raw_skills) == 0):
                continue
            else:
              aligned_skills = self.align_skills(raw_skills, research_id)
              extracted = extracted._append(aligned_skills, ignore_index=True)
        end_time = time.time()
        log_performance('extractor', start_time, end_time)
        return extracted

## Using the Skill Extractor

In [58]:
# nlx_sample = pd.read_csv(os.path.join(INPUT_PATH, 'nlx_tx_sample_data_gwu.csv'))
# Import the dataset
nlx_sample = pd.read_csv('https://raw.githubusercontent.com/phanindra-max/LAiSER-datasets/master/nlx_tx_sample_data_gwu.csv')

nlx_sample = nlx_sample[['description', 'job_id']]

In [59]:
nlx_sample

Unnamed: 0,description,job_id
0,Req ID: 29534BR\n\nPOSITION SUMMARY\n\nThis po...,69322097
1,Enters data using computer applications. Assis...,70014023
2,"Kforce has a client in Austin, Texas (TX) that...",70241308
3,"*We believe that*, when done right, investing ...",70543388
4,**Description:** \nBaylor St. Luke’s Medical ...,70543468
5,The position will manage efforts and lead staf...,70546699
6,Job Duties: Filtering data by utilizing vari...,70547438
7,Facebook was built to help people connect and ...,70548340
8,Senior Systems Specialist - Data Center Facili...,70551885
9,Associate Systems Specialist - Data Center Fac...,70551922


In [60]:
# creating an object from the Extract class
se = Skill_Extractor() # runs __init__() method

GPU is available. Using GPU for Fine-tuned Language model initialization.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [61]:
# skills output based on the taxonomy database
output = se.extractor(nlx_sample, 'job_id', 'description')

Extracted_skills:  ['Responsible', 'Able', 'Clear', 'Strong', 'Excellent', 'Leadership', 'Consulting', 'Turning', 'Innovative']
Extracted_skills:  ['Computer Applications', 'Mechanical Inclined', 'Technical Repairs', 'Office Detail', 'Filing', 'Answering Phones', 'Scheduling', 'Product Delivery']


  extracted = extracted._append(aligned_skills, ignore_index=True)


Extracted_skills:  ['SQL', 'Data Migration', 'Financial Services Domain', 'SQL Server Development', 'Data Analysis']
Extracted_skills:  ['Data Modeling', 'Data Analysis', 'Data Visualization', 'ETL/ELT', 'Business Intelligence', 'Data Warehousing', 'Information Architecture', 'Data Architecture', 'Strategic Partnerships', 'Diversity and Inclusion', 'Data Governance', 'Data Management', 'Business Operations', 'Project Management']
Extracted_skills:  []
Extracted_skills:  ['Leadership', 'Business Intelligence (BI) / Advanced Analytics', 'Data Management', 'Statistical Modeling', 'Data Integration', 'Big Data Implementation', 'Team Management', 'Communication', 'Problem Solving', 'Critical Thinking']
Extracted_skills:  ['Data Analysis', 'Communication', 'Database Design', 'Data Modeling', 'Data Reporting', 'Data Visualization', 'Data Testing', 'Data Quality', 'Data Analysis', 'Excel', 'Statistical Analysis', 'Information Management']
Extracted_skills:  ['Networks', 'Server Hardware', 'Lin

In [62]:
# save the extracted skills to a csv file
print(output)
output.to_csv('extracted_skills_for_50Jobs.csv', index=False)

     Research ID                               Raw Skill Skill Tag  \
0       70014023                   Computer Applications    OSN.84   
1       70014023                   Computer Applications   OSN.128   
2       70014023                   Computer Applications   OSN.129   
3       70014023                   Computer Applications   OSN.131   
4       70014023                   Computer Applications   OSN.137   
...          ...                                     ...       ...   
9608    66143667  Troubleshooting and Problem Resolution   OSN.763   
9609    66143667  Troubleshooting and Problem Resolution   OSN.787   
9610    66143667  Troubleshooting and Problem Resolution   OSN.813   
9611    66143667  Troubleshooting and Problem Resolution   OSN.821   
9612    66143667  Troubleshooting and Problem Resolution   OSN.850   

      Correlation Coefficient  
0                    0.856481  
1                    0.857496  
2                    0.856146  
3                    0.865687  