In [1]:
import re


def process_experience(experience_dict: dict):
    """
    Preprocess a experience from a LinkedIn profile by cleaning text fields.
    
    :param experience_dict: A dictionary containing profile information.

    :return: A preprocessed dictionary with cleaned text fields.
    """
    # Define the keys to extract and clean
    keys_to_extract = ['company', 'title', 'description', 'location']
    
    # Initialize an empty dictionary to store the preprocessed data
    preprocessed_data = {}
    
    # Iterate through the required keys
    for key in keys_to_extract:
        # Extract the value from the profile dictionary or use an empty string if the key is not present
        value = experience_dict.get(key, '')
        
        # Clean up the text:
        # - Strip leading and trailing whitespace
        # - Replace multiple spaces with a single space
        # - Replace new line characters and tabs with a single space
        if value:
            cleaned_value = re.sub(r'\s+', ' ', value.strip())
        else:
            cleaned_value = None
        
        # Store the cleaned value in the preprocessed data dictionary
        preprocessed_data[key] = cleaned_value
    
    # Return the preprocessed data
    return preprocessed_data

In [2]:
import json
from transformers import AutoTokenizer


def prepare_experiences_for_model_input(preprocessed_dict: dict, tokenizer: AutoTokenizer):
    """
    A function to prepare the model input based on the preprocessed dictionary.

    :param preprocessed_dict: The preprocessed experience as a dictionary information.
    :param tokenizer: The tokenizer to use for encoding the input.

    :return: The model input as text or tokens.
    """
    # Convert the preprocessed dictionary to a JSON string and construct the prompt
    messages = [
        {"role": "system", "content": """You are an AI assistant specialized in analyzing experiences from LinkedIn profiles. When provided with an entry, your task is to identify and extract key details based on the structure of the input. Use the information given to generate a JSON object that includes specific attributes like company type, job type, tags, and keywords. The output JSON should adhere to this format: {"company type": "<type_of_company_or_null>", "job type": "<type_of_job_or_null>", "tags": ["<tag1>", "<tag2>", "..."], "keywords": ["<keyword1>", "<keyword2>", "..."]} If an attribute is not mentioned in the input, set its value to null. Interpret the input responsibly to extract the relevant information without creating or assuming details not present in the input."""},
        {"role": "user", "content": f"For the given dictionary below, please extract the specified attributes: {json.dumps(preprocessed_dict, indent=None)} Please extract the relevant information from the provided dictionary and generate the JSON output accordingly."},
    ]

    # Convert the messages into a prompt
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt"
    )
    """
    # Tokenize the prompt
    input_ids = tokenizer(
        prompt,
        return_tensors="pt"
    )
    """
    # Return the input IDs
    return input_ids

In [3]:
test_experience = {
      "starts_at": {
        "day": 1,
        "month": 5,
        "year": 2012
      },
      "ends_at": {
        "day": 31,
        "month": 8,
        "year": 2012
      },
      "company": "Limited Brands",
      "company_linkedin_profile_url": "https://www.linkedin.com/company/lbrands",
      "title": "Raw Materials Intern- MAST Global",
      "description": "- Supported the day to day management of the print and pattern process\n- Worked cross functionally with Design, Tech Design, and Pre-production\n- Sat in on sketch line review and created PowerPoints to be sent out to our regional partners in order to receive costing\n- Communicated with our regional partners to follow up on lab-dips, strike-offs, and sampling as requested by the Design team\n- Completed numerous projects that were presented to co-workers for eye opening opportunities to further progress the growth of the brand",
      "location": "Columbus, Ohio Area",
      "logo_url": "https://media-exp1.licdn.com/dms/image/C4E0BAQHD8okj9rA0EQ/company-logo_100_100/0/1547228096275?e=1655942400&v=beta&t=2y-txs4X8PWRyl-UFfX_lGv0JtryM8MA7AQqNN6wlqo"
    }

result = process_experience(test_experience)

print(result)

{'company': 'Limited Brands', 'title': 'Raw Materials Intern- MAST Global', 'description': '- Supported the day to day management of the print and pattern process - Worked cross functionally with Design, Tech Design, and Pre-production - Sat in on sketch line review and created PowerPoints to be sent out to our regional partners in order to receive costing - Communicated with our regional partners to follow up on lab-dips, strike-offs, and sampling as requested by the Design team - Completed numerous projects that were presented to co-workers for eye opening opportunities to further progress the growth of the brand', 'location': 'Columbus, Ohio Area'}


In [None]:
from huggingface_hub import notebook_login


notebook_login()

In [5]:
import transformers


# Load the model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Preprocess the experience
result = process_experience(test_experience)

# Get tokenized input ready for the model
tokenized_input = prepare_experiences_for_model_input(result, tokenizer)

# Print the tokenized input
print(tokenizer.decode(tokenized_input[0]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an AI assistant specialized in analyzing experiences from LinkedIn profiles. When provided with an entry, your task is to identify and extract key details based on the structure of the input. Use the information given to generate a JSON object that includes specific attributes like company type, job type, tags, and keywords. The output JSON should adhere to this format: {"company type": "<type_of_company_or_null>", "job type": "<type_of_job_or_null>", "tags": ["<tag1>", "<tag2>", "..."], "keywords": ["<keyword1>", "<keyword2>", "..."]} If an attribute is not mentioned in the input, set its value to null. Interpret the input responsibly to extract the relevant information without creating or assuming details not present in the input.<|eot_id|><|start_header_id|>user<|end_header_id|>

For the given dictionary below, please extract the specified attributes: {"company": "Limited Brands", "title": "Raw Materials Intern- MA

In [6]:
import torch
from transformers import AutoModelForCausalLM


# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
).to(device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
# Send tokens to gpu
# tokenized_input_gpu = {key: value.to(device) for key, value in tokenized_input.items()}
tokenized_input_gpu = tokenized_input.to(device)

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Generate text based on the tokenized input
outputs = model.generate(
    tokenized_input_gpu, 
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9
)

# Print the generated text
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an AI assistant specialized in analyzing experiences from LinkedIn profiles. When provided with an entry, your task is to identify and extract key details based on the structure of the input. Use the information given to generate a JSON object that includes specific attributes like company type, job type, tags, and keywords. The output JSON should adhere to this format: {"company type": "<type_of_company_or_null>", "job type": "<type_of_job_or_null>", "tags": ["<tag1>", "<tag2>", "..."], "keywords": ["<keyword1>", "<keyword2>", "..."]} If an attribute is not mentioned in the input, set its value to null. Interpret the input responsibly to extract the relevant information without creating or assuming details not present in the input.<|eot_id|><|start_header_id|>user<|end_header_id|>

For the given dictionary below, please extract the specified attributes: {"company": "Limited Brands", "title": "Raw Materials Intern- MA