In [1]:
import re


def process_experience(experience_dict: dict) -> dict:
    """
    Preprocess an experience entry from a LinkedIn profile by cleaning its text fields.

    Only the keys ``company``, ``title``, ``description`` and ``location`` are
    kept; every other key of the input is ignored.

    :param experience_dict: A dictionary describing a single experience entry.

    :return: A new dictionary holding exactly the four keys listed above. Each
        value is the cleaned text, or ``None`` when the field is missing, is not
        a string, or contains nothing but whitespace.
    """
    # Define the keys to extract and clean
    keys_to_extract = ['company', 'title', 'description', 'location']

    # Initialize an empty dictionary to store the preprocessed data
    preprocessed_data = {}

    for key in keys_to_extract:
        # Missing keys come back as None and fall through to the "no text" branch
        value = experience_dict.get(key)

        if isinstance(value, str):
            # Trim the ends, then collapse every run of whitespace
            # (spaces, tabs, new lines) into a single space
            cleaned_value = re.sub(r'\s+', ' ', value.strip())
        else:
            # Non-string values cannot be cleaned as text, so treat them as absent
            cleaned_value = None

        # A whitespace-only field cleans down to '', which is reported as None
        # so that "no usable text" always looks the same to the caller
        preprocessed_data[key] = cleaned_value or None

    return preprocessed_data

In [12]:
import json
from transformers import AutoTokenizer


def prepare_experiences_for_model_input(preprocessed_dict: dict, tokenizer: AutoTokenizer):
    """
    Build the chat prompt for one preprocessed experience and tokenize it.

    The experience is embedded as a compact JSON string inside a user message,
    preceded by a fixed system message describing the expected JSON output.

    :param preprocessed_dict: The preprocessed experience as a dictionary; it must be JSON-serializable.
    :param tokenizer: The tokenizer whose chat template is used to encode the prompt.

    :return: The result of ``tokenizer.apply_chat_template`` called with
        ``tokenize=True`` and ``return_tensors="pt"`` (the encoded prompt,
        including the generation prompt).
    """
    # ensure_ascii=False keeps accented and non-Latin characters as real text
    # instead of \uXXXX escapes, so the model sees the original wording
    experience_json = json.dumps(preprocessed_dict, indent=None, ensure_ascii=False)

    # Construct the system and user messages of the prompt
    prompt_messages = [
        {"role": "system", "content": """You are an AI assistant specialized in analyzing experiences from LinkedIn profiles. When provided with an entry, your task is to identify and extract key details based on the structure of the input. Use the information given to generate a JSON object that includes specific attributes like company type, job type, tags, and keywords. The output JSON should adhere to this format: {"company type": "<type_of_company_or_null>", "job type": "<type_of_job_or_null>", "tags": ["<tag1>", "<tag2>", "..."], "keywords": ["<keyword1>", "<keyword2>", "..."]} If an attribute is not mentioned in the input, set its value to null. Interpret the input responsibly to extract the relevant information without creating or assuming details not present in the input."""},
        {"role": "user", "content": f"For the given dictionary below, please extract the specified attributes: {experience_json} Please extract the relevant information from the provided dictionary and generate the JSON output accordingly."},
    ]

    # Tokenize the prompt; add_generation_prompt=True appends the assistant
    # header so the model starts answering right after the prompt
    input_ids = tokenizer.apply_chat_template(
        prompt_messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )

    # Return the input IDs
    return input_ids

In [9]:
# Sample experience entry used to try out the preprocessing step.
test_experience = {
    "starts_at": {"day": 1, "month": 5, "year": 2012},
    "ends_at": {"day": 31, "month": 8, "year": 2012},
    "company": "Limited Brands",
    "company_linkedin_profile_url": "https://www.linkedin.com/company/lbrands",
    "title": "Raw Materials Intern- MAST Global",
    # Adjacent string literals are joined by Python into one multi-line description
    "description": (
        "- Supported the day to day management of the print and pattern process\n"
        "- Worked cross functionally with Design, Tech Design, and Pre-production\n"
        "- Sat in on sketch line review and created PowerPoints to be sent out to our regional partners in order to receive costing\n"
        "- Communicated with our regional partners to follow up on lab-dips, strike-offs, and sampling as requested by the Design team\n"
        "- Completed numerous projects that were presented to co-workers for eye opening opportunities to further progress the growth of the brand"
    ),
    "location": "Columbus, Ohio Area",
    "logo_url": "https://media-exp1.licdn.com/dms/image/C4E0BAQHD8okj9rA0EQ/company-logo_100_100/0/1547228096275?e=1655942400&v=beta&t=2y-txs4X8PWRyl-UFfX_lGv0JtryM8MA7AQqNN6wlqo",
}

# Run the cleaning step on the sample and show the fields it keeps.
result = process_experience(test_experience)
print(result)

{'company': 'Limited Brands', 'title': 'Raw Materials Intern- MAST Global', 'description': '- Supported the day to day management of the print and pattern process - Worked cross functionally with Design, Tech Design, and Pre-production - Sat in on sketch line review and created PowerPoints to be sent out to our regional partners in order to receive costing - Communicated with our regional partners to follow up on lab-dips, strike-offs, and sampling as requested by the Design team - Completed numerous projects that were presented to co-workers for eye opening opportunities to further progress the growth of the brand', 'location': 'Columbus, Ohio Area'}


In [None]:
import re


def process_experience(experience_dict: dict) -> dict:
    """
    Preprocess an experience entry from a LinkedIn profile by cleaning its text fields.

    Only the keys ``company``, ``title``, ``description`` and ``location`` are
    kept; every other key of the input is ignored.

    :param experience_dict: A dictionary describing a single experience entry.

    :return: A new dictionary holding exactly the four keys listed above. Each
        value is the cleaned text, or ``None`` when the field is missing, is not
        a string, or contains nothing but whitespace.
    """
    # Define the keys to extract and clean
    keys_to_extract = ['company', 'title', 'description', 'location']

    # Initialize an empty dictionary to store the preprocessed data
    preprocessed_data = {}

    for key in keys_to_extract:
        # Missing keys come back as None and fall through to the "no text" branch
        value = experience_dict.get(key)

        if isinstance(value, str):
            # Trim the ends, then collapse every run of whitespace
            # (spaces, tabs, new lines) into a single space
            cleaned_value = re.sub(r'\s+', ' ', value.strip())
        else:
            # Non-string values cannot be cleaned as text, so treat them as absent
            cleaned_value = None

        # A whitespace-only field cleans down to '', which is reported as None
        # so that "no usable text" always looks the same to the caller
        preprocessed_data[key] = cleaned_value or None

    return preprocessed_data

In [13]:
import os
import dotenv
from transformers import AutoTokenizer


# Load environment variables
dotenv.load_dotenv(dotenv.find_dotenv())

# Resolve where the model lives; stop with a clear message when the variable
# is missing instead of handing None to from_pretrained
model_id = os.getenv("LLAMA_3_PATH")
if not model_id:
    raise RuntimeError("Environment variable LLAMA_3_PATH is not set")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Preprocess the experience
result = process_experience(test_experience)

# Get tokenized input ready for the model
tokenized_input = prepare_experiences_for_model_input(result, tokenizer)

# Print the token ids
print(tokenized_input)

# Decode the ids back to text to inspect the prompt the model will receive
print(tokenizer.decode(tokenized_input[0]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tensor([[128000, 128006,   9125, 128007,    271,   2675,    527,    459,  15592,
          18328,  28175,    304,  42118,  11704,    505,  33867,  21542,     13,
           3277,   3984,    449,    459,   4441,     11,    701,   3465,    374,
            311,  10765,    323,   8819,   1401,   3649,   3196,    389,    279,
           6070,    315,    279,   1988,     13,   5560,    279,   2038,   2728,
            311,   7068,    264,   4823,   1665,    430,   5764,   3230,   8365,
           1093,   2883,    955,     11,   2683,    955,     11,   9681,     11,
            323,  21513,     13,    578,   2612,   4823,   1288,  49553,    311,
            420,   3645,     25,   5324,  10348,    955,    794,   4145,   1337,
           3659,  34503,   8908,  15514,  21841,    330,   8975,    955,    794,
           4145,   1337,   3659,  20916,   8908,  15514,  21841,    330,  14412,
            794,   4482,     27,   4681,     16,  21841,   4145,   4681,     17,
          21841,  39813,   8

In [None]:
import torch
from transformers import AutoModelForCausalLM


# Preferred device: GPU if available, otherwise CPU.
# NOTE(review): no longer used for placement in this cell; kept as a module-level
# name in case other cells rely on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model. device_map="auto" already places the weights on the available
# device(s), so the model is not moved again with .to(device) afterwards.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Generate text based on the tokenized input; the ids are moved to the device
# the model reports so they live alongside its weights
outputs = model.generate(tokenized_input.to(model.device))

# Print the generated text (prompt and completion, special tokens included)
print(tokenizer.decode(outputs[0]))

In [None]:
import json


# Example usage: placeholder paths, replace with the real input/output files
input_files = 'path_to_input_file.json'
output_files = 'path_to_output_file.json'

# Open the input JSON file (explicit UTF-8 so the platform default is not used)
with open(input_files, 'r', encoding='utf-8') as file:
    data = json.load(file)  # Load the data from JSON file

results = []  # Initialize an empty list to hold the results

# Iterate over each profile in the data
for profile in data:
    # Each profile is expected to carry its id as {"_id": {"$oid": ...}}
    profile_id = profile["_id"]["$oid"]

    # Simulate processing the experiences and generate a dummy result
    # Here you might replace this with the actual processing logic
    result = {
        "_id": profile_id,
        "results": "Processed results here"  # Placeholder for your actual results
    }
    results.append(result)  # Append the processed result to the list

# Save the processed data to a new JSON file; the path variable defined above
# is output_files (the previous name output_file_path was never defined)
with open(output_files, 'w', encoding='utf-8') as outfile:
    json.dump(results, outfile, indent=4)  # Write JSON data with indentation
