In [1]:
import re
import json
from transformers import AutoTokenizer
import torch


def process_experience(experience_dict: dict) -> dict:
    """
    Preprocess a single experience from a LinkedIn profile by cleaning its text fields.

    Only the ``company``, ``title``, ``description`` and ``location`` fields are kept.
    For each of them, leading/trailing whitespace is stripped and every run of
    whitespace (spaces, tabs, new lines) is collapsed into a single space.

    :param experience_dict: A dictionary describing one experience.

    :return: A dictionary with exactly the four keys listed above. A field is ``None``
        when it is missing, falsy (``None``, ``''``, ...) or made up of whitespace only;
        otherwise it is the cleaned text.
    """
    # The fields to extract and clean, in the order they appear in the result
    keys_to_extract = ('company', 'title', 'description', 'location')

    preprocessed_data = {}

    for key in keys_to_extract:
        value = experience_dict.get(key)

        # Missing and falsy values are all normalised to None
        if not value:
            preprocessed_data[key] = None
            continue

        # Coerce non-string values (e.g. a numeric company name) instead of
        # crashing on the missing ``.strip()`` method
        text = value if isinstance(value, str) else str(value)

        cleaned_value = re.sub(r'\s+', ' ', text.strip())

        # A whitespace-only field cleans down to '' - report it as None so that
        # "no content" has a single representation in the output
        preprocessed_data[key] = cleaned_value or None

    return preprocessed_data


def prepare_experience_for_model_input(preprocessed_dict: dict, system_prompt: str, tokenizer: "AutoTokenizer") -> "torch.Tensor":
    """
    Build the chat prompt for one preprocessed experience and tokenize it.

    The prompt consists of two messages: a ``system`` message holding ``system_prompt``
    and a ``user`` message holding the experience serialised as compact JSON.
    The messages are passed to the tokenizer's chat template with the generation
    prompt appended.

    :param preprocessed_dict: The preprocessed experience as a dictionary.
    :param system_prompt: The prompt to use as the system message.
    :param tokenizer: A loaded tokenizer exposing ``apply_chat_template``.

    :return: The value returned by ``tokenizer.apply_chat_template`` when called with
        ``tokenize=True`` and ``return_tensors="pt"``, i.e. the input IDs (tokens) as a
        PyTorch tensor.
    """
    # ensure_ascii=False keeps non-ASCII text (bullets, accents, non-Latin scripts)
    # as-is; the default would turn every such character into a \uXXXX escape,
    # which the model then sees instead of the real text.
    experience_json = json.dumps(preprocessed_dict, indent=None, ensure_ascii=False)

    prompt_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": experience_json},
    ]

    # Tokenize the prompt; add_generation_prompt=True is passed so the template
    # ends with the marker that starts the assistant's reply
    input_ids = tokenizer.apply_chat_template(
        prompt_messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    )

    return input_ids


def preprocess_and_save_experiences(input_file_path, output_path, system_prompt: str, model_id_or_path: str):
    """
    Preprocess and tokenize every experience of every LinkedIn profile in a JSON file.

    The function reads the input JSON file (a list of profiles), cleans each experience
    with ``process_experience`` and tokenizes it with ``prepare_experience_for_model_input``.
    Each tokenized experience is saved with ``torch.save`` to its own file named
    ``input_<profile object id>_<experience number>.pt`` inside ``output_path``, where the
    experience number starts at 1 for every profile. After each profile, the running
    number of processed profiles is printed.

    :param input_file_path: The path to the input JSON file.
    :param output_path: The directory to save the ``.pt`` files in (created if missing).
    :param system_prompt: The prompt to use for the model input.
    :param model_id_or_path: The identifier or path for the model to use for tokenization.

    :raises ValueError: If ``input_file_path`` or ``output_path`` is empty or ``None``
        (e.g. an environment variable that was not set).
    """
    # Local import: ``os`` is not part of the import cell that accompanies this function
    import os

    # Fail early with a clear message instead of silently writing to a
    # directory literally called "None"
    if not input_file_path or not output_path:
        raise ValueError("input_file_path and output_path must both be provided")

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)

    # Open and load the JSON file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Make sure the target directory exists before the first save
    os.makedirs(output_path, exist_ok=True)

    # Iterate over each profile in the data, counting from 1
    for profile_counter, profile in enumerate(data, start=1):
        # A profile without (or with a null) "experiences" entry simply yields no files
        experiences = profile.get("experiences") or []

        # The experience counter restarts at 1 for every profile
        for experience_list_counter, experience in enumerate(experiences, start=1):
            # Preprocess the experience
            preprocessed_experience = process_experience(experience)

            # Prepare the experience for the model input
            encoded_experience = prepare_experience_for_model_input(
                preprocessed_dict=preprocessed_experience,
                system_prompt=system_prompt,
                tokenizer=tokenizer,
            )

            # Serialize each experience individually and save it to a file
            file_name = f'input_{profile["_id"]["$oid"]}_{experience_list_counter}.pt'
            torch.save(encoded_experience, os.path.join(output_path, file_name))

        # Print the number of profiles processed so far
        print(profile_counter)

In [12]:
# Two hand-written sample experiences: one with a long multi-line description
# and one whose description is missing (None).
test_experiences = [
    {
        "company": "University of Engineering and Technology Peshawar, Pakistan",
        "title": "R&D Consultant",
        "description": (
            "UET, Peshawar has undertaken various projects that required the expertise of a R&D based Biomedical Engineer. "
            "I provided my expertise to the students and staff in these projects and increased the productivity of the department. \n\n"
            "Additionally, I\n"
            "• Identified the need for 3D printers for the department\n"
            "• Investigated 3D printers that would be best suited for the training individuals\n"
            "• Designed a certified course to train the students and staff on the essentials of 3D printing\n"
            "• Trained individuals including but not limited to, students, professionals and artists"
        ),
        "location": "Pakistan",
    },
    {
        "company": "Google",
        "title": "Software Engineering Intern",
        "description": None,
        "location": "Venice, California",
    },
]

# Show every sample before and after cleaning
for experience in test_experiences:
    print("Before processing:", experience, sep="\n")
    print("\nAfter processing:", process_experience(experience), sep="\n")

Before processing:
{'company': 'University of Engineering and Technology Peshawar, Pakistan', 'title': 'R&D Consultant', 'description': 'UET, Peshawar has undertaken various projects that required the expertise of a R&D based Biomedical Engineer. I provided my expertise to the students and staff in these projects and increased the productivity of the department. \n\nAdditionally, I\n• Identified the need for 3D printers for the department\n• Investigated 3D printers that would be best suited for the training individuals\n• Designed a certified course to train the students and staff on the essentials of 3D printing\n• Trained individuals including but not limited to, students, professionals and artists', 'location': 'Pakistan'}

After processing:
{'company': 'University of Engineering and Technology Peshawar, Pakistan', 'title': 'R&D Consultant', 'description': 'UET, Peshawar has undertaken various projects that required the expertise of a R&D based Biomedical Engineer. I provided my ex

In [13]:
import os
import dotenv
    

# Load environment variables from the .env file located by find_dotenv()
dotenv.load_dotenv(dotenv.find_dotenv())

# Indexing os.environ raises a KeyError naming the variable when it is not set,
# instead of handing None to from_pretrained
tokenizer = AutoTokenizer.from_pretrained(os.environ["LLAMA_3_PATH"])

# Read the system prompt (explicit encoding, matching how the input JSON is read)
with open('prompt.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    sys_prompt = file.read()

for experience in test_experiences:
    # Clean the experience once and reuse the result for tokenization
    preprocessed_experience = process_experience(experience)
    tokenized_input = prepare_experience_for_model_input(
        preprocessed_experience, sys_prompt, tokenizer)
    
    print("Encoded input:")
    print(tokenized_input)
    
    # Decode the first (only) row back to text to check the prompt by eye
    print("\nDecoded input:")
    print(tokenizer.decode(tokenized_input[0], skip_special_tokens=True))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Encoded input:
tensor([[128000, 128006,   9125, 128007,    271,   2675,    527,    459,  15592,
          18328,  28175,    304,  42118,  11704,    505,  33867,  21542,     13,
           3277,   3984,    449,    459,   3217,     11,    701,   3465,    374,
            311,  14532,    323,   8819,   1401,   3649,     11,  21760,    389,
          49183,    539,   1120,  11720,   7512,    719,   1101,  18479,   6305,
             11,   4279,   5064,  12659,     11,    323,    279,    503,  71921,
           1511,    304,    279,  10706,    627,   2520,    279,   2728,   3217,
             11,   7068,    264,   4823,   1665,    430,   5764,   8365,   1778,
            439,   5064,     11,   4913,     11,   7512,     11,    323,   9681,
             13,    578,  33289,   1288,   8881,   2225,   9539,  11224,   7512,
            323,  68695,   7512,   3196,    389,    279,   2317,    323,   5064,
          40851,     13,  21829,    279,   2768,   8365,    369,    279,   2612,
            5

In [None]:
import os
import dotenv
    

# Load environment variables
dotenv.load_dotenv(dotenv.find_dotenv())

# Read the system prompt (explicit encoding, matching how the input JSON is read)
with open('prompt.txt', 'r', encoding='utf-8') as file:
    # Read the entire content of the file
    sys_prompt = file.read()

# Run the preparation function.
# The paths are read with os.environ[...] so that an unset variable raises a
# KeyError naming it, instead of being passed on as None.
# NOTE(review): this run tokenizes with Phi-3, while the demo cell above uses the
# tokenizer at LLAMA_3_PATH - confirm which model the saved inputs are meant for.
preprocess_and_save_experiences(
    input_file_path=os.environ["INPUT_FILE_PATH"],
    output_path=os.environ["OUTPUT_PATH"],
    system_prompt=sys_prompt,
    model_id_or_path='microsoft/Phi-3-mini-128k-instruct'
)