In [None]:
import re
import json
from transformers import AutoTokenizer
import torch


def process_experience(experience_dict: dict) -> dict:
    """
    Function to preprocess an experience from a LinkedIn profile by cleaning text fields.
    The function extracts the company, title, description, and location fields from the input dictionary.
    Leading/trailing whitespace is removed and every run of whitespace (spaces, tabs, new lines)
    is collapsed to a single space.
    Fields that are missing, None, empty or made of whitespace only are stored as None.

    :param experience_dict: A dictionary containing the raw fields of one experience
                            (values are expected to be strings or None).

    :return: A dictionary with exactly the keys company, title, description and location,
             each mapped to the cleaned string or to None.
    """
    # Define the keys to extract and clean
    keys_to_extract = ('company', 'title', 'description', 'location')

    # Initialize an empty dictionary to store the preprocessed data
    preprocessed_data = {}

    for key in keys_to_extract:
        value = experience_dict.get(key)

        # Clean up text (falsy values such as None or '' are not cleaned at all)
        cleaned_value = re.sub(r'\s+', ' ', value.strip()) if value else None

        # A whitespace-only value is '' after stripping; map it to None as well so that
        # "no content" always has one single representation in the output.
        preprocessed_data[key] = cleaned_value or None

    return preprocessed_data


def prepare_experience_for_model_input(preprocessed_dict: dict, system_prompt: str, tokenizer: AutoTokenizer) -> dict:
    """
    Function to turn one preprocessed experience into tokenized model input.
    A two-message chat is built: the system prompt with the "system" role, followed by the
    preprocessed dictionary serialized as single-line JSON with the "user" role.
    The chat is passed through the tokenizer's chat template with the generation prompt appended.

    :param preprocessed_dict: The preprocessed experience as a dictionary.
    :param system_prompt: The prompt to use for the model input.
    :param tokenizer: The tokenizer to use for encoding the input.

    :return: The tokenized chat as returned by tokenizer.apply_chat_template with return_tensors="pt".
             NOTE(review): the signature says dict, but this is presumably a tensor of input IDs - confirm.
    """
    # Serialize the experience first, then assemble the two chat messages
    user_content = json.dumps(preprocessed_dict, indent=None)
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_content}

    # Apply the chat template and tokenize in a single step
    return tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )


def preprocess_and_save_experiences(input_file_path, output_path, system_prompt: str, model_id_or_path: str):
    """
    Function to tokenize every experience of every profile in a JSON file and store the results on disk.
    Each experience is cleaned with process_experience, converted to model input with
    prepare_experience_for_model_input and written with torch.save to
    '<output_path>/input_<profile oid>_<n>.pt', where n is the 1-based position of the
    experience inside its profile. A running profile count is printed after each profile.

    :param input_file_path: The path to the input JSON file.
    :param output_path: The directory in which the .pt files are written.
    :param system_prompt: The prompt to use for the model input.
    :param model_id_or_path: The identifier or path for the model to use for tokenization.
    """
    # The tokenizer is loaded once and reused for all experiences
    tokenizer = AutoTokenizer.from_pretrained(model_id_or_path)

    # Read all profiles into memory
    with open(input_file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for profile_number, profile in enumerate(data, start=1):
        for experience_number, experience in enumerate(profile["experiences"], start=1):
            # Clean the experience and tokenize it together with the system prompt
            encoded_experience = prepare_experience_for_model_input(
                preprocessed_dict=process_experience(experience),
                system_prompt=system_prompt,
                tokenizer=tokenizer,
            )

            # One file per experience, named after the profile id and the experience position
            torch.save(encoded_experience, f'{output_path}/input_{profile["_id"]["$oid"]}_{experience_number}.pt')

        # Progress output: number of profiles handled so far
        print(profile_number)

In [None]:
# Two sample experiences for a manual check of process_experience:
# the first has a multi-line description with bullet points, the second has no description (None).
test_experiences = [{
      "company": "University of Engineering and Technology Peshawar, Pakistan",
      "title": "R&D Consultant",
      "description": "UET, Peshawar has undertaken various projects that required the expertise of a R&D based Biomedical Engineer. I provided my expertise to the students and staff in these projects and increased the productivity of the department. \n\nAdditionally, I\n• Identified the need for 3D printers for the department\n• Investigated 3D printers that would be best suited for the training individuals\n• Designed a certified course to train the students and staff on the essentials of 3D printing\n• Trained individuals including but not limited to, students, professionals and artists",
      "location": "Pakistan"
    },
    {
      "company": "Google",
      "title": "Software Engineering Intern",
      "description": None,
      "location": "Venice, California"
    }
]

# Print every sample before and after cleaning so the effect can be compared by eye
for experience in test_experiences:
    print("Before processing:")
    print(experience)
    print("\nAfter processing:")
    print(process_experience(experience))

In [None]:
import os
import dotenv
    

# Load environment variables from the nearest .env file (LLAMA_3_PATH is read below)
dotenv.load_dotenv(dotenv.find_dotenv())

# Load the tokenizer from the path stored in LLAMA_3_PATH
tokenizer = AutoTokenizer.from_pretrained(os.getenv("LLAMA_3_PATH"))

# Read the system prompt
with open('prompt.txt', 'r') as file:
    # Read the entire content of the file
    sys_prompt = file.read()

for experience in test_experiences:
    # Clean the experience once and reuse the result for tokenization
    preprocessed_experience = process_experience(experience)
    tokenized_input = prepare_experience_for_model_input(
        preprocessed_experience, sys_prompt, tokenizer)
    
    print("Encoded input:")
    print(tokenized_input)
    
    # Decode the first sequence again to check the round trip by eye
    print("\nDecoded input:")
    print(tokenizer.decode(tokenized_input[0], skip_special_tokens=True))

In [None]:
import os
import dotenv
    

# Load environment variables
dotenv.load_dotenv(dotenv.find_dotenv())

# Read the system prompt
with open('prompt.txt', 'r') as file:
    # Read the entire content of the file
    sys_prompt = file.read()

# Run the preparation function
# Input file and output directory come from the INPUT_FILE_PATH and OUTPUT_PATH environment variables.
# NOTE(review): os.getenv returns None for an unset variable - presumably both must be set in .env; confirm.
preprocess_and_save_experiences(
    input_file_path=os.getenv("INPUT_FILE_PATH"),
    output_path= os.getenv("OUTPUT_PATH"),
    system_prompt=sys_prompt,
    model_id_or_path='microsoft/Phi-3-mini-128k-instruct'
)

# Phi-3 pipeline

In [None]:
import re
import json
import random
from transformers import AutoTokenizer


def preprocess_object(
        object_dict: dict, 
        keys_to_extract: "list | tuple" = ('company', 'title', 'description', 'location')
        ) -> dict:
    """
    This function preprocesses a dictionary containing profile information by cleaning text fields.
    Leading/trailing whitespace is removed and every run of whitespace (spaces, tabs, new lines)
    is collapsed to a single space.
    Fields that are missing, None, empty or made of whitespace only are stored as None.
    
    :param object_dict: A dictionary containing profile information
                        (values are expected to be strings or None).
    :param keys_to_extract: The keys to extract and clean from the input dictionary.
                            The default is an immutable tuple so it cannot be modified between calls.

    :return: A preprocessed dictionary containing exactly the requested keys,
             each mapped to the cleaned string or to None.
    """
    # Initialize an empty dictionary to store the preprocessed data
    preprocessed_data = {}
    
    for key in keys_to_extract:
        value = object_dict.get(key)
        
        # Clean up text (falsy values such as None or '' are not cleaned at all)
        cleaned_value = re.sub(r'\s+', ' ', value.strip()) if value else None
        
        # A whitespace-only value is '' after stripping; map it to None as well so that
        # "no content" always has one single representation in the output.
        preprocessed_data[key] = cleaned_value or None
    
    return preprocessed_data


def format_prompt(input_dict: dict, system_prompt: str, model_tokenizer: AutoTokenizer) -> str:
    """
    Function to render the system prompt and the input dictionary as one chat-formatted prompt string.
    Both messages are sent with the "user" role: first the system prompt, then the input
    dictionary serialized as single-line JSON. The generation prompt is appended and the
    result is not tokenized.

    :param input_dict: The input dictionary to format.
    :param system_prompt: The system prompt to use for the model input.
    :param model_tokenizer: The tokenizer whose chat template is applied.

    :return: The formatted prompt message as a single string.
    """
    # Assemble the two messages of the conversation
    instruction_message = {"role": "user", "content": system_prompt}
    payload_message = {"role": "user", "content": json.dumps(input_dict, indent=None)}

    # Render the conversation with the tokenizer's chat template
    return model_tokenizer.apply_chat_template(
        [instruction_message, payload_message],
        tokenize=False,
        add_generation_prompt=True,
    )


def encode_prompt(string_prompt: str, tokenizer: AutoTokenizer, seed: "int | None" = None):
    """
    Function to seed torch's random number generator and tokenize a prompt string.

    :param string_prompt: The already formatted prompt to tokenize.
    :param tokenizer: The tokenizer to call on the prompt.
    :param seed: The seed passed to torch.random.manual_seed. When omitted, a random seed
                 between 0 and 1000 is drawn for this call.

    :return: The result of calling the tokenizer on the prompt with return_tensors="pt".
    """
    # Draw the fallback seed at call time. A random default written in the signature would be
    # evaluated only once, when the function is defined, and then silently reused by every call.
    if seed is None:
        seed = random.randint(0, 1000)

    # Set seed for reproducibility
    torch.random.manual_seed(seed)

    # Tokenize the prompt and return the encoding
    return tokenizer(string_prompt, return_tensors="pt")

In [None]:
import os
import dotenv
import torch
from transformers import AutoTokenizer 


# Tokenize every experience on its own (no padding) and save input IDs plus attention mask per experience.

# Load environment variables
dotenv.load_dotenv(dotenv.find_dotenv())

# Set the output path
output_path = os.getenv("OUTPUT_PATH")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-128k-instruct')

# Read the system prompt
with open('prompt.txt', 'r') as file:
    sys_prompt = file.read()

# Open and load the JSON file
with open(os.getenv("INPUT_FILE_PATH"), 'r', encoding='utf-8') as file:
    data = json.load(file)

# Find the length of the longest prompt
# NOTE(review): len() of the formatted string counts characters, not tokens, and within this cell
# max_prompt_length is only referenced by the commented-out padding arguments below.
max_prompt_length = 0

for profile in data:
    for experience in profile["experiences"]:
        preprocessed_experience = preprocess_object(experience, keys_to_extract=['company', 'title', 'description'])
        prompt_length = len(format_prompt(preprocessed_experience, system_prompt=sys_prompt, model_tokenizer=tokenizer))
        max_prompt_length = max(max_prompt_length, prompt_length)

# Initialize counter
profile_counter = 0

# Iterate over each profile in the data
for profile in data:
    # 1-based position of the experience inside its profile, used in the file name
    experience_list_counter = 1
    
    for experience in profile["experiences"]:
        # Only company, title and description are kept here (location is left out)
        preprocessed_experience = preprocess_object(experience, keys_to_extract=['company', 'title', 'description'])
        formatted_experience_prompt = format_prompt(preprocessed_experience, system_prompt=sys_prompt, model_tokenizer=tokenizer)
        
        # Encode the prompt; padding is currently disabled (see the commented-out arguments)
        encoded_prompt = tokenizer(
            formatted_experience_prompt,
            #padding='max_length',
            #max_length=max_prompt_length,
            truncation=False,
            return_tensors='pt'
        )
        
        # Keep only the first row of each tensor
        # NOTE(review): presumably this drops the batch dimension, (1, seq_len) -> (seq_len,) - confirm
        encoded_experience = {
            'input_ids': encoded_prompt['input_ids'][0],
            'attention_mask': encoded_prompt['attention_mask'][0]
        }
        print(encoded_experience)
        
        # One file per experience, named after the profile id and the experience position
        torch.save(encoded_experience, f'{output_path}/input_{profile["_id"]["$oid"]}_{experience_list_counter}.pt')
        
        experience_list_counter += 1
    
    # Progress output: number of profiles handled so far
    profile_counter += 1
    print(profile_counter)

In [None]:
import torch
from transformers import AutoModelForCausalLM
import time


# Time one model.generate call for each of two saved token files.

# Load the model
model = AutoModelForCausalLM.from_pretrained('microsoft/Phi-3-mini-128k-instruct')

# Set the paths to the token files
# NOTE(review): both paths are empty placeholders - fill them in before running this cell.
file_path_1 = ""
file_path_2 = ""

# Load the token files
# NOTE(review): torch.load may unpickle arbitrary objects depending on the torch version and
# weights_only setting - only load files produced by this notebook.
encoded_experience_1 = torch.load(file_path_1)
encoded_experience_2 = torch.load(file_path_2)

# Move the model and input tensors to the GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Only the input IDs are used; an attention mask stored in the files is not passed to generate
# NOTE(review): assumes the stored 'input_ids' already have the shape generate expects - confirm
input_ids_1 = encoded_experience_1['input_ids'].to(device)
input_ids_2 = encoded_experience_2['input_ids'].to(device)

# Generate output for the first token file
start_time_1 = time.time()
with torch.no_grad():
    output_1 = model.generate(input_ids=input_ids_1, max_new_tokens=128)
end_time_1 = time.time()

# Generate output for the second token file
start_time_2 = time.time()
with torch.no_grad():
    output_2 = model.generate(input_ids=input_ids_2, max_new_tokens=128)
end_time_2 = time.time()

# Calculate the time difference for each generation
time_diff_1 = end_time_1 - start_time_1
time_diff_2 = end_time_2 - start_time_2

print(f"Generation time for token file 1: {time_diff_1:.4f} seconds")
print(f"Generation time for token file 2: {time_diff_2:.4f} seconds")

In [None]:
import torch

# Inspect one saved token file and report whether it carries an attention mask
file_path = "input_6656faee44160735d73fa8df_3.pt"
loaded_data = torch.load(file_path)

if 'attention_mask' not in loaded_data:
    print("Attention mask is not present in the loaded data.")
else:
    print("Attention mask is present in the loaded data.")
    attention_mask = loaded_data['attention_mask']
    print("Attention mask shape:", attention_mask.shape)

In [None]:
import os
import dotenv
import torch
from transformers import AutoTokenizer 


# Tokenize all experiences as one padded batch and save input IDs plus attention mask per experience.

# Load environment variables
dotenv.load_dotenv(dotenv.find_dotenv())

# Set the output path
output_path = os.getenv("OUTPUT_PATH")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-128k-instruct')

# Read the system prompt
with open('prompt.txt', 'r') as file:
    sys_prompt = file.read()

# Open and load the JSON file
with open(os.getenv("INPUT_FILE_PATH"), 'r', encoding='utf-8') as file:
    data = json.load(file)

# Initialize counter
profile_counter = 0

# Collect all the formatted prompts: one entry per experience, in profile order
formatted_prompts = []

# Iterate over each profile in the data
for profile in data:
    for experience in profile["experiences"]:
        preprocessed_experience = preprocess_object(experience, keys_to_extract=['company', 'title', 'description', 'location'])
        formatted_experience_prompt = format_prompt(preprocessed_experience, system_prompt=sys_prompt, model_tokenizer=tokenizer)
        formatted_prompts.append(formatted_experience_prompt)

# Encode the prompts with padding
encoded_prompts = tokenizer(formatted_prompts, padding=True, truncation=True, return_tensors='pt')

# Row of the encoded batch that belongs to the current experience. The batch has one row per
# experience (not per profile), so it needs its own index that advances with every experience.
prompt_index = 0

# Iterate over each profile in the data again, in the same order as above
for profile in data:
    experience_list_counter = 1
    
    for experience in profile["experiences"]:
        # clone() detaches the row from the batch tensor; saving a plain row view would
        # serialize the storage of the whole batch into every single file.
        encoded_tokens = encoded_prompts['input_ids'][prompt_index].clone()
        attention_mask = encoded_prompts['attention_mask'][prompt_index].clone()
        
        encoded_experience = {
            'input_ids': encoded_tokens,
            'attention_mask': attention_mask
        }
        
        # One file per experience, named after the profile id and the experience position
        torch.save(encoded_experience, f'{output_path}/input_{profile["_id"]["$oid"]}_{experience_list_counter}.pt')
        
        experience_list_counter += 1
        prompt_index += 1
    
    # Progress output: number of profiles handled so far
    profile_counter += 1
    print(profile_counter)

In [None]:
import os
import dotenv
import torch
from transformers import AutoTokenizer 


# Clean, format and tokenize every experience with the helper functions and save one file per experience.

# Load environment variables
dotenv.load_dotenv(dotenv.find_dotenv())

# Set the output path
output_path = os.getenv("OUTPUT_PATH")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-128k-instruct')

# Read the system prompt
with open('prompt.txt', 'r') as file:
    # Read the entire content of the file
    sys_prompt = file.read()

# Open and load the JSON file
with open(os.getenv("INPUT_FILE_PATH"), 'r', encoding='utf-8') as file:
    data = json.load(file)
    
# Initialize counter
profile_counter = 0

# Iterate over each profile in the data
for profile in data:
    # Initialize a counter for every profile (1-based, used in the file name)
    experience_list_counter = 1
    
    # Iterate over each experience in the profile
    for experience in profile["experiences"]:
        # Preprocess the experience
        preprocessed_experience = preprocess_object(
            experience, 
            keys_to_extract=['company', 'title', 'description', 'location']
            )
        
        # Format the experience as a prompt
        formatted_experience_prompt = format_prompt(
            preprocessed_experience,
            system_prompt=sys_prompt,
            model_tokenizer=tokenizer
            )

        # Encode the prompt (the same fixed seed, 1, is passed for every experience)
        encoded_experience = encode_prompt(
            formatted_experience_prompt,
            tokenizer=tokenizer,
            seed=1
            )
        
        # Serialize each experience individually and save it to a file
        # The saved object is whatever encode_prompt returned, i.e. the tokenizer's output
        torch.save(encoded_experience, f'{output_path}/input_{profile["_id"]["$oid"]}_{experience_list_counter}.pt')
        
        # Up the counter
        experience_list_counter += 1
    
    # Up and print profile counter
    profile_counter += 1
    print(profile_counter)




# Tokenize and pad
# NOTE(review): `texts` and `device` are not assigned anywhere in this cell; the same statements
# appear again in a later cell that defines both - presumably this is a leftover copy. Confirm
# before running this cell on its own.
encoding = tokenizer(texts, 
                    return_tensors="pt",
                    padding=True, 
                    truncation=True, 
                    max_length=128, 
                    add_special_tokens=True
                    )

# Move to device
inputs = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)

In [None]:
from transformers import AutoTokenizer


# Load the Phi-3 tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-128k-instruct')

# Encode a sample sentence; return_tensors="pt" requests PyTorch tensors
tokens = tokenizer.encode("How are you today? Can you say, fine?", return_tensors="pt", add_special_tokens=True)

# Inspect the resulting token IDs
print(tokens)

In [None]:
import torch
from transformers import AutoModelForCausalLM


# Fix the torch RNG seed and pick the GPU when one is available
torch.manual_seed(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model directly onto the selected device
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map=device,
    torch_dtype="auto",  # the comma after this argument was missing, which made the cell a SyntaxError
    trust_remote_code=True,
)

In [None]:
# Generate a continuation for `tokens` without sampling (do_sample=False), at most 256 new tokens.
# NOTE(review): `model`, `tokens`, `device` and `tokenizer` are not defined in this cell -
# presumably they come from previously executed cells; confirm the execution order.
result = model.generate(
    tokens.to(device), 
    max_new_tokens=256,
    do_sample=False
    )

# Raw output token IDs
print(result)

# Decode the first returned sequence back to text, leaving out special tokens
output = tokenizer.decode(result[0], skip_special_tokens= True)

print(output)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# Text-generation pipeline demo with the Phi-3 mini 4k instruct model.

# Fix the torch RNG seed
torch.random.manual_seed(0)

# Load model and tokenizer (the model is placed on the GPU)
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True, 
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Conversation passed to the pipeline: a system instruction followed by the user request
messages = [
    {"role": "system", "content": "Your are a python developer."},
    {"role": "user", "content": "Help me generate a bubble algorithm"},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Decoding without sampling. No temperature is passed: temperature only applies when
# do_sample=True, so combining it with do_sample=False is contradictory and has no effect
# on the generated text.
generation_args = {
    "max_new_tokens": 600,
    "return_full_text": False,
    "do_sample": False,
}

# Run the pipeline and print only the newly generated text (return_full_text=False)
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])



In [None]:





# Text-generation pipeline demo with a multi-turn conversation.
# NOTE(review): this cell has no imports and does not load a model - `AutoTokenizer`, `pipeline`
# and `model` presumably come from previously executed cells; confirm the execution order.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# Conversation with one completed exchange followed by a new user question
messages = [
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
) # trust_remote_code=False

# Decoding without sampling. No temperature is passed: temperature only applies when
# do_sample=True, so combining it with do_sample=False is contradictory and has no effect
# on the generated text.
generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "do_sample": False,
}

# Run the pipeline and print only the newly generated text (return_full_text=False)
output = pipe(messages, **generation_args)
print(output[0]['generated_text'])



# Sample experiences used for the prompt formatting check below:
# the first has a multi-line description with bullet points, the second has no description (None).
test_experiences = [{
      "company": "University of Engineering and Technology Peshawar, Pakistan",
      "title": "R&D Consultant",
      "description": "UET, Peshawar has undertaken various projects that required the expertise of a R&D based Biomedical Engineer. I provided my expertise to the students and staff in these projects and increased the productivity of the department. \n\nAdditionally, I\n• Identified the need for 3D printers for the department\n• Investigated 3D printers that would be best suited for the training individuals\n• Designed a certified course to train the students and staff on the essentials of 3D printing\n• Trained individuals including but not limited to, students, professionals and artists",
      "location": "Pakistan"
    },
    {
      "company": "Google",
      "title": "Software Engineering Intern",
      "description": None,
      "location": "Venice, California"
    }
]


# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-128k-instruct')

# The triple-quoted block below is a bare string literal, so the loop inside it is not executed
"""
for experience in test_experiences:
    print("Before processing:")
    print(experience)
    print("\nAfter processing:")
    print(process_experience(experience))
"""

# Read the system prompt
with open('prompt.txt', 'r') as file:
    # Read the entire content of the file
    sys_prompt = file.read()

# Format the first sample experience as a prompt string
# (the raw dictionary is passed as-is, without the whitespace cleaning step)
test = format_prompt(test_experiences[0], sys_prompt, tokenizer)

print(test)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# Tokenize the formatted prompt `test` and let the model generate a continuation.

# Fix the torch RNG seed
torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True, 
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

# encode() returns the token IDs as a tensor when return_tensors="pt" is given;
# tokenize() only splits the text into string tokens, which generate() cannot consume.
# NOTE(review): `test` was produced by the chat template - check whether add_special_tokens=True
# duplicates a BOS token the template may already have inserted.
tokens = tokenizer.encode(test, 
                          return_tensors="pt", 
                          add_special_tokens=True)

print(tokens)

# The input must live on the same device as the model. max_new_tokens bounds the continuation;
# 256 matches the budget used by the other direct generate call in this notebook.
output = model.generate(tokens.to(model.device), max_new_tokens=256)

# generate() returns token IDs (not pipeline-style dictionaries), so decode the first sequence
print(tokenizer.decode(output[0], skip_special_tokens=True))

In [None]:
import torch
import random
from transformers import AutoModelForCausalLM, AutoTokenizer


def format_prompt(input_dict: dict, system_prompt: str, model_tokenizer: AutoTokenizer) -> str:
    """
    Function to render the system prompt and the input dictionary as one chat-formatted prompt string.
    The system prompt is sent with the "system" role and the input dictionary, serialized as
    single-line JSON, with the "user" role. The generation prompt is appended and the result
    is not tokenized.

    :param input_dict: The input dictionary to format.
    :param system_prompt: The system prompt to use for the model input.
    :param model_tokenizer: The tokenizer whose chat template is applied.

    :return: The formatted prompt message as a single string.
    """
    # Assemble the two messages of the conversation
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": json.dumps(input_dict, indent=None)}

    # Render the conversation with the tokenizer's chat template
    return model_tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('microsoft/Phi-3-mini-128k-instruct')
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct", 
    device_map=device, 
    torch_dtype="auto",
    trust_remote_code=True
)

# Example texts
texts = ["How are you today?", "Can you say, fine?", "Hello there! How's everything?"]

# Tokenize and pad
# The three texts are encoded as one batch, truncated to at most 128 tokens each
# NOTE(review): which side the padding is added on depends on the tokenizer's configuration -
# confirm it suits batched generation with this model.
encoding = tokenizer(texts, 
                    return_tensors="pt",
                    padding=True, 
                    truncation=True, 
                    max_length=128, 
                    add_special_tokens=True
                    )

print(encoding)

# Move to device
inputs = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)

In [None]:
# Generate responses
# Only max_new_tokens is set: passing max_length as well is contradictory, because both limit the
# output length and max_new_tokens takes precedence when the two are given together.
outputs = model.generate(
    inputs, 
    attention_mask=attention_mask,
    max_new_tokens=50,  # Adjust based on your needs
    do_sample=False  # Or True if you want varied responses
)

# Decode the outputs (one generated sequence per input text)
decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Print each response
for response in decoded_outputs:
    print(response)