In [4]:
# Example work-experience entry in the shape of a LinkedIn profile record.
# It is used further down as sample input for openai_tag() and llama_tag().
# NOTE(review): the description ends mid-phrase ("Electronic Circuit"); it looks
# truncated at the source - confirm whether that is intentional.
experience = {
    "company": "Old Dominion University",
    "company_linkedin_profile_url": "https://www.linkedin.com/company/old-dominion-university",
    "title": "Teaching Assistant",
    "description": "Key responsibilities: \n\n1. Assisting students in executing experimental laboratory works for Electronic Circuit Analysis. \n2. Administering exams, grading scripts and conducting problem solving sessions for Circuit Analysis I, Linear System Analysis, Electronic Circuit",
    "location": "Norfolk, VA, USA.",
    "logo_url": "https://media-exp1.licdn.com/dms/image/C4E0BAQG5Y7rGx9ZScQ/company-logo_100_100/0/1532523823768?e=1655942400&v=beta&t=sx3tGOZ4ApDAt8nOhp6purIOxuUNRyD1MP31VF-B0rA"
  }

# Example education entry in the shape of a LinkedIn profile record.
# It uses the same logo_url value as the experience entry above and is also
# used as sample input for openai_tag() and llama_tag().
education = {
    "field_of_study": "Electrical and Computer Engineering",
    "degree_name": "Doctor of Philosophy - PhD",
    "school": "Old Dominion University",
    "school_linkedin_profile_url": "https://www.linkedin.com/school/old-dominion-university/",
    "description": "PhD Dissertation topic:\nInterface engineering of highly efficient organic-inorganic hybrid perovskite solar cells.",
    "logo_url": "https://media-exp1.licdn.com/dms/image/C4E0BAQG5Y7rGx9ZScQ/company-logo_100_100/0/1532523823768?e=1655942400&v=beta&t=sx3tGOZ4ApDAt8nOhp6purIOxuUNRyD1MP31VF-B0rA"
  }

## GPT
In addition to open-source models, I suggest trying commercial models as well.
Although we might also want to consider Anthropic's models, we may have to limit this test to OpenAI due to time constraints.

Create an account and save your API key to the OPENAI_API_KEY environment variable.
https://platform.openai.com/

In [None]:
import json


def openai_tag(dictionary_to_tag: dict, openai_client, model: str = "gpt-3.5-turbo") -> dict:
    """
    Send a dictionary to an OpenAI chat model and return the model's JSON reply as a dict.

    The dictionary is serialized to JSON and sent as the user message. The system
    prompt asks the model to generate a complete example JSON with realistic values,
    and the request asks for a JSON object as the response format.

    :param dictionary_to_tag: The dictionary to send to the model.
    :param openai_client: Client object exposing ``chat.completions.create``
        (for example an ``openai.OpenAI`` instance).
    :param model: Name of the chat model to use. Defaults to "gpt-3.5-turbo",
        the value that was previously hard-coded.
    :return: The model's reply parsed into a dictionary.
    :raises json.JSONDecodeError: If the reply is not valid JSON.
    """
    # The system prompt promises the model a JSON structure, so send real JSON.
    # str(dict) would produce a Python repr (single quotes, None/True/False).
    # default=str keeps values that are not JSON-serializable from raising.
    user_content = json.dumps(dictionary_to_tag, ensure_ascii=False, default=str)

    # Set prompt
    messages = [{
        "role": "system",
        "content": "You are a helpful assistant that specializes in generating test data in JSON format. The user will provide you with a JSON structure and you generate a complete example JSON with realistic sounding values."}, {
        "role": "user",
        "content": user_content
        }]

    # Generate a completion
    completion = openai_client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=messages
    )

    # Get the generated text
    result_string = completion.choices[0].message.content

    # Convert the generated text into a dictionary and return it
    return json.loads(result_string)

In [None]:
import os
import dotenv
from openai import OpenAI


# Load the environment variables
dotenv.load_dotenv()

# Fail early with a clear message when the key is missing. The former
# self-assignment os.environ[...] = os.getenv(...) changed nothing when the
# key was set and raised an opaque TypeError when it was not.
if not os.getenv('OPENAI_API_KEY'):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your environment or to a .env file."
    )

# Create an instance of the OpenAI class (the client picks up OPENAI_API_KEY
# from the environment when no key is passed explicitly)
openai_api = OpenAI()

# Test tagging the education dictionary
result = openai_tag(education, openai_api)
print(result)

# Test tagging the experience dictionary
result = openai_tag(experience, openai_api)
print(result)

## CUDA support
Install the following software to run the code with CUDA support:

1. C++: https://visualstudio.microsoft.com/downloads/
2. CUDA toolkit: https://developer.nvidia.com/cuda-downloads?target_os=Windows&target_arch=x86_64&target_version=11&target_type=exe_local
3. Torch CUDA: https://pytorch.org/get-started/locally/

In [2]:
import torch


# Check if CUDA is available: prints the boolean returned by
# torch.cuda.is_available()
print(torch.cuda.is_available())

True


In [3]:
import gc
from transformers import AutoModelForCausalLM


def check_memory_usage(model_path: str, bits: int = 32) -> float:
    """
    Estimate how much memory a model's parameters occupy at a given precision.

    The model is loaded once to count its parameters and is released again
    before the function returns.

    Parameters:
    - model_path (str): Path or identifier handed to AutoModelForCausalLM.from_pretrained.
    - bits (int): Number of bits used to store one parameter (for example 16, 32 or 64).

    Returns:
    - float: Estimated size of the parameters in megabytes (1 MB = 1024 ** 2 bytes).
    """
    bytes_per_mb = 1024 ** 2

    # Instantiate the model only to read its parameter count
    probe = AutoModelForCausalLM.from_pretrained(model_path)
    param_count = probe.num_parameters()

    # Release the weights as soon as the count is known
    del probe
    gc.collect()

    # parameters * bytes per parameter, expressed in MB
    return param_count * (bits / 8) / bytes_per_mb

## Mistral
Let's try out the TextBase-7B-v0.1 model from the SF-Foundation.
It is an instruction-tuned model based on Mistral and was selected because
it scored highest on the Hugging Face leaderboard, with a HellaSwag score of 90.

Clone the repo and save its path to the TEXT_BASE_PATH environment variable.
https://huggingface.co/SF-Foundation/TextBase-7B-v0.1

In [5]:
import os
import dotenv
from transformers import AutoModelForCausalLM


# Load the environment variables
dotenv.load_dotenv()

# Estimate the size of the model stored at TEXT_BASE_PATH, assuming 16 bits
# per parameter. check_memory_usage() loads the model to count its parameters.
size_in_mb = check_memory_usage(model_path=os.getenv("TEXT_BASE_PATH"), bits=16)

# Report the estimate in MB with two decimals
print(f"Model size: {size_in_mb:.2f} MB")

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Model size: 13812.51 MB


In [None]:
import os
import dotenv
import transformers


# Load the environment variables
dotenv.load_dotenv()

# Load the model
model = transformers.AutoModelForCausalLM.from_pretrained(os.getenv("TEXT_BASE_PATH"))

# Print the model size. Transformers models expose get_memory_footprint()
# (there is no get_memory_usage()); it reports bytes, so convert to MB.
print(f"Model size: {model.get_memory_footprint() / (1024 ** 2):.2f} MB")

In [None]:
from transformers import pipeline


# Load the tokenizer and model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("SF-Foundation/TextBase-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained("SF-Foundation/TextBase-7B-v0.1")

# Build the pipeline from the objects loaded above. Passing the model name
# here instead would load a second full copy of the weights into memory.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM


def generate_output(input_text, model_path: "str | None" = None):
    """
    Generate a sampled continuation of ``input_text`` with a causal language model.

    The tokenizer and model are loaded on every call.

    :param input_text: Prompt text to continue.
    :param model_path: Path or identifier of the model to load. When omitted,
        the TEXT_BASE_PATH environment variable is used if set; otherwise the
        previously hard-coded "/TextBase-7B-v0.1" is used.
    :return: The decoded output sequence (special tokens skipped).
    """
    import os  # local import: the cell defining this function does not import os

    if model_path is None:
        model_path = os.getenv("TEXT_BASE_PATH", "/TextBase-7B-v0.1")

    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate the output
    output = model.generate(
        input_ids,
        max_length=100,  # Upper bound on the output length passed to generate()
        num_return_sequences=1,  # Number of output sequences to generate
        do_sample=True,  # Required for temperature to take effect; without it decoding is greedy
        temperature=0.7,  # Controls the randomness of the sampled output
    )

    # Decode the generated output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

In [None]:
# Run a single generation on a placeholder prompt and print the result
input_text = "Here is an example LinkedIn profile entry:\n..."
output = generate_output(input_text)
print("Generated output:")
print(output)

In [None]:
import os
import torch
import dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the environment variables
dotenv.load_dotenv()
# MODEL_PATH takes precedence when it is set. Otherwise fall back to
# TEXT_BASE_PATH, the variable this section's instructions tell the reader
# to define, so the cell also works with the documented setup.
model_path = os.getenv("MODEL_PATH") or os.getenv("TEXT_BASE_PATH")

def generate_output(input_text):
    """
    Generate a sampled continuation of ``input_text`` on the GPU when one is available.

    The tokenizer and model are loaded from the module-level ``model_path`` on
    every call and the model is moved to CUDA if available, otherwise the CPU.

    :param input_text: Prompt text to continue.
    :return: The decoded output sequence (special tokens skipped).
    """
    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # Move the model to the GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate the output
    output = model.generate(
        input_ids,
        max_length=300,
        do_sample=True,  # Required for temperature to take effect; without it decoding is greedy
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,  # Same padding choice as generate_output_stream
    )

    # Decode the generated output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return generated_text

def generate_output_stream(input_text):
    """
    Generate a sampled continuation of ``input_text`` and yield it piece by piece.

    ``model.generate`` returns only after generation has finished, so iterating
    over its return value cannot stream tokens. Generation therefore runs in a
    background thread that feeds a ``TextIteratorStreamer``, and this generator
    yields the decoded text chunks as they arrive.

    The tokenizer and model are loaded from the module-level ``model_path`` on
    every call.

    :param input_text: Prompt text to continue.
    :return: A generator of decoded text chunks. The prompt itself is skipped;
        concatenating all chunks gives the generated continuation.
    :raises Exception: Any exception raised by generation in the background
        thread is re-raised here once the stream ends.
    """
    from threading import Thread
    from transformers import TextIteratorStreamer

    # Load the tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # Move the model to the GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Queue-backed streamer: generate() pushes decoded text, this generator pulls it
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs=input_ids,
        streamer=streamer,
        max_length=300,
        temperature=0.7,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        num_beams=1,
    )

    errors = []

    def _run_generation():
        # Run generate(); on failure record the error and close the stream so
        # the consuming loop below does not block forever.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            errors.append(exc)
            streamer.end()

    worker = Thread(target=_run_generation, daemon=True)
    worker.start()

    # Yield each decoded chunk as soon as the generation thread produces it
    for text_chunk in streamer:
        yield text_chunk

    worker.join()
    if errors:
        raise errors[0]

In [None]:
# Smoke test: run one generation on a short prompt and print the result
input_text = "This is a test, if the model is working correctly."
output = generate_output(input_text)
print("Generated output:")
print(output)

In [None]:
# NOTE(review): this loops over the return value of generate_output(), which
# returns the finished text as a single string, so the loop prints it one
# character at a time after generation has completed. generate_output_stream()
# looks like the intended call here - confirm before changing.
input_text = "Here is an example LinkedIn profile entry:\n..."
for token in generate_output(input_text):
    print(token, end="", flush=True)
    # Process the partial output if needed

## Llama 3 
I just wanted to try the new Llama 3 (using the 8B instruct variant).
To use it, you have to create a Meta account to request access.

Clone the repo and save its path to the LLAMA_3_PATH environment variable.
https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct

In [4]:
import os
import dotenv
from transformers import AutoModelForCausalLM


# Load the environment variables
dotenv.load_dotenv()

# Estimate the size of the model stored at LLAMA_3_PATH, assuming 16 bits
# per parameter. check_memory_usage() loads the model to count its parameters.
size_in_mb = check_memory_usage(model_path=os.getenv("LLAMA_3_PATH"), bits=16)

# Report the estimate in MB with two decimals
print(f"Model size: {size_in_mb:.2f} MB")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model size: 15316.51 MB


In [6]:
import json
import transformers


def llama_tag(dictionary_to_tag: dict, llm_pipeline: "transformers.Pipeline") -> dict:
    """
    Send a dictionary to a chat-capable text-generation pipeline and return the
    model's JSON reply as a dict.

    The dictionary is serialized to JSON and sent as the user message; the system
    prompt asks the model to generate a complete example JSON with realistic values.

    :param dictionary_to_tag: The dictionary to tag.
    :param llm_pipeline: The LLM pipeline to use for tagging. It is called with a
        list of chat messages plus sampling arguments.
    :return: The tagged dictionary (If an error occurs, the message is returned under the key error in the dict).
    """
    try:
        # Set prompt. The system prompt promises a JSON structure, so send real
        # JSON rather than the Python repr produced by str(dict); default=str
        # keeps values that are not JSON-serializable from raising.
        messages = [
            {"role": "system", "content": "You are a helpful assistant that specializes in generating test data in JSON format. The user will provide you with a JSON structure and you generate a complete example JSON with realistic sounding values."},
            {"role": "user", "content": json.dumps(dictionary_to_tag, ensure_ascii=False, default=str)}]

        # NOTE(review): the original built a list of terminator ids
        # (eos_token_id and "<|eot_id|>") but left eos_token_id=... commented
        # out, so it was never used. Re-introduce it if generation runs past
        # the end of the assistant turn.

        # Generate a completion
        outputs = llm_pipeline(
            messages,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
        )

        # Get the generated text. For chat input "generated_text" is the list
        # of messages with the assistant reply last; that reply is a
        # {"role", "content"} dict, so the text lives under "content".
        generated = outputs[0]["generated_text"]
        reply = generated[-1] if isinstance(generated, list) else generated
        result_string = reply["content"] if isinstance(reply, dict) else reply

        # Convert the generated text into a dictionary and return it
        return json.loads(result_string)

    # Handle exceptions. The former "transformers.errors.ModelOutputError"
    # clause referenced a module that does not exist, so evaluating the handler
    # itself raised AttributeError and hid the real error.
    except json.JSONDecodeError:
        return {"error": "Failed to decode JSON"}
    except Exception as e:
        return {"error": str(e)}

In [None]:
import os
import dotenv
import transformers
import torch


# Load the environment variables
dotenv.load_dotenv()

# Set up the Llama model as a text-generation pipeline. The model is read from
# the path in LLAMA_3_PATH and its weights are requested in bfloat16.
# NOTE(review): device_map="auto" presumably relies on the accelerate package
# being installed - confirm in the environment setup.
llama_pipeline = transformers.pipeline(
    "text-generation",
    model=os.getenv("LLAMA_3_PATH"),
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto"
    )

# Test tagging the education dictionary
result = llama_tag(education, llama_pipeline)
print(result)

# Test tagging the experience dictionary
result = llama_tag(experience, llama_pipeline)
print(result)

In [None]:
import os
import torch
import transformers
import dotenv


# Load the environment variables
dotenv.load_dotenv()
model_id = os.getenv("MODEL_PATH")

# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model and tokenizer. Weights are requested in bfloat16, the same
# dtype the Llama pipeline setup in this notebook uses; the default float32
# would need twice the memory.
model = transformers.AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Move the model to the GPU
model.to(device)

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Build the prompt with the tokenizer's chat template. Encoding each message
# separately and concatenating the ids drops the role markers and turn
# delimiters an instruct model expects.
prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(device)
input_ids = prompt["input_ids"]

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Print the completion to stdout while it is being generated (prompt skipped)
streamer = transformers.TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Generate the output
output_ids = model.generate(
    **prompt,
    max_new_tokens=256,  # same budget as the former max_length = prompt length + 256
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    return_dict_in_generate=True,
    num_beams=1,
    streamer=streamer,
)

# Print the final generated text (completion only, without the prompt tokens)
generated_text = tokenizer.decode(
    output_ids.sequences[0, input_ids.shape[1]:],
    skip_special_tokens=True,
)
print("\n" + generated_text)

In [None]:
# Transformers models expose get_memory_footprint() (there is no
# get_memory_usage()); it reports bytes, so convert to MB before printing.
print(f"Model size: {model.get_memory_footprint() / (1024 ** 2):.2f} MB")