In [8]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer_qwen = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model_qwen = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", temperature = 0.1, top_p = 0.6, do_sample=True)

In [4]:
import yaml

# Load the YAML data
with open("./me.yaml", "r") as file:
    data = yaml.safe_load(file)

# Process the education data
education = data["education"]
education_text = "\n".join(
    [f"- Degree: {e['degree']}, Institution: {e['institution']}, Graduation Date: {e['graduation_date']}" for e in education]
)

# Process the work experience data
work = data["work_experience"]
work_text = "\n".join(
    [
        f"- Role: {w['title']}, Company: {w['company']}, Start Date: {w['start_date']}, End Date: {w.get('end_date', 'Present')}, Work Description: {w['achievements']}"
        for w in work
    ]
)

# Process the skills data
skills = data["skills"]
skills_text = (
    f"Programming: {skills['programming_languages']}, Tools: {skills['tools']}, Languages: {skills['languages']}"
)


In [5]:
system_prompt = '''
You are an AI assistant of Henry Liang.
You are designed to assist users with questions about Henry Liang, his work, and his experience.

**Tone and Scope:**
Answer all questions cheerfully, but do not provide more information than what is explicitly asked.

**Uncertainty:**
If you do not know the answer, state clearly, "I do not know." Avoid guessing or fabricating information.

**Restrictions:**
Do not answer questions that are inappropriate, harmful, racist, or illegal.
Avoid using inappropriate language under any circumstance.
Do not provide medical, legal, or financial advice.
Do not share any information that can identify or locate a person.
Follow these guidelines strictly, and always prioritize clarity, accuracy, and adherence to the scope of your role.
'''

def get_user_prompt(education_text, work_text, skills_text, q):
    # Combine the information into the final content string
    content = f"""
    The following is data about Henry Liang:

    **Education:**
    {education_text}

    **Work Experience:**
    {work_text}

    **Skills:**
    {skills_text}

    If it is a question about Henry Liang, use the above information to answer the following question about Henry Liang.

    {q}
    """

    return content


In [6]:
qa_dataset = [
    # Education-related questions
    {
        "question": "Where did Henry complete his master's degree?",
        "answer": "Henry completed his M.S. in Machine Learning and Data Science at Northwestern University, graduating in December 2023."
    },
    {
        "question": "What was his undergraduate degree in?",
        "answer": "His undergraduate degree was a B.S. in Applied Mathematics and Statistics from the University of California, Los Angeles, which he completed in March 2022."
    },
    {
        "question": "Which institutions did Henry attend for his higher education?",
        "answer": "Henry attended Northwestern University for his master's degree and the University of California, Los Angeles, for his bachelor's degree."
    },

    # Work experience-related questions
    {
        "question": "What is Henry's current job title?",
        "answer": "Henry's current job title is Data Scientist II at Vail Systems, Inc."
    },
    {
        "question": "What major achievement did Henry accomplish at Vail Systems?",
        "answer": "Henry created a novel hybrid embedding that reduced the no-context rate for RFP completion by 83.33%, leading to a paper acceptance and invited talk at the AI-ML Systems Conference in October 2024."
    },
    {
        "question": "What framework did Henry use to build the FAQ chatbot at Vail Systems?",
        "answer": "He used a Retrieval-Augmented Generation (RAG) framework to build the FAQ chatbot."
    },
    {
        "question": "What was the improvement in FAQ system latency that Henry achieved at Vail Systems?",
        "answer": "Henry reduced FAQ system latency by 65% through data curation, embedding optimization, and HNSW indexing with PostgreSQL."
    },
    {
        "question": "What did Henry work on during his internship at Amazon?",
        "answer": "During his Amazon internship, Henry developed a custom LLM app to explain knowledge graph differences and detect network routing anomalies, achieving 99%\ accuracy in summarizing path-related attributes with KGAG."
    },
    {
        "question": "Which company did Henry intern at before Amazon?",
        "answer": "Before Amazon, Henry interned at Roblox as a Data Science Intern."
    },
    {
        "question": "What early warning system did Henry develop at Roblox?",
        "answer": "Henry developed a 94%\ accurate early warning system using an XGBoost model to forecast significant impression shifts."
    },
    {
        "question": "What real-time forecasting system did Henry design at Northwestern University?",
        "answer": "Henry designed a scalable AWS IoT streaming pipeline for real-time time-series forecasting, achieving a throughput of 55 predictions per second."
    },

    # Skills and tools-related questions
    {
        "question": "Which programming languages is Henry proficient in?",
        "answer": "Henry is proficient in Python, R, PostgreSQL, Java, and JavaScript."
    },
    {
        "question": "What big data tools does Henry have experience with?",
        "answer": "Henry has experience with Apache Spark and Apache Hadoop."
    },
    {
        "question": "What containerization tools does Henry use?",
        "answer": "Henry uses Docker and Kubernetes for containerization."
    },
    {
        "question": "What languages can Henry speak?",
        "answer": "Henry speaks English, Mandarin, and French."
    },
    {
        "question": "What tools has Henry used for machine learning?",
        "answer": "Henry has used TensorFlow, PyTorch, LangChain, and Hugging Face for machine learning."
    },
    {
        "question": "What cloud service did Henry use in his real-time forecasting project?",
        "answer": "Henry used AWS IoT pipelines and AWS EC2 for his real-time forecasting project."
    },
]


In [7]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-English")
model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-English")

nlp = pipeline("ner", model=model, tokenizer=tokenizer)
example = "My name is Wolfgang and I live in Berlin"

ner_results = nlp(example)
print(ner_results)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-English were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'entity': 'I-PER', 'score': 0.99998474, 'index': 4, 'word': '▁Wolfgang', 'start': 11, 'end': 19}, {'entity': 'I-LOC', 'score': 0.9999943, 'index': 9, 'word': '▁Berlin', 'start': 34, 'end': 40}]


In [2]:
# import spacy

# nlp = spacy.load("en_core_web_lg")
# # prompt = '''He has used the following tools for big data processing:
# # - TensorFlow
# # - PyTorch
# # - LangChain
# # - Hugging Face
# # - Git
# # - Docker
# # - Kubernetes
# # - Apache Spark
# # - Apache Hadoop'''
# # hypothesis = "You graduated in 2023 with a master's from ABC University, and currently doing a PhD at XYZ University."

# # # Prompt and hypothesis
# prompt = "I graduated with a master's in December 2023 from XYZ University."
# hypothesis = "You graduated in 2023 with a master's from ABC University, and currently doing a PhD at XYZ University."

# # Extract entities
# def extract_entities(text):
#     doc = nlp(text)
#     return [(ent.text, ent.label_) for ent in doc.ents]

# def extract_dates(entities):
#     new_entities = []
#     for ent, label in entities:
#         if label == "DATE":
#             splits = ent.split(" ")
#             for split in splits:
#                 new_entities.append((split, label))
#         else:
#             new_entities.append((ent, label))
#     return new_entities

# prompt_entities = extract_dates(extract_entities(prompt))
# hypothesis_entities = extract_dates(extract_entities(hypothesis))

# # Compare entities
# hallucinated = [ent for ent in hypothesis_entities if ent not in prompt_entities]
# non_hallucinated = [ent for ent in hypothesis_entities if ent in prompt_entities]


# print("Prompt Entities:", prompt_entities)
# print("Hypothesis Entities:", hypothesis_entities)
# print("Hallucinated Entities:", hallucinated)
# print("Non-Hallucinated Entities:", non_hallucinated)

Prompt Entities: [('December', 'DATE'), ('2023', 'DATE'), ('XYZ University', 'ORG')]
Hypothesis Entities: [('2023', 'DATE'), ('ABC University', 'ORG'), ('PhD', 'WORK_OF_ART'), ('XYZ University', 'ORG')]
Hallucinated Entities: [('ABC University', 'ORG'), ('PhD', 'WORK_OF_ART')]
Non-Hallucinated Entities: [('2023', 'DATE'), ('XYZ University', 'ORG')]


In [9]:
import spacy

nlp = spacy.load("en_core_web_lg")

# # Prompt and hypothesis
prompt = "I graduated with a master's in December 2023 from XYZ University."
hypothesis = "You graduated in 2023 with a master's from ABC University, and currently doing a PhD at XYZ University."

nlp_roberta = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='simple')

def extract_entities_roberta(text):
    doc = nlp_roberta(text)
    return [(ent['word'], ent['entity_group']) for ent in doc]

# Extract entities
def extract_entities(text):
    doc = nlp(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

def extract_dates(entities):
    new_entities = []
    for ent, label in entities:
        if label == "DATE":
            splits = ent.split(" ")
            for split in splits:
                new_entities.append((split, label))
        else:
            new_entities.append((ent, label))
    return new_entities

def get_all_entities(prompt, hypothesis):
    prompt_entities = extract_dates(extract_entities(prompt))
    hypothesis_entities = extract_dates(extract_entities(hypothesis))

    prompt_entities_roberta = extract_dates(extract_entities_roberta(prompt))
    hypothesis_entities_roberta = extract_dates(extract_entities_roberta(hypothesis))

    prompt_entities = [t[0] for t in prompt_entities]
    prompt_entities_roberta = [t[0] for t in prompt_entities_roberta]

    hypothesis_entities = [t[0] for t in hypothesis_entities]
    hypothesis_entities_roberta = [t[0] for t in hypothesis_entities_roberta]

    combined_prompt_entities = list(set(prompt_entities_roberta + prompt_entities))
    combined_hypothesis_entities = list(set(hypothesis_entities_roberta + hypothesis_entities))
    # Compare entities
    hallucinated = [ent for ent in combined_hypothesis_entities if ent not in combined_prompt_entities]
    non_hallucinated = [ent for ent in combined_hypothesis_entities if ent in combined_prompt_entities]

    return combined_prompt_entities, combined_hypothesis_entities, hallucinated, non_hallucinated

all_entities = get_all_entities(prompt, hypothesis)

print("Prompt Entities:", all_entities[0])
print("Hypothesis Entities:", all_entities[1])
print("Hallucinated Entities:", all_entities[2])
print("Non-Hallucinated Entities:", all_entities[3])

Device set to use cuda:0


Prompt Entities: ['December', '2023', 'XYZ University']
Hypothesis Entities: ['PhD', 'ABC University', '2023', 'XYZ University']
Hallucinated Entities: ['PhD', 'ABC University']
Non-Hallucinated Entities: ['2023', 'XYZ University']


In [12]:
import torch
from sklearn.metrics import accuracy_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import spacy

# nlp = spacy.load("en_core_web_sm")
# def extract_entities(text):
#     doc = nlp(text)
#     return [(ent.text, ent.label_) for ent in doc.ents]


# Initialize metrics
results = []

for item in qa_dataset:
    q = item["question"]
    expected_answer = item["answer"]

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_user_prompt(education_text, work_text, skills_text, q)}
    ]
    text = tokenizer_qwen.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer_qwen([text], return_tensors="pt").to(model_qwen.device)

    generated_ids = model_qwen.generate(
        **model_inputs,
        max_new_tokens=512
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer_qwen.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
    # Log generated response
    print(f"Question: {q}")
    print(f"Generated Answer: {response}")
    print(f"Expected Answer: {expected_answer}")
    print()

    prompt_entities, hypothesis_entities, hallucinated_entities, non_hallucinated_entities = get_all_entities(prompt=expected_answer, hypothesis=response) 

    if len(hypothesis_entities) == 0:
        ratio = 0
    else:
        ratio = len(hallucinated_entities) / len(hypothesis_entities)
    # Append to results
    results.append({
        "question": q,
        "expected_answer": expected_answer,
        "generated_answer": response,
        "prompt entities": prompt_entities,
        "hypothesis_entities": hypothesis_entities,
        "hallucinated_entities": hallucinated_entities,
        "non_hallucinated_entities": non_hallucinated_entities,
        "hallucinated_entities_in_response": ratio
    })

# Calculate average BLEU score
average_hallucination_rate = sum([result["hallucinated_entities_in_response"] for result in results]) / len(results)

# Log evaluation metrics
print(f"Average Hallucination Rate: {average_hallucination_rate:.4f}")

# Optional: Save results to a file for further analysis
import json
with open("evaluation_results.json", "w") as f:
    json.dump(results, f, indent=4)


Question: Where did Henry complete his master's degree?
Generated Answer: Henry completed his Master's degree from Northwestern University.
Expected Answer: Henry completed his M.S. in Machine Learning and Data Science at Northwestern University, graduating in December 2023.

Question: When did he graduate from UCLA with a bachelor's degree?
Generated Answer: To determine when Henry Liang graduated from UCLA, I need to look up his education record. Unfortunately, there is no specific record provided that lists his graduation date. However, based on the education details given, we can infer that his undergraduate studies were completed at UCLA, which would place him in the academic year ending in 2023.

Since he started working in 2024, and assuming he has been employed since then, we can calculate the start date of his education. Given that he was a Data Scientist II starting in 2024, if we assume he had a Master’s degree before becoming a Data Scientist, this might be around 2023.

Th

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Question: What major achievement did Henry accomplish at Vail Systems?
Generated Answer: Henry accomplished several significant achievements at Vail Systems:

1. He was recognized for creating a novel hybrid embedding reducing no-context rate for RFP completion by 83.33%. This led to a paper acceptance at the AI-ML Systems Conference.

2. He led the production of a FAQ chatbot using a RAG framework, achieving a top-5 retrieval accuracy of 99.18%.

3. He reduced the FAQ system's latency by 65%, demonstrating improved performance.

4. He optimized data curation, embedding optimization, and HNSW indexing techniques, resulting in a reduction in retrieval accuracy from 83.33% to 99.18%.

5. He extended the FAQ chatbot's functionality to support up to 10,000 documents, including API documentation and code samples.

These accomplishments highlight Henry's contributions to improving FAQ systems and enhancing overall user experience at Vail Systems.
Expected Answer: Henry created a novel hybrid

In [14]:
print(model_qwen)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((