In [12]:
import uuid # for conversation id
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer, util
from datetime import datetime
import openai
import random
import csv
import datetime
import time
import numpy as np
import pandas as pd
import os
import json
import re


#RAG 
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
#LangChain is used to load the documents, embed the resulting chunks, and store them in a FAISS vector index.
#Rationale: it is more efficient than wiring these steps together by hand.
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings


#Metrics
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge import Rouge 
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score 
from nltk.tokenize import word_tokenize  #Import for tokenizing
from nltk.tokenize import TreebankWordTokenizer
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from transformers import pipeline
import gradio as gr

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chrisgallevo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chrisgallevo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [28]:
#Read the OpenAI API key from the environment so the secret is never stored in the notebook.
#Set it before launching Jupyter, e.g.  export OPENAI_API_KEY="sk-..."
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    print("Warning: OPENAI_API_KEY is not set; OpenAI calls will fail until openai.api_key is assigned.")

#Set Up Configuration
#######################################################################################################################
#Model settings: model name used for each side of the self-play negotiation
model_employee = 'gpt-4o'
model_manager = 'gpt-4o'

#Negotiation settings
num_negotiations = 1
max_turns = 10   #Maximum turns per negotiation
sim_token_limit = 5000     #Total token limit for a single negotiation


#######################################################################################################################
#Scenario generator to help generate different scenarios
def generate_random_scenario():
    """Draw a random salary-negotiation scenario.

    Returns:
        tuple: ``(scenario, current_salary, desired_salary)`` where
            * ``scenario`` is a dict with the keys ``job_role``,
              ``employee_personality``, ``hr_manager_personality``,
              ``employee_motivation``, ``manager_constraint`` and
              ``past_achievement``, each picked with ``random.choice``;
            * ``current_salary`` is an int drawn from 45,000-175,000 and
              rounded to the nearest hundred;
            * ``desired_salary`` is ``current_salary`` raised by a uniformly
              drawn 5%-25%, rounded to the nearest hundred (a float).

    The global ``random`` state is consumed, so seed ``random`` beforehand for
    reproducible scenarios.
    """
    job_roles = [
        "Data Scientist", "Software Developer", "Data Engineer", "Machine Learning Engineer",
        "Cloud Architect", "Cybersecurity Analyst", "DevOps Engineer", "Frontend Developer",
        "Backend Developer", "Full Stack Developer", "Data Analyst", "Business Analyst", "Financial Analyst",
        "Market Research Analyst", "Risk Analyst", "Quantitative Analyst", "Actuary", "BI (Business Intelligence) Developer",
        "Supply Chain Analyst", "Quality Assurance Analyst", "Solutions Architect","Management Consultant",
        "Strategy Consultant", "IT Consultant", "Financial Consultant", "Personal Banker"
    ]

    #Shared pool: both the employee and the HR manager personality are drawn from it
    personalities = [
        "Assertive and Direct", "Empathetic and Understanding", "Optimistic and Cheerful",
        "Pessimistic and Skeptical", "Logical and Analytical", "Data-Driven and Objective",
        "Charismatic and Persuasive", "Cooperative and Team-Oriented", "Independent and Self-Assured",
        "Reserved and Introverted", "Outspoken and Confident", "Ambitious and Goal-Oriented",
        "Flexible and Open-Minded", "Competitive and Driven", "Thoughtful and Reflective",
        "Emotionally Reactive", "Calm and Composed", "Decisive and Pragmatic",
        "Detail-Oriented and Methodical", "Intuitive and Visionary", "Resourceful and Adaptable",
        "Risk-Averse and Conservative", "Humorous and Lighthearted", "Respectful and Polite",
        "Innovative and Forward-Thinking"
    ]

    employee_motivations = [
        "wants a raise to better support family due to increased living costs",
        "feels underpaid for the responsibilities taken on",
        "wants a raise in line with industry standards",
        "is planning to buy a house and needs higher income for mortgage",
        "believes they have grown in skills and wants to be recognized",
        "is seeking a salary increase after a successful project delivery",
        "has been receiving competitive offers from other companies",
        "is expecting a promotion and a corresponding raise",
        "has taken on additional responsibilities without a raise",
        "attained a higher education degree and is looking for a raise",
        "is aiming to save for a child's education",
        "has been with the company for more than 5 years",
        "experiencing burnout and sees that a raise could alleviate this feeling",
        "doesn't have a performance review coming up soon or a raise schedule in place",
        "the company has reported strong earnings in its recent financial reports",
        "recently taken on more responsibility or started a new position",
        "has worked mainly at the office and does not have the option to work from home despite living long distance from work",
        "seeking a raise otherwise will need to relocate due to a better offer from a company in another city",
        "your managers frequently rely on you to pick up work from other team members",
        "received a high-paying offer from another company but doesn't want to leave your current role"
    ]

    manager_constraints = [
        "the company is facing budget constraints due to recent cost-cutting measures",
        "the company has posted record profits this quarter",
        "the department budget is tight, but employee retention is crucial",
        "management is prioritizing retention for critical roles",
        "the company has recently implemented a hiring freeze",
        "the company is on the brink of filing for bankruptcy",
        "the company has been acquired by another company in which new management will determine new compensation packages",
        "there is uncertainty in the market, making budgets more restrictive",
        "the company is launching a major initiative and needs to retain talent",
        "other employees in similar roles have not received raises recently",
        "HR policies require a thorough review before approving raises",
        "the company is undergoing an internal restructuring process"
    ]

    past_achievements = [
        "led three major projects that increased department productivity by 20%",
        "received an award for excellent customer feedback",
        "trained new hires and significantly reduced onboarding time",
        "automated a process that saved the company $50,000 annually",
        "initiated a successful cross-department collaboration project",
        "developed a tool that reduced report generation time by 50%",
        "solved a critical issue that prevented project delays worth millions",
        "mentored junior employees, improving their performance significantly",
        "secured a key client that generated $1 million in revenue",
        "proposed a new strategy that increased team efficiency"
    ]

    #Draw order is kept stable so a given random seed always yields the same scenario
    scenario = {
        "job_role": random.choice(job_roles),
        "employee_personality": random.choice(personalities),
        "hr_manager_personality": random.choice(personalities),
        "employee_motivation": random.choice(employee_motivations),
        "manager_constraint": random.choice(manager_constraints),
        "past_achievement": random.choice(past_achievements)}

    #Salaries are rounded to the nearest $100 (ndigits=-2)
    current_salary = round(random.randint(45000, 175000), -2)
    desired_percentage = random.uniform(0.05, 0.25)
    desired_salary = round(current_salary + (current_salary * desired_percentage), -2)

    return scenario, current_salary, desired_salary

#######################################################################################################################

#Function to track token usage
def token_usage_tracker(response, total_tokens_used, max_tokens):
    """Add one response's token usage to the running total and report it.

    Args:
        response: mapping-like API response; ``response['usage']['total_tokens']``
            is read when present, otherwise the turn counts as 0 tokens.
        total_tokens_used: tokens consumed by the negotiation before this turn.
        max_tokens: token budget for the whole negotiation (only used for the
            printed report and warning; nothing is enforced here).

    Returns:
        The updated running total.
    """
    print(f"Response structure: {response}")

    #Tokens reported for this single turn
    usage_block = response.get('usage', {})
    turn_tokens = usage_block.get('total_tokens', 0)
    running_total = total_tokens_used + turn_tokens

    print(f"Total tokens used in this turn: {turn_tokens}. Total tokens so far: {running_total}/{max_tokens}")

    #Only warns; the caller decides whether to stop the conversation
    if max_tokens <= running_total:
        print("Maximum number of tokens for negotiation used. Must end conversation.")

    return running_total


#Unique ID used to label each conversation in the saved outputs
def generate_conversation_id():
    """Return a fresh random (version 4) UUID rendered as a string."""
    conversation_uuid = uuid.uuid4()
    return str(conversation_uuid)


#######################################################################################################################
#RAG Section
#Embedding model used when building the FAISS index below.
#Instantiated at cell execution time with the 'all-MiniLM-L6-v2' sentence-transformers model.
embedding_model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2') 
#Folder (relative to the working directory) that is scanned for reference PDFs
rag_document_location = "RAG"

#Used to process PDFs, split into chunks
def lang_chain_pdf_puller(rag_document_location):
    """Load every PDF in a folder and split it into overlapping text chunks.

    Args:
        rag_document_location: path of the folder to scan (not recursive).

    Returns:
        tuple: ``(all_chunks, file_list)`` - the chunks produced by the text
        splitter for all successfully processed PDFs, and the names of those
        PDFs, both in sorted filename order.

    Raises:
        FileNotFoundError: if ``rag_document_location`` does not exist.

    A PDF that fails to load or split is reported with ``print`` and skipped,
    so one bad file does not abort the whole run.
    """
    all_chunks = []
    file_list = []

    #Raises an error should the folder not exist.
    if not os.path.exists(rag_document_location):
        raise FileNotFoundError(f"The folder '{rag_document_location}' does not exist.")

    #Split long documents into chunks of up to 2000 characters with a 100-character overlap
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)

    #os.listdir returns entries in arbitrary order; sorting keeps chunk order reproducible
    for filename in sorted(os.listdir(rag_document_location)):
        #Case-insensitive so files named ".PDF" are not silently skipped
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(rag_document_location, filename)
        try:
            loader = PyPDFLoader(file_path)
            docs = loader.load()
            # Split documents into chunks
            chunks = text_splitter.split_documents(docs)
            all_chunks.extend(chunks)
            file_list.append(filename)
            print(f"Processed file: {filename} with {len(chunks)} chunks.")
        except Exception as e:
            #Best effort: report the failure and continue with the remaining files
            print(f"Error processing {filename}: {e}")
    return all_chunks, file_list

#Build the retrieval index at cell execution time:
#1) load and chunk every PDF found in rag_document_location
all_chunks, file_list = lang_chain_pdf_puller(rag_document_location)
#2) keep only the raw chunk text (chunk metadata is not passed on to the index)
texts = [chunk.page_content for chunk in all_chunks]
#3) embed the texts with embedding_model and store them in a FAISS vector store
vector_store = FAISS.from_texts(texts, embedding_model)

#Search the vector store for the chunks most relevant to a query
def search_vector_db(query, vector_store, top_k=3):
    """Return the text of the ``top_k`` most similar chunks, joined by single spaces.

    ``vector_store`` only needs a ``similarity_search(query, k=...)`` method whose
    results expose ``page_content``.
    """
    hits = vector_store.similarity_search(query, k=top_k)
    passages = (hit.page_content for hit in hits)
    return " ".join(passages)

##########################################################################################################################
#conversation history to JSONL
def save_conversation_to_jsonl(conversation_history, conversation_id, file_name="negotiation_chats_selfplay.jsonl"):
    """Append one conversation to a JSON Lines file.

    Every entry whose role is not "system" is reduced to its ``role`` and
    ``content`` and written, together with ``conversation_id``, as a single
    JSON object on its own line. Any failure is printed rather than raised.
    """
    try:
        #Keep only role/content of the non-system turns
        visible_turns = []
        for turn in conversation_history:
            if turn.get("role") == "system":
                continue
            visible_turns.append({"role": turn.get("role"), "content": turn.get("content")})

        record = {
            "ConversationID": conversation_id,
            "Negotiation": visible_turns}
        serialized = json.dumps(record, ensure_ascii=False, indent=None)

        #Append mode: one JSON object per line
        with open(file_name, mode='a', encoding='utf-8') as sink:
            sink.write(serialized + '\n')

        print(f"Successfully saved conversation with ConversationID {conversation_id} to {file_name}")
    except Exception as e:
        print(f"Error saving conversation with ConversationID {conversation_id}: {e}")


def save_metrics_to_csv(metrics, conversation_id, file_name="self_play_metrics.csv"):
    """Append one row of evaluation metrics to a CSV file.

    Args:
        metrics: dict of metric name -> value. Keys that are not CSV columns are
            ignored; columns missing from ``metrics`` are written as empty cells.
            As before, ``metrics["ConversationID"]`` is set on the caller's dict.
        conversation_id: identifier stored in the ``ConversationID`` column.
        file_name: CSV path; created with a header row when it does not exist
            yet (or exists but is empty).
    """
    metric_fieldnames = [
        "ConversationID",
        "Timestamp",
        "Model",
        "Personality",
        "Initial Salary",
        "Final Salary",
        "Salary Percent Change",
        "BLEU",
        "ROUGE",
        "BERTScore",
        "Cosine Similarity",
        "METEOR",
        "Avg Sentiment Score",
        "Avg Response Length",
        "Avg Relevance Score",
        "Avg Coherence Score",
        "Avg Combined GEval Score",
        "MAUDE",
        "Total Tokens",
        "Avg Latency",
        "Tokens per Second",
        "Total Cost",
        "Summary"]

    #Add ConversationID to metrics
    metrics["ConversationID"] = conversation_id

    #Keep only the CSV columns; .get() fills absent fields with None instead of
    #raising KeyError (the previous metrics[key] lookup made the fallback unreachable)
    row = {field: metrics.get(field) for field in metric_fieldnames}

    #Write the header for a new file, and also for an existing but empty one
    needs_header = not os.path.isfile(file_name) or os.path.getsize(file_name) == 0
    #utf-8 so non-ASCII text in free-form fields (e.g. Summary) is written on every platform
    with open(file_name, mode='a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=metric_fieldnames)
        if needs_header:
            writer.writeheader()  #Write header only once
        writer.writerow(row)  #Write the metrics row

    print(f"Metrics saved to {file_name} for ConversationID {conversation_id}")

def save_scenario_to_csv(scenario, conversation_id, file_name="negotiation_scenarios.csv"):
    """Append one negotiation scenario to a CSV file.

    Args:
        scenario: dict describing the scenario. Keys that are not CSV columns are
            ignored; columns missing from ``scenario`` are written as empty cells.
            As before, ``scenario["ConversationID"]`` is set on the caller's dict.
        conversation_id: identifier stored in the ``ConversationID`` column.
        file_name: CSV path; created with a header row when it does not exist
            yet (or exists but is empty).
    """
    #CSV columns
    scenario_fieldnames = ["ConversationID", "job_role", "employee_personality", "hr_manager_personality", "employee_motivation", "manager_constraint", "past_achievement"]
    #Add ConversationID to the scenario dictionary
    scenario["ConversationID"] = conversation_id

    #Keep only the CSV columns; .get() fills absent fields with None instead of
    #raising KeyError (the previous scenario[key] lookup made the fallback unreachable)
    row = {field: scenario.get(field) for field in scenario_fieldnames}

    #Write the header for a new file, and also for an existing but empty one
    needs_header = not os.path.isfile(file_name) or os.path.getsize(file_name) == 0
    with open(file_name, mode='a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=scenario_fieldnames)
        if needs_header:
            writer.writeheader()  #Write header only once
        writer.writerow(row)  #Write the scenario row

    print(f"Successfully saved scenario for ConversationID {conversation_id} to {file_name}")

##########################################################################################################################
#Shared metric helpers, created once at cell execution time.
#VADER analyzer used by sentiment_score_def
vader_analyzer = SentimentIntensityAnalyzer()
#Sentence-embedding model ('stsb-roberta-large') used by cosine_similarity
cosine_similarity_model = SentenceTransformer('stsb-roberta-large')

def cosine_similarity(reference, prediction):
    """Embed both texts with ``cosine_similarity_model`` and return their cosine similarity as a Python number."""
    #Reference is encoded first, then the prediction
    reference_vec, prediction_vec = (
        cosine_similarity_model.encode(text, convert_to_tensor=True)
        for text in (reference, prediction)
    )
    return util.pytorch_cos_sim(reference_vec, prediction_vec).item()
    
def corpus_bleu_eq(references, predictions):
    """Corpus-level BLEU (smoothing ``method1``) of ``predictions`` against one reference each.

    ``references`` and ``predictions`` are parallel lists of raw strings; each is
    tokenized with ``word_tokenize`` before scoring.
    """
    #corpus_bleu expects a list of reference token-lists per hypothesis
    reference_sets = []
    for ref in references:
        reference_sets.append([word_tokenize(ref)])

    hypotheses = []
    for pred in predictions:
        hypotheses.append(word_tokenize(pred))

    smoother = SmoothingFunction().method1
    return corpus_bleu(reference_sets, hypotheses, smoothing_function=smoother)

def rouge_eq(reference, prediction):
    """Return the ROUGE-L F-score of ``prediction`` against ``reference``.

    Blank (empty or whitespace-only) input on either side scores 0.0: the
    ``rouge`` package raises ``ValueError`` for empty text, which would
    otherwise abort the evaluation of a whole conversation.
    """
    if not reference or not prediction or not reference.strip() or not prediction.strip():
        return 0.0
    rouge = Rouge()
    #get_scores takes the hypothesis first, then the reference
    scores = rouge.get_scores(prediction, reference)
    return scores[0]['rouge-l']['f']

def meteor_eq(reference, prediction):
    """METEOR score of ``prediction`` against a single ``reference`` (both Treebank-tokenized)."""
    treebank = TreebankWordTokenizer()
    reference_tokens, prediction_tokens = treebank.tokenize(reference), treebank.tokenize(prediction)
    #meteor_score takes a list of tokenized references and one tokenized hypothesis
    return meteor_score([reference_tokens], prediction_tokens)

def bertscore_metric(reference, prediction):
    """BERTScore F1 (English) of one prediction against one reference, as a Python float."""
    #bert_score returns (precision, recall, F1); only F1 is reported
    _precision, _recall, f1_tensor = bert_score([prediction], [reference], lang="en", verbose=False)
    mean_f1 = f1_tensor.mean()
    return mean_f1.item()

def sentiment_score_def(text):
    """VADER compound sentiment of ``text``: from -1 (negative) to +1 (positive)."""
    return vader_analyzer.polarity_scores(text)['compound']

def response_length_def(text):
    """Count the whitespace-separated words in ``text``."""
    words = text.split()
    return len(words)

#MAUDE
def maude_def(context, response, model="gpt-4"):
    """
    Ask a chat model for a MAUDE-style score of ``response`` given ``context``.

    The reply is accepted only when it is a bare non-negative number (digits with
    an optional decimal part); anything else, and any exception raised while
    calling the API, yields 0.0.
    """
    prompt = f"""Evaluate the following response in the context of the given conversation:
- Context: {context}
- Response: {response}

Rate the response from 0 to 1 based on the following criteria:
1. Coherence: How well the response logically follows from the context.
2. Relevance: How directly the response addresses the context.

Only provide the score as a numeric value between 0 and 1, with no additional explanation or text."""
    system_message = {"role": "system", "content": "You are an evaluation assistant."}
    user_message = {"role": "user", "content": prompt}
    try:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[system_message, user_message],
            temperature=0.0,
            max_tokens=10)
        #Pull the raw reply text out of the first choice
        text_response = completion.choices[0].message['content'].strip()
        print(f"MAUDE Raw Response: {text_response}")
        #Anything that is not a plain number counts as an invalid response
        is_numeric = re.match(r'^\d+(\.\d+)?$', text_response)
        return float(text_response) if is_numeric else 0.0
    except Exception as e:
        print(f"Error in MAUDE scoring: {e}")
        return 0.0

#GEval metrics for relevance and coherence
def geval_def(context, response, criteria, model="gpt-4o"):
    """
    Ask a chat model for a G-Eval style score of ``response`` on one criterion.

    ``criteria`` is inserted into the prompt as-is (the prompt describes
    "Relevance" and "Coherence"); an empty ``context`` is shown as 'N/A'.
    The reply is accepted only when it is a bare non-negative number; anything
    else, and any exception raised while calling the API, yields 0.0.
    """
    prompt = f"""
    You are an evaluation assistant tasked with scoring a response based on the given criteria.
    
    Evaluation Task:
    - Criteria: {criteria}
    - Context: {context if context else 'N/A'}
    - Response: {response}
    
    Scoring Instructions:
    - Relevance: Score based on how directly the response addresses the context and aligns with a salary negotiation
        - A score of 0.9–1.0: The response directly addresses the main points in the context and is highly relevant
        - A score of 0.6–0.8: The response is partially relevant but may include unrelated or tangential content
        - A score of 0.0–0.5: The response does not address the context or is entirely irrelevant
    
    - Coherence: Score based on how logically structured and clear the response is for a salary negotiation
        - A score of 0.9–1.0: The response is well-structured, logical, and easy to understand.
        - A score of 0.6–0.8: The response is somewhat clear but may contain minor logical inconsistencies
        - A score of 0.0–0.5: The response is poorly structured, lacks logic, or is difficult to understand
    
    **Instructions**:
    1. Provide a numeric score between 0.0 and 1.0 based on the criteria
    2. Use only two decimal place in your score (e.g., 0.80, 0.90)
    3. Do not include any explanation or additional text in your response; provide only the score
    """
    system_message = {"role": "system", "content": "You are an evaluation assistant."}
    user_message = {"role": "user", "content": prompt}
    try:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[system_message, user_message],
            temperature=0.0,
            max_tokens=10)
        #Pull the raw reply text out of the first choice
        text_response = completion.choices[0].message['content'].strip()
        print(f"GEval ({criteria}) Raw Response: {text_response}")
        #Anything that is not a plain number counts as an invalid response
        is_numeric = re.match(r'^\d+(\.\d+)?$', text_response)
        return float(text_response) if is_numeric else 0.0
    except Exception as e:
        print(f"Error in GEval scoring ({criteria}): {e}")
        return 0.0

#####################################################################################
#Reference text for BLEU, ROUGE, METEOR, BERTScore
#Loaded at cell execution time; the file must be present in the working directory.
#The 'role' and 'text' columns are the ones read below.
reference_data = pd.read_csv("negotiation_references_combined.csv")

#Split the reference utterances by speaker: one plain list of texts per role
employee_references = reference_data[reference_data['role'] == 'Employee']['text'].tolist()
hr_manager_references = reference_data[reference_data['role'] == 'HR Manager']['text'].tolist()

#######################################################################################################
#Function to evaluate a single model 
def evaluate_individual_model(
    conversation_history, model_role, references, model_name, run_number,
    current_salary, final_salary, model_personality, total_tokens_used, prompt_tokens,
        completion_tokens, agreement_status):
    """Score every response of one role in a finished negotiation.

    Args:
        conversation_history: list of message dicts with ``role`` and ``content``;
            ``latency``, ``token_count`` and ``usage`` are read when present.
        model_role: role whose messages are evaluated (e.g. "Employee").
        references: reference texts; the i-th response of the role is compared
            with ``references[i]`` when that index exists.
        model_name: stored in the ``Model`` field of the result.
        run_number: accepted for interface compatibility; not used here.
        current_salary: starting salary (must be non-zero).
        final_salary: salary at the end of the negotiation.
        model_personality: stored in the ``Personality`` field of the result.
        total_tokens_used: accepted for interface compatibility; not used here.
        prompt_tokens: prompt tokens already counted; per-message usage is added.
        completion_tokens: completion tokens already counted; per-message usage is added.
        agreement_status: stored in the ``Summary`` field of the result.

    Returns:
        dict of averaged metrics plus salary, token, cost and bookkeeping fields.
    """
    #Initialize metric containers
    bleu_scores, rouge_scores, meteor_scores, bert_scores, cosine_similarity_scores = [], [], [], [], []
    maude_scores = []
    relevance_scores, coherence_scores, combined_geval_scores = [], [], []
    sentiment_scores, response_lengths = [], []
    latencies = []
    total_time = 0
    total_tokens = 0

    #Summary of negotiation outcome
    summary = agreement_status

    #Keep each response together with its position in the FULL history, so the
    #context handed to the judges is everything said before that response
    #(slicing the history by the index into the filtered list truncated it)
    role_turns = [
        (position, msg)
        for position, msg in enumerate(conversation_history)
        if msg["role"] == model_role]

    #Process each response for metrics
    for i, (position, assistant_response) in enumerate(role_turns):
        prediction = assistant_response.get("content", "")
        context = " ".join(msg.get("content", "") for msg in conversation_history[:position])

        #Calculate similarity metrics if reference exists
        if i < len(references):
            reference = references[i]
            bleu_scores.append(corpus_bleu_eq([reference], [prediction]))
            rouge_scores.append(rouge_eq(reference, prediction))
            meteor_scores.append(meteor_eq(reference, prediction))
            bert_scores.append(bertscore_metric(reference, prediction))
            cosine_similarity_scores.append(cosine_similarity(reference, prediction))

        #Calculate MAUDE score
        maude_scores.append(maude_def(context, prediction, model="gpt-4o"))

        #Calculate GEval metrics (coherence is judged on the response alone)
        relevance_scores.append(geval_def(context, prediction, criteria="Relevance", model="gpt-4o"))
        coherence_scores.append(geval_def("", prediction, criteria="Coherence", model="gpt-4o"))
        combined_geval_scores.append(round(0.5 * relevance_scores[-1] + 0.5 * coherence_scores[-1], 3))

        #Sentiment and response length
        sentiment_scores.append(sentiment_score_def(prediction))
        response_lengths.append(response_length_def(prediction))

        #Track latency and token usage
        latency = assistant_response.get("latency", 0)
        latencies.append(latency)
        total_tokens += assistant_response.get("token_count", 0)
        total_time += latency
        usage = assistant_response.get('usage', {})
        prompt_tokens += usage.get('prompt_tokens', 0)
        completion_tokens += usage.get('completion_tokens', 0)

    #Ensure tokens per second is calculated after processing all responses
    tokens_per_second = total_tokens / total_time if total_time > 0 else 0

    #Calculate total cost based on token usage
    input_cost = (prompt_tokens / 1000000) * 2.50  #$2.50 per 1M input tokens
    output_cost = (completion_tokens / 1000000) * 10.00  #$10.00 per 1M output tokens
    total_cost = input_cost + output_cost

    #Calculate salary percent change
    salary_percent_change = ((final_salary - current_salary) / current_salary) * 100
    if salary_percent_change == 0:
        print(f"No change in salary: Initial = {current_salary}, Final = {final_salary}")
    else:
        print(f"Salary change: {salary_percent_change}%")

    #Create metrics dictionary (empty score lists are reported as 0)
    metrics_dictionary = {
        "BLEU": np.mean(bleu_scores) if bleu_scores else 0,
        "ROUGE": np.mean(rouge_scores) if rouge_scores else 0,
        "METEOR": np.mean(meteor_scores) if meteor_scores else 0,
        "BERTScore": np.mean(bert_scores) if bert_scores else 0,
        "Cosine Similarity": np.mean(cosine_similarity_scores) if cosine_similarity_scores else 0,
        "MAUDE": np.mean(maude_scores) if maude_scores else 0,
        "Avg Relevance Score": np.mean(relevance_scores) if relevance_scores else 0,
        "Avg Coherence Score": np.mean(coherence_scores) if coherence_scores else 0,
        "Avg Combined GEval Score": np.mean(combined_geval_scores) if combined_geval_scores else 0,
        "Avg Sentiment Score": np.mean(sentiment_scores) if sentiment_scores else 0,
        "Avg Response Length": np.mean(response_lengths) if response_lengths else 0,
        "Avg Latency": np.mean(latencies) if latencies else 0,
        "Tokens per Second": tokens_per_second,
        "Total Tokens": total_tokens,
        "Total Cost": total_cost,
        "Salary Percent Change": salary_percent_change,
        "Initial Salary": current_salary,
        "Final Salary": final_salary,
        "Summary": summary,
        "Timestamp": datetime.datetime.now().isoformat(),
        "Model": model_name,
        "Personality": model_personality}

    return metrics_dictionary

###########################################################################################################################
# Function intended to pull salary from responses to log 
def extract_salary(response):
    """
    Extract a salary figure from a response.

    Dollar amounts such as "$95000", "$95,000" or "$95,000.50" are collected in
    order of appearance. The first amount that shares a line with one of the
    salary keywords is returned; if none does, the first amount found is used.

    Returns:
        float: the extracted amount, or None when ``response`` is empty or
        contains no dollar amount.
    """
    #Keywords that indicate salary
    salary_keywords = [
        "propose", "offer", "current salary", "desired salary",
        "counteroffer", "final offer", "salary of", "adjustment to",
        "compensation", "increase to", "base salary", "package"]

    if not response:
        print(f"No valid salary found in response: {response}")
        return None

    #Match dollar amounts with or without thousands separators and optional cents.
    #The comma-grouped form is tried first; plain digit runs are the alternative, so
    #"$95000" is read in full instead of being cut off after three digits.
    amount_pattern = r'\$(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?'
    matches = re.findall(amount_pattern, response)
    print(f"Extracted Matches: {matches}")  #Debugging statement for matches

    if not matches:
        #Final fallback: If no matches, log and return None
        print(f"No valid salary found in response: {response}")
        return None

    for match in matches:
        #Clean and convert the matched dollar amount
        salary_amount = float(match.replace(",", "").replace("$", ""))
        #Escape before interpolating: an unescaped "$" is an end-of-string anchor,
        #which made the proximity pattern impossible to satisfy
        escaped_amount = re.escape(match)

        #Check if the matched salary is on the same line as a keyword
        for keyword in salary_keywords:
            escaped_keyword = re.escape(keyword)
            pattern = rf"{escaped_keyword}.*?{escaped_amount}|{escaped_amount}.*?{escaped_keyword}"
            if re.search(pattern, response, re.IGNORECASE):
                print(f"Extracted salary: {salary_amount} with keyword '{keyword}'")
                return salary_amount  # Return the first valid match

    #Fallback: If no keywords are matched, assume the first dollar amount is a salary
    print(f"No keywords matched. Using first extracted match as fallback: {matches[0]}")
    return float(matches[0].replace(",", "").replace("$", ""))


#Function intended to remove extraneous words and phrases that may be inadvertently printed 
#this helps remove generated text issues we have seen constantly
def clean_response(response):
    """Strip boilerplate and formatting noise from a generated negotiation message.

    In order: drops spans that start with "Thank you" / "I appreciate" / "truly"
    up to the next period, drops "[...]" placeholders, drops a leading role label
    or "Dear ...," greeting, collapses whitespace, removes "**", "--" and every
    "-" (with any whitespace before it), drops a leading number, removes
    repeated sentences (split on "."), drops a doubled leading "We We"/"I I",
    and upper-cases the first character.
    """
    #Removal passes applied before whitespace is collapsed: (pattern, flags)
    early_passes = (
        (r"(Thank you|I appreciate|truly).+?\.", re.IGNORECASE),    #courtesy phrases
        (r"\[.*?\]", 0),                                            #placeholders like "[Employee's Name]"
        (r"^(HR Manager:|Employee:|Dear\s.+?,)", re.IGNORECASE),    #leading role label / greeting
    )
    for pattern, flags in early_passes:
        response = re.sub(pattern, "", response, flags=flags).strip()

    #Collapse runs of whitespace into single spaces
    response = re.sub(r"\s+", " ", response).strip()

    #Markdown/bullet symbols, then a leading number such as "94."
    for pattern in (r"(\*\*|--|\s*-)", r"^\d+\.?"):
        response = re.sub(pattern, "", response).strip()

    #De-duplicate sentences, keeping the first occurrence of each (dict keys keep insertion order)
    fragments = (piece.strip() for piece in response.split("."))
    unique_sentences = list(dict.fromkeys(piece for piece in fragments if piece))
    response = ". ".join(unique_sentences).strip()

    #Rare edge case: doubled leading "We We" / "I I"
    response = re.sub(r"^(We\s+We|I\s+I)\s+", "", response, flags=re.IGNORECASE)

    #Capitalize the first character when it is not already upper-case
    if response and not response[0].isupper():
        response = response[0].upper() + response[1:]

    return response



def evaluate_negotiation_outcome(conversation_history, model="gpt-4o"):
    """
    Ask a chat model whether the negotiation ended in an agreement.

    Only the last four entries of ``conversation_history`` are shown to the
    model, one "role: content" line each. The model's reply is returned
    stripped and otherwise unmodified; if the call fails for any reason the
    string "Error" is returned instead.
    """
    #Render the last four turns, one "role: content" line each
    closing_turns = conversation_history[-4:]
    rendered_turns = [f"{entry['role']}: {entry['content']}" for entry in closing_turns]
    last_turns_text = "\n".join(rendered_turns)

    #Create the evaluation prompt
    evaluation_prompt = f"""
    Below is the conversation history of a salary negotiation. 
    Review the final four turns and determine whether the negotiation ended in an agreement or no agreement. 
    Your response must include one of the following:
    - "Agreement" if both parties agreed on a salary or terms.
    - "No Agreement" if the negotiation ended without mutual consent.

    Final Four Turns:
    {last_turns_text}

    Outcome:
    """

    system_message = {"role": "system", "content": "You are an evaluation assistant."}
    user_message = {"role": "user", "content": evaluation_prompt.strip()}

    #Call the model to evaluate the outcome
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[system_message, user_message],
            max_tokens=50,
            temperature=0.0)
        outcome = response.choices[0].message['content'].strip()
        print(f"Evaluation Bot Outcome: {outcome}")
        return outcome
    except Exception as e:
        print(f"Error evaluating negotiation outcome: {e}")
        return "Error"



###########################################################################################################################

def run_negotiation(
    model_employee,
    model_manager,
    scenario,
    current_salary,
    desired_salary,
    max_turns,
    sim_token_limit):
    """
    Run one simulated salary negotiation between an "Employee" model and an "HR Manager" model.

    A randomly chosen role opens with a scripted introduction, after which the two roles
    alternate for up to max_turns model-generated turns. From turn index max_turns // 2
    onwards, each turn is followed by a call to evaluate_negotiation_outcome and the loop
    stops as soon as it reports an agreement. If the last speaker is not the HR Manager,
    one extra HR Manager closing turn is generated so the HR Manager always has the last word.

    Parameters:
        model_employee: chat model name used for the Employee turns.
        model_manager: chat model name used for the HR Manager turns (and the closing turn).
        scenario: dict providing 'employee_personality' and 'hr_manager_personality'.
        current_salary: the employee's current salary, quoted in the prompts.
        desired_salary: the employee's target salary, quoted in the prompts.
        max_turns: maximum number of generated turns (the introduction is not counted).
        sim_token_limit: token budget for the negotiation; no further regular turn is
            started once the accumulated usage reaches it. A falsy value disables the check.

    Returns:
        Tuple of (conversation_history, current_offer, total_tokens_used, prompt_tokens,
        completion_tokens, agreement_status). The token counters cover the negotiation
        turns and the closing turn, not the outcome-evaluation calls. agreement_status is
        "Agreement" or "No Agreement".
    """
    conversation_history = []
    total_tokens_used = 0
    prompt_tokens = 0
    completion_tokens = 0
    # NOTE(review): nothing in this function ever raises these two values, so the prompts
    # and the returned current_offer always quote the starting salary - confirm whether
    # offers are meant to be extracted from the generated turns.
    current_offer = current_salary
    highest_counteroffer = current_salary

    starting_role = random.choice(["Employee", "HR Manager"])
    current_role = starting_role
    roles = {"Employee": model_employee, "HR Manager": model_manager}

    # Introductory message
    # IF-Else to write different starting messages for whoever begins negotiation talks 
    if starting_role == "Employee":
        intro_message = f"""
        Thank you for taking the time to meet today. I wanted to discuss my compensation and explore how we can align 
        it more closely with my contributions and the market standards. My current salary is ${current_salary}, and 
        I believe there is room for adjustment to better reflect my role.
        """
    else:
        intro_message = f"""
        Thank you for meeting with us today to discuss your compensation. 
        We value your work at the company and aim to have a discussion.
        Currently, your salary is ${current_salary}, and we are open to discussing an adjustment.
        """

    # The scripted introduction costs no API call, hence zero latency and tokens
    conversation_history.append({
        "role": starting_role,
        "content": clean_response(intro_message.strip()),
        "latency": 0,
        "token_count": 0
    })
    
    # Set this variable as the checker to if there was an agreement made
    # Should there be an agreement, this variable will change to "Agreement"
    agreement_status = "No Agreement"
    # To prevent premature terminations in negotiations 
    # Set a minimum of half the max turns in order for the negotiation to play out further
    minimum_turns = max_turns // 2

    # Triggers the negotiation loop to begin
    for turn in range(max_turns):
        # Enforce the token budget before paying for another turn
        if sim_token_limit and total_tokens_used >= sim_token_limit:
            print(f"Token limit of {sim_token_limit} reached after {turn} turns. Ending negotiation.")
            break

        # The role that spoke last (initially the one that gave the introduction) yields the floor
        current_role = "HR Manager" if current_role == "Employee" else "Employee"
        model = roles[current_role]
        tone = scenario["employee_personality"] if current_role == "Employee" else scenario["hr_manager_personality"]

        # Only the four most recent entries are replayed to the model as context
        memory_context = "\n".join(f"{entry['role']}: {entry['content']}" for entry in conversation_history[-4:])
        prompt = f"""
        The employee's current salary is ${current_salary}.
        The company's current offer is ${current_offer}.
        The employee's desired salary is ${desired_salary}.
        The highest counteroffer so far is ${highest_counteroffer}.
        
        Context:
        {memory_context}

        Instructions:
        - Respond as the {current_role} in a {tone} tone.
        - Avoid repeating points, emojis, or unnecessary text.
        - Suggest creative solutions (e.g., bonuses, phased raises, non-monetary benefits).
        - Be concise and professional.
        """

        try:
            start_time = time.time()
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": f"You are the {current_role} in this negotiation."},
                    {"role": "user", "content": prompt.strip()}
                ],
                max_tokens=200,
                temperature=0.7)
            latency = time.time() - start_time
            response_content = clean_response(response.choices[0].message['content'].strip())

            # Accumulate usage so the returned counters reflect the actual run
            usage = response['usage']
            turn_tokens = usage['total_tokens']
            total_tokens_used += turn_tokens
            prompt_tokens += usage.get('prompt_tokens', 0)
            completion_tokens += usage.get('completion_tokens', 0)

            conversation_history.append({
                "role": current_role,
                "content": response_content,
                "latency": latency,
                "token_count": turn_tokens
            })

            print(f"Turn {turn + 1} ({current_role}):\n{response_content}")

            # Evaluate agreement status after half the turns
            if turn >= minimum_turns:
                current_outcome = evaluate_negotiation_outcome(conversation_history)
                # Tolerate quotes, markdown emphasis, a trailing period and casing around the label
                outcome_label = str(current_outcome).strip().strip("\"'.*").strip().lower()
                if outcome_label == "agreement":
                    print("Agreement reached early. Ending negotiation.")
                    agreement_status = "Agreement"
                    break

        except Exception as e:
            print(f"Error generating response for {current_role}: {e}")
            break

    # For Fine Tuning purposes, this if statement is needed to guarantee that the HR Manager gets the last word
    # Closing message designed for either situation: agreement or no agreement.
    if conversation_history[-1]['role'] != "HR Manager":
        if agreement_status == "Agreement":
            closing_message = (
                "We are pleased to finalize our agreement, reflecting both our appreciation for your contributions and our mutual goals. "
                "We look forward to your continued success and growth with us."
            )
        else:
            closing_message = (
                "While we could not finalize an agreement today, we deeply value your contributions. Thank you for engaging thoughtfully, "
                "and we look forward to reconnecting in the future."
            )

        closing_usage = {}
        try:
            hr_manager_response = openai.ChatCompletion.create(
                model=model_manager,
                messages=[
                    {"role": "system", "content": "You are the HR Manager in this negotiation."},
                    {"role": "user", "content": closing_message.strip()}
                ],
                max_tokens=150,
                temperature=0.5
            )
            final_hr_manager_response = clean_response(hr_manager_response.choices[0].message['content'].strip())
            closing_usage = hr_manager_response['usage']
        except Exception as e:
            # Keep the conversation gathered so far: fall back to the scripted closing text
            print(f"Error generating closing response for HR Manager: {e}")
            final_hr_manager_response = closing_message

        closing_tokens = closing_usage.get('total_tokens', 0)
        total_tokens_used += closing_tokens
        prompt_tokens += closing_usage.get('prompt_tokens', 0)
        completion_tokens += closing_usage.get('completion_tokens', 0)

        conversation_history.append({
            "role": "HR Manager",
            "content": final_hr_manager_response,
            "latency": 0,
            "token_count": closing_tokens
        })
        print(f"Final HR Manager Turn:\n{final_hr_manager_response}")

    return conversation_history, current_offer, total_tokens_used, prompt_tokens, completion_tokens, agreement_status




#Negotiation Loop: one self-play negotiation per run, followed by per-role metrics and persistence
for run in range(num_negotiations):
    scenario, current_salary, desired_salary = generate_random_scenario()
    conversation_id = generate_conversation_id()

    #run_negotiation hands back a 6-tuple; unpack every field
    negotiation_result = run_negotiation(
        model_employee=model_employee,
        model_manager=model_manager,
        scenario=scenario,
        current_salary=current_salary,
        desired_salary=desired_salary,
        max_turns=max_turns,
        sim_token_limit=sim_token_limit)
    (conversation,
     current_offer,
     total_tokens_used,
     prompt_tokens,
     completion_tokens,
     agreement_reached) = negotiation_result

    #Persist the transcript
    save_conversation_to_jsonl(conversation, conversation_id, file_name="negotiation_chats_selfplay_base.jsonl")

    #Without an agreement the salary stays where it started
    if agreement_reached == "Agreement":
        final_salary = current_offer
    else:
        final_salary = current_salary

    #Arguments that are identical for both roles' evaluations
    shared_metric_args = dict(
        conversation_history=conversation,
        run_number=run,
        current_salary=current_salary,
        final_salary=final_salary,
        total_tokens_used=total_tokens_used,
        prompt_tokens=prompt_tokens,
        completion_tokens=completion_tokens,
        agreement_status=agreement_reached)

    model_1_metrics = evaluate_individual_model(
        model_role="Employee",
        references=employee_references,
        model_name="Model 1",
        model_personality=scenario["employee_personality"],
        **shared_metric_args)

    model_2_metrics = evaluate_individual_model(
        model_role="HR Manager",
        references=hr_manager_references,
        model_name="Model 2",
        model_personality=scenario["hr_manager_personality"],
        **shared_metric_args)

    #Save metrics and the scenario for the run
    save_metrics_to_csv(model_1_metrics, conversation_id, file_name="model_1_metrics_base.csv")
    save_metrics_to_csv(model_2_metrics, conversation_id, file_name="model_2_metrics_base.csv")
    save_scenario_to_csv(scenario, conversation_id, file_name="negotiation_scenarios_base.csv")

    #Short per-run summary
    print(
        f"Run {run} Summary:",
        f"Initial Salary: ${current_salary}",
        f"Desired Salary: ${desired_salary}",
        f"Final Offer: ${current_offer}",
        sep="\n")

Processed file: main_negotiations.pdf with 93 chunks.
Processed file: business-english-negotiations-jigsaw-dialogues-and-useful-phrases.pdf with 6 chunks.
Processed file: Transcript Example.pdf with 9 chunks.
Processed file: 62 Business English Negotiation Phrases _ FluentU.pdf with 23 chunks.
Turn 1 (Employee):
I believe we can find a path forward that aligns with both our goals. While my desired salary is $121,900, I understand the need for flexibility. Perhaps we could explore a phased salary increase, starting with an initial adjustment and a review in six months based on performance metrics. Additionally, a performancebased bonus structure or enhanced professional development opportunities could bridge the gap. Let's work together to create a mutually beneficial arrangement
Turn 2 (HR Manager):
While the current budget limits us to maintaining the $105,100 salary, I am open to discussing a phased approach. We can consider an initial raise to $109,000, with a performance review in 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.9
GEval (Relevance) Raw Response: 0.80
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95
No change in salary: Initial = 105100, Final = 105100


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95
No change in salary: Initial = 105100, Final = 105100
Metrics saved to model_1_metrics_base.csv for ConversationID c82a87c2-c68b-459b-833e-6f591b0e27c3
Metrics saved to model_2_metrics_base.csv for ConversationID c82a87c2-c68b-459b-833e-6f591b0e27c3
Successfully saved scenario for ConversationID c82a87c2-c68b-459b-833e-6f591b0e27c3 to negotiation_scenarios_base.csv
Run 0 Summary:
Initial Salary: $105100
Desired Salary: $121900.0
Final Offer: $105100
