In [1]:
import uuid # for conversation id
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer, util
from datetime import datetime
import openai
import random
import csv
import datetime
import time
import numpy as np
import pandas as pd
import os
import json
import re


#RAG 
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
#Incorporating langchain for easier integration and embedding document chunks into FAISS vector
#Rationale More Efficiency 
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings


#Metrics
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge import Rouge 
from bert_score import score as bert_score
from nltk.translate.meteor_score import meteor_score 
from nltk.tokenize import word_tokenize  #Import for tokenizing
from nltk.tokenize import TreebankWordTokenizer
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from transformers import pipeline
import gradio as gr

  from tqdm.autonotebook import tqdm, trange





[nltk_data] Downloading package punkt to C:\Users\Claire
[nltk_data]     Personal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Claire
[nltk_data]     Personal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
#Prompt to enter OpenAI API key
openai.api_key = #############################
#Set Up Configuration
#######################################################################################################################
#Model settings
model_employee = # 
model_manager = #

#Negotiation settings
num_negotiations = 10
max_turns = 10   #Maximum turns per negotiation
sim_token_limit = 5000     #Total token limit for a single negotiation

#######################################################################################################################
#Scenario generator to help generate different scenarios
def generate_random_scenario():
    job_roles = [
        "Data Scientist", "Software Developer", "Data Engineer", "Machine Learning Engineer",
        "Cloud Architect", "Cybersecurity Analyst", "DevOps Engineer", "Frontend Developer",
        "Backend Developer", "Full Stack Developer", "Data Analyst", "Business Analyst", "Financial Analyst",
        "Market Research Analyst", "Risk Analyst", "Quantitative Analyst", "Actuary", "BI (Business Intelligence) Developer",
        "Supply Chain Analyst", "Quality Assurance Analyst", "Solutions Architect","Management Consultant",
        "Strategy Consultant", "IT Consultant", "Financial Consultant", "Personal Banker"
    ]

    personalities = [
        "Assertive and Direct", "Empathetic and Understanding", "Optimistic and Cheerful", 
        "Pessimistic and Skeptical", "Logical and Analytical", "Data-Driven and Objective", 
        "Charismatic and Persuasive", "Cooperative and Team-Oriented", "Independent and Self-Assured",
        "Reserved and Introverted", "Outspoken and Confident", "Ambitious and Goal-Oriented",
        "Flexible and Open-Minded", "Competitive and Driven", "Thoughtful and Reflective",
        "Emotionally Reactive", "Calm and Composed", "Decisive and Pragmatic",
        "Detail-Oriented and Methodical", "Intuitive and Visionary", "Resourceful and Adaptable", 
        "Risk-Averse and Conservative", "Humorous and Lighthearted", "Respectful and Polite", 
        "Innovative and Forward-Thinking"
    ]

    employee_motivations = [
        "wants a raise to better support family due to increased living costs",
        "feels underpaid for the responsibilities taken on",
        "wants a raise in line with industry standards",
        "is planning to buy a house and needs higher income for mortgage",
        "believes they have grown in skills and wants to be recognized",
        "is seeking a salary increase after a successful project delivery",
        "has been receiving competitive offers from other companies",
        "is expecting a promotion and a corresponding raise",
        "has taken on additional responsibilities without a raise",
        "attained a higher education degree and is looking for a raise",
        "is aiming to save for a child's education", 
        "has been with the company for more than 5 years",
        "experiencing burnout and sees that a raise could alleviate this feeling",
        "doesn't have a performance review coming up soon or a raise schedule in place",
        "the company has reported strong earnings in its recent financial reports",
        "recently taken on more responsibility or started a new position",
        "has worked mainly at the office and does not have the option to work from home despite living long distance from work",
        "seeking a raise otherwise will need to relocate due to a better offer from a company in another city",
        "your managers frequently rely on you to pick up work from other team members",
        "received a high-paying offer from another company but doesn't want to leave your current role"
    ]

    manager_constraints = [
        "the company is facing budget constraints due to recent cost-cutting measures",
        "the company has posted record profits this quarter",
        "the department budget is tight, but employee retention is crucial",
        "management is prioritizing retention for critical roles",
        "the company has recently implemented a hiring freeze",
        "the company is on the brink of filing for bankruptcy",
        "the company has been acquired by another company in which new management will determine new compensation packages",
        "there is uncertainty in the market, making budgets more restrictive",
        "the company is launching a major initiative and needs to retain talent",
        "other employees in similar roles have not received raises recently",
        "HR policies require a thorough review before approving raises",
        "the company is undergoing an internal restructuring process"
    ]

    past_achievements = [
        "led three major projects that increased department productivity by 20%",
        "received an award for excellent customer feedback",
        "trained new hires and significantly reduced onboarding time",
        "automated a process that saved the company $50,000 annually",
        "initiated a successful cross-department collaboration project",
        "developed a tool that reduced report generation time by 50%",
        "solved a critical issue that prevented project delays worth millions",
        "mentored junior employees, improving their performance significantly",
        "secured a key client that generated $1 million in revenue",
        "proposed a new strategy that increased team efficiency"
    ]

    scenario = {
        "job_role": random.choice(job_roles),
        "employee_personality": random.choice(personalities),
        "hr_manager_personality": random.choice(personalities),
        "employee_motivation": random.choice(employee_motivations),
        "manager_constraint": random.choice(manager_constraints),
        "past_achievement": random.choice(past_achievements)}

    current_salary = round(random.randint(45000, 175000),-2)
    desired_percentage = random.uniform(0.05, 0.25) 
    desired_salary = round(current_salary + (current_salary * desired_percentage), -2)
    adjusted_desired_salary = desired_salary 

    return scenario, current_salary, desired_salary

#######################################################################################################################

#Function to track token usage
def token_usage_tracker(response, total_tokens_used, max_tokens):
    print(f"Response structure: {response}")
    #retrieve token usage
    used_tokens = response.get('usage', {}).get('total_tokens', 0)
    total_tokens_used += used_tokens

    print(f"Total tokens used in this turn: {used_tokens}. Total tokens so far: {total_tokens_used}/{max_tokens}")

    if total_tokens_used >= max_tokens:
        print("Maximum number of tokens for negotiation used. Must end conversation.")
    
    return total_tokens_used


#Conversation history
def generate_conversation_id():
    return str(uuid.uuid4())


#######################################################################################################################
#RAG Section
embedding_model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2') 
rag_document_location = "RAG"

#Used to process PDFs, split into chunks
def lang_chain_pdf_puller(rag_document_location):
    all_chunks = []
    file_list = []

    #Raises an error should the folder not exist. 
    if not os.path.exists(rag_document_location):
        raise FileNotFoundError(f"The folder '{rag_document_location}' does not exist.")
    
    #Reduce long text to avoid maximizing big chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
    for filename in os.listdir(rag_document_location):
        if filename.endswith(".pdf"):
            file_path = os.path.join(rag_document_location, filename)
            try:
                loader = PyPDFLoader(file_path)
                docs = loader.load()
                # Split documents into chunks
                chunks = text_splitter.split_documents(docs)
                # Extend Chunks
                all_chunks.extend(chunks)
                file_list.append(filename)
                print(f"Processed file: {filename} with {len(chunks)} chunks.")
            except Exception as e:
                print(f"Error processing {filename}: {e}")
    return all_chunks, file_list

all_chunks, file_list = lang_chain_pdf_puller(rag_document_location)
texts = [chunk.page_content for chunk in all_chunks]
vector_store = FAISS.from_texts(texts, embedding_model)

#Sifted through folder
def search_vector_db(query, vector_store, top_k=3):
    """Retrieve relevant information for given query to formulate better responses."""
    docs = vector_store.similarity_search(query, k=top_k)
    return " ".join([doc.page_content for doc in docs])

##########################################################################################################################
#conversation history to JSONL
def save_conversation_to_jsonl(conversation_history, conversation_id, file_name="negotiation_chats_selfplay.jsonl"):
    try:
        # Filter the conversation history to include only Employee and HR Manager roles
        filtered_conversation = [
            {"role": entry.get("role"), "content": entry.get("content")}
            for entry in conversation_history
            if entry.get("role") != "system"]

        #Negotiation entry
        negotiation_entry = {
            "ConversationID": conversation_id,
            "Negotiation": filtered_conversation}
        
        json_string = json.dumps(negotiation_entry, ensure_ascii=False, indent=None)

        #Append to file
        with open(file_name, mode='a', encoding='utf-8') as jsonlfile:
            jsonlfile.write(json_string + '\n')  # Ensure newline after each JSON object

        print(f"Successfully saved conversation with ConversationID {conversation_id} to {file_name}")
    except Exception as e:
        print(f"Error saving conversation with ConversationID {conversation_id}: {e}")


def save_metrics_to_csv(metrics, conversation_id, file_name="self_play_metrics.csv"):
    metric_fieldnames = [
        "ConversationID", 
        "Timestamp", 
        "Model",  
        "Personality",  
        "Initial Salary",  
        "Final Salary",  
        "Salary Percent Change",  
        "BLEU",  
        "ROUGE", 
        "BERTScore",  
        "Cosine Similarity",  
        "METEOR",
        "Avg Sentiment Score", 
        "Avg Response Length",  
        "Avg Relevance Score",  
        "Avg Coherence Score",  
        "Avg Combined GEval Score", 
        "MAUDE",  
        "Total Tokens",  
        "Avg Latency", 
        "Tokens per Second", 
        "Total Cost",  
        "Summary"] 
    
    #Add ConversationID to metrics
    metrics["ConversationID"] = conversation_id
    #take off unnecessary fields
    metrics = {key: metrics[key] for key in metric_fieldnames}
    #Ensure metrics contain all necessary keys
    for field in metric_fieldnames:
        if field not in metrics:
            metrics[field] = None  #Fill missing fields with None
            
    #Save metrics to the CSV file
    file_exists = os.path.isfile(file_name)
    with open(file_name, mode='a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=metric_fieldnames)
        if not file_exists:
            writer.writeheader()  #Write header only once
        writer.writerow(metrics)  #Write the metrics row
    
    print(f"Metrics saved to {file_name} for ConversationID {conversation_id}")

def save_scenario_to_csv(scenario, conversation_id, file_name="negotiation_scenarios.csv"):
    #CSV columns
    scenario_fieldnames = ["ConversationID", "job_role", "employee_personality", "hr_manager_personality", "employee_motivation", "manager_constraint", "past_achievement"]
    #Add ConversationID to the scenario dictionary
    scenario["ConversationID"] = conversation_id

    #Take off unnecessary fields and ensure all required fields are present
    scenario = {key: scenario[key] for key in scenario_fieldnames}
    for field in scenario_fieldnames:
        if field not in scenario:
            scenario[field] = None  #Fill missing fields with None

    # Save the scenario to the CSV file
    file_exists = os.path.isfile(file_name)
    with open(file_name, mode='a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=scenario_fieldnames)
        if not file_exists:
            writer.writeheader()  #Write header only once
        writer.writerow(scenario)  #Write the scenario row
    
    print(f"Successfully saved scenario for ConversationID {conversation_id} to {file_name}")

##########################################################################################################################
vader_analyzer = SentimentIntensityAnalyzer()
cosine_similarity_model = SentenceTransformer('stsb-roberta-large')

def cosine_similarity(reference, prediction):
    embedding1 = cosine_similarity_model.encode(reference, convert_to_tensor=True)
    embedding2 = cosine_similarity_model.encode(prediction, convert_to_tensor=True)
    similarity_score = util.pytorch_cos_sim(embedding1, embedding2)
    return similarity_score.item()
    
def corpus_bleu_eq(references, predictions):
    tokenized_references = [[word_tokenize(ref)] for ref in references]
    tokenized_predictions = [word_tokenize(pred) for pred in predictions]
    return corpus_bleu(tokenized_references, tokenized_predictions, smoothing_function=SmoothingFunction().method1)

def rouge_eq(reference, prediction):
    rouge = Rouge()
    scores = rouge.get_scores(prediction, reference)
    return scores[0]['rouge-l']['f']

def meteor_eq(reference, prediction):
    tokenizer = TreebankWordTokenizer()
    tokenized_reference = tokenizer.tokenize(reference)
    tokenized_prediction = tokenizer.tokenize(prediction)
    return meteor_score([tokenized_reference], tokenized_prediction)

def bertscore_metric(reference, prediction):
    P, R, F1 = bert_score([prediction], [reference], lang="en", verbose=False)
    return F1.mean().item()

def sentiment_score_def(text):
    """compound score: which ranges from -1 (negative) to +1 (positive)."""
    sentiment = vader_analyzer.polarity_scores(text)
    return sentiment['compound']

def response_length_def(text):
    """calc number of words in the text response."""
    return len(text.split())

#MAUDE
def maude_def(context, response, model="gpt-4"):
    """
    MAUDE score using GPT-4's chat-completions endpoint.
    """
    prompt = f"""Evaluate the following response in the context of the given conversation:
- Context: {context}
- Response: {response}

Rate the response from 0 to 1 based on the following criteria:
1. Coherence: How well the response logically follows from the context.
2. Relevance: How directly the response addresses the context.

Only provide the score as a numeric value between 0 and 1, with no additional explanation or text."""
    try:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an evaluation assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.0,
            max_tokens=10)
        #Extract the score from the response
        text_response = completion.choices[0].message['content'].strip()
        print(f"MAUDE Raw Response: {text_response}")
        if re.match(r'^\d+(\.\d+)?$', text_response):
            score = float(text_response)
        else:
            score = 0.0  #Handle invalid responses
        return score
    except Exception as e:
        print(f"Error in MAUDE scoring: {e}")
        return 0.0

#GEval metrics for relevance and coherence
def geval_def(context, response, criteria, model="gpt-4o"):
    """
    GE-val score for a given criteria (relevance or coherence).
    """
    prompt = f"""
    You are an evaluation assistant tasked with scoring a response based on the given criteria.
    
    Evaluation Task:
    - Criteria: {criteria}
    - Context: {context if context else 'N/A'}
    - Response: {response}
    
    Scoring Instructions:
    - Relevance: Score based on how directly the response addresses the context and aligns with a salary negotiation
        - A score of 0.9–1.0: The response directly addresses the main points in the context and is highly relevant
        - A score of 0.6–0.8: The response is partially relevant but may include unrelated or tangential content
        - A score of 0.0–0.5: The response does not address the context or is entirely irrelevant
    
    - Coherence: Score based on how logically structured and clear the response is for a salary negotiation
        - A score of 0.9–1.0: The response is well-structured, logical, and easy to understand.
        - A score of 0.6–0.8: The response is somewhat clear but may contain minor logical inconsistencies
        - A score of 0.0–0.5: The response is poorly structured, lacks logic, or is difficult to understand
    
    **Instructions**:
    1. Provide a numeric score between 0.0 and 1.0 based on the criteria
    2. Use only two decimal place in your score (e.g., 0.80, 0.90)
    3. Do not include any explanation or additional text in your response; provide only the score
    """
    try:
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an evaluation assistant."},
                {"role": "user", "content": prompt}],
            temperature=0.0,
            max_tokens=10)
        #Extract the score from the response
        text_response = completion.choices[0].message['content'].strip()
        print(f"GEval ({criteria}) Raw Response: {text_response}")
        if re.match(r'^\d+(\.\d+)?$', text_response):
            score = float(text_response)
        else:
            score = 0.0  #Handle invalid responses
        return score
    except Exception as e:
        print(f"Error in GEval scoring ({criteria}): {e}")
        return 0.0

#####################################################################################
#Reference text for BLEU, ROUGE, METEOR, BERTScore
reference_data = pd.read_csv("negotiation_references_combined.csv")

#Filter references by role
employee_references = reference_data[reference_data['role'] == 'Employee']['text'].tolist()
hr_manager_references = reference_data[reference_data['role'] == 'HR Manager']['text'].tolist()

#######################################################################################################
#Function to evaluate a single model 
def evaluate_individual_model(
    conversation_history, model_role, references, model_name, run_number, 
    current_salary, final_salary, model_personality, total_tokens_used, prompt_tokens,  
        completion_tokens, agreement_status):
    #Initialize metric containers
    bleu_scores, rouge_scores, meteor_scores, bert_scores, cosine_similarity_scores = [], [], [], [], []
    maude_scores = []
    relevance_scores, coherence_scores, combined_geval_scores = [], [], []
    sentiment_scores, response_lengths = [], []
    latencies = []
    total_time = 0  #Initialize total_time here
    total_tokens = 0

    #Summary of negotiation outcome
    summary = agreement_status

    #Filter responses for the given model role
    assistant_responses = [msg for msg in conversation_history if msg["role"] == model_role]

    #Process each response for metrics
    for i, assistant_response in enumerate(assistant_responses):
        prediction = assistant_response.get("content", "")
        context = " ".join([msg.get("content", "") for msg in conversation_history[:i]])

        #Calculate similarity metrics if reference exists
        if i < len(references):
            reference = references[i]
            bleu_scores.append(corpus_bleu_eq([reference], [prediction]))
            rouge_scores.append(rouge_eq(reference, prediction))
            meteor_scores.append(meteor_eq(reference, prediction))
            bert_scores.append(bertscore_metric(reference, prediction))
            cosine_similarity_scores.append(cosine_similarity(reference, prediction))

        #Calculate MAUDE score
        maude_scores.append(maude_def(context, prediction, model="gpt-4o"))

        #Calculate GEval metrics
        relevance_scores.append(geval_def(context, prediction, criteria="Relevance", model="gpt-4o"))
        coherence_scores.append(geval_def("", prediction, criteria="Coherence", model="gpt-4o"))
        combined_geval_scores.append(round(0.5 * relevance_scores[-1] + 0.5 * coherence_scores[-1], 3))

        #Sentiment and response length
        sentiment_scores.append(sentiment_score_def(prediction))
        response_lengths.append(response_length_def(prediction))

        #Track latency and token usage
        latency = assistant_response.get("latency", 0)
        latencies.append(latency)
        total_tokens += assistant_response.get("token_count", 0)
        total_time += latency
        usage = assistant_response.get('usage', {})
        prompt_tokens += usage.get('prompt_tokens', 0)
        completion_tokens += usage.get('completion_tokens', 0)

    #Ensure tokens per second is calculated after processing all responses
    tokens_per_second = total_tokens / total_time if total_time > 0 else 0

    #Calculate total cost based on token usage 
    input_cost = (prompt_tokens / 1000000) * 12.00  #$12 per 1M input tokens
    output_cost = (completion_tokens / 1000000) * 16.00  #$16 per 1M output tokens
    total_cost = input_cost + output_cost

    #Calculate salary percent change
    salary_percent_change = ((final_salary - current_salary) / current_salary) * 100
    if salary_percent_change == 0:
        print(f"No change in salary: Initial = {current_salary}, Final = {final_salary}")
    else:
        print(f"Salary change: {salary_percent_change}%")

    #Create metrics dictionary
    metrics_dictionary = {
        "BLEU": np.mean(bleu_scores) if bleu_scores else 0,
        "ROUGE": np.mean(rouge_scores) if rouge_scores else 0,
        "METEOR": np.mean(meteor_scores) if meteor_scores else 0,
        "BERTScore": np.mean(bert_scores) if bert_scores else 0,
        "Cosine Similarity": np.mean(cosine_similarity_scores) if cosine_similarity_scores else 0,
        "MAUDE": np.mean(maude_scores) if maude_scores else 0,
        "Avg Relevance Score": np.mean(relevance_scores) if relevance_scores else 0,
        "Avg Coherence Score": np.mean(coherence_scores) if coherence_scores else 0,
        "Avg Combined GEval Score": np.mean(combined_geval_scores) if combined_geval_scores else 0,
        "Avg Sentiment Score": np.mean(sentiment_scores) if sentiment_scores else 0,
        "Avg Response Length": np.mean(response_lengths) if response_lengths else 0,
        "Avg Latency": np.mean(latencies) if latencies else 0,
        "Tokens per Second": tokens_per_second,
        "Total Tokens": total_tokens,
        "Total Cost": total_cost,
        "Salary Percent Change": salary_percent_change,
        "Initial Salary": current_salary,
        "Final Salary": final_salary,
        "Summary": summary,
        "Timestamp": datetime.datetime.now().isoformat(),
        "Model": model_name,
        "Personality": model_personality}

    return metrics_dictionary

###########################################################################################################################
def extract_salary(response):
    """
    Extract the salary from a response with enhanced fallback strategies.
    """
    #Keywords that indicate salary
    salary_keywords = [
        "propose", "offer", "current salary", "desired salary",
        "counteroffer", "final offer", "salary of", "adjustment to",
        "compensation", "increase to", "base salary", "package"]

    #Match dollar amounts in various formats
    matches = re.findall(r'\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?', response)  #Match "$123,456.78"
    print(f"Extracted Matches: {matches}")  #Debugging statement for matches

    #If matches found, proceed to validate proximity to keywords
    if matches:
        for match in matches:
            #Clean and convert the matched dollar amount
            salary_amount = float(match.replace(",", "").replace("$", ""))

            #Check if the matched salary is near a keyword
            for keyword in salary_keywords:
                pattern = rf"{keyword}.*?{match}|{match}.*?{keyword}"  #Check proximity
                if re.search(pattern, response, re.IGNORECASE):
                    print(f"Extracted salary: {salary_amount} with keyword '{keyword}'")
                    return salary_amount  # Return the first valid match

        #Fallback: If no keywords are matched, assume the first dollar amount is a salary
        print(f"No keywords matched. Using first extracted match as fallback: {matches[0]}")
        return float(matches[0].replace(",", "").replace("$", ""))

    #Final fallback: If no matches, log and return None
    print(f"No valid salary found in response: {response}")
    return None


#Function intended to remove extraneous words and phrases that may be inadvertently printed 
#this helps remove gernated text issues we have seen constantly
def clean_response(response):
    #Remove or limit "Thank you" or similar phrases
    response = re.sub(r"(Thank you|I appreciate|truly).+?\.", "", response, flags=re.IGNORECASE).strip()
    #Remove placeholders like "[Employee's Name]"
    response = re.sub(r"\[.*?\]", "", response).strip()
    #Remove leading role identifiers like "HR Manager:" or "Employee:" or "Dear ..."
    response = re.sub(r"^(HR Manager:|Employee:|Dear\s.+?,)", "", response, flags=re.IGNORECASE).strip()
    #Remove excessive whitespace
    response = re.sub(r"\s+", " ", response).strip()
    #Remove bullet points, markdown-like formatting, and excessive symbols
    response = re.sub(r"(\*\*|--|\s*-)", "", response).strip()
    #Remove numbers or irrelevant prefixes (e.g., "94.")
    response = re.sub(r"^\d+\.?", "", response).strip()
    #Split into sentences and remove duplicates while preserving order
    sentences = [s.strip() for s in response.split(".") if s.strip()]
    seen = set()
    unique_sentences = []
    for s in sentences:
        if s not in seen and s != "":
            unique_sentences.append(s)
            seen.add(s)

    #Rejoin sentences
    response = ". ".join(unique_sentences).strip()

    #Remove repeated leading "We" or "I" (rare edge case)
    response = re.sub(r"^(We\s+We|I\s+I)\s+", "", response, flags=re.IGNORECASE)

    #Ensure proper capitalization of the first letter
    if response and not response[0].isupper():
        response = response[0].upper() + response[1:]

    return response



def evaluate_negotiation_outcome(conversation_history, model="gpt-4o"):
    """
    Determines whether the negotiation ended in an agreement or no agreement
    based on the last four turns of the conversation using GPT-4o.
    """
    #Extract the last four turns
    last_turns = conversation_history[-4:]
    last_turns_text = "\n".join(
        f"{entry['role']}: {entry['content']}" for entry in last_turns)

    #Create the evaluation prompt
    evaluation_prompt = f"""
    Below is the conversation history of a salary negotiation. 
    Review the final four turns and determine whether the negotiation ended in an agreement or no agreement. 
    Your response must include one of the following:
    - "Agreement" if both parties agreed on a salary or terms.
    - "No Agreement" if the negotiation ended without mutual consent.

    Final Four Turns:
    {last_turns_text}

    Outcome:
    """

    #Call the GPT-4o model to evaluate the outcome
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an evaluation assistant."},
                {"role": "user", "content": evaluation_prompt.strip()}
            ],
            max_tokens=50,
            temperature=0.0)
        outcome = response.choices[0].message['content'].strip()
        print(f"Evaluation Bot Outcome: {outcome}")
        return outcome
    except Exception as e:
        print(f"Error evaluating negotiation outcome: {e}")
        return "Error"



###########################################################################################################################

def run_negotiation(
    model_employee,
    model_manager,
    scenario,
    current_salary,
    desired_salary,
    max_turns,
    sim_token_limit):
    """
    Simulates a salary negotiation where HR Manager always concludes the conversation.
    Ensures strict role alternation and avoids double turns.
    """
    conversation_history = []
    total_tokens_used = 0
    current_offer = current_salary
    highest_counteroffer = current_salary
    total_time = 0
    prompt_tokens = 0
    completion_tokens = 0

    starting_role = random.choice(["Employee", "HR Manager"])
    current_role = starting_role
    roles = {"Employee": model_employee, "HR Manager": model_manager}

    # Introductory message
    if starting_role == "Employee":
        intro_message = f"""
        Thank you for taking the time to meet today. I wanted to discuss my compensation and explore how we can align 
        it more closely with my contributions and the market standards. My current salary is ${current_salary}, and 
        I believe there is room for adjustment to better reflect my role.
        """
    else:
        intro_message = f"""
        Thank you for meeting with us today to discuss your compensation. 
        We value your work at the company and aim to have a discussion.
        Currently, your salary is ${current_salary}, and we are open to discussing an adjustment.
        """

    conversation_history.append({
        "role": starting_role,
        "content": intro_message.strip(),
        "latency": 0,
        "token_count": 0})

    agreement_status = "No Agreement"
    minimum_turns = max_turns // 2

    #Negotiation loop
    for turn in range(max_turns):
        #Ensure role alternation
        current_role = "HR Manager" if current_role == "Employee" else "Employee"
        model = roles[current_role]
        tone = scenario["employee_personality"] if current_role == "Employee" else scenario["hr_manager_personality"]

        memory_context = "\n".join(f"{entry['role']}: {entry['content']}" for entry in conversation_history[-4:])
        prompt = f"""
        The employee's current salary is ${current_salary}.
        The company's current offer is ${current_offer}.
        The employee's desired salary is ${desired_salary}.
        The highest counteroffer so far is ${highest_counteroffer}.
        
        Context:
        {memory_context}

        Instructions:
        - Respond as the {current_role} in a {tone} tone.
        - Avoid repeating points; introduce new arguments or compromises.
        - Suggest creative solutions (e.g., bonuses, phased raises, non-monetary benefits).
        - Be concise and professional.
        """

        try:
            start_time = time.time()
            response = openai.ChatCompletion.create(
                model=model,
                messages=[
                    {"role": "system", "content": f"You are the {current_role} in this negotiation."},
                    {"role": "user", "content": prompt.strip()}],
                max_tokens=200,
                temperature=0.7)
            latency = time.time() - start_time
            total_time += latency

            response_content = response.choices[0].message['content'].strip()
            response_tokens = response['usage']['total_tokens']
            total_tokens_used += response_tokens

            response_content = clean_response(response_content)
            conversation_history.append({
                "role": current_role,
                "content": response_content,
                "latency": latency,
                "token_count": response_tokens})

            print(f"Turn {turn + 1} ({current_role}):\n{response_content}")

            #Update offers dynamically
            if current_role == "Employee" and "$" in response_content:
                extracted_salary = extract_salary(response_content)
                if extracted_salary:
                    highest_counteroffer = max(highest_counteroffer, extracted_salary)
                    current_offer = max(current_offer, highest_counteroffer - 5000)
            elif current_role == "HR Manager" and "$" in response_content:
                extracted_salary = extract_salary(response_content)
                if extracted_salary:
                    current_offer = max(current_offer, extracted_salary)
                    highest_counteroffer = max(highest_counteroffer, current_offer + 5000)

            #Early agreement check after minimum turns
            if turn >= minimum_turns:
                current_outcome = evaluate_negotiation_outcome(conversation_history)
                if current_outcome == "Agreement":
                    agreement_status = "Agreement"
                    break

        except Exception as e:
            print(f"Error generating response for {current_role}: {e}")
            break

    #HR Manager Final Turn to Conclude the Negotiation
    closing_prompt = """
    Conclude the negotiation as the HR Manager:
    - If Agreement: Express enthusiasm, confirm the agreed terms, and highlight the employee's future contributions.
    - If No Agreement: Respectfully acknowledge the decision, express gratitude, and leave the door open for future discussions.
    """
    hr_manager_response = openai.ChatCompletion.create(
        model=model_manager,
        messages=[
            {"role": "system", "content": "You are the HR Manager concluding the negotiation."},
            {"role": "user", "content": closing_prompt.strip()}],
        max_tokens=150,
        temperature=0.5)
    hr_manager_content = clean_response(hr_manager_response.choices[0].message['content'].strip())
    #Append only HR Manager's final response
    if conversation_history[-1]['role'] != "HR Manager":  #Prevent consecutive HR Manager turns
        conversation_history.append({
            "role": "HR Manager",
            "content": hr_manager_content,
            "latency": 0,
            "token_count": hr_manager_response['usage']['total_tokens']})

    print(f"Final HR Manager Turn:\n{hr_manager_content}")
    #Final Evaluation
    agreement_status = evaluate_negotiation_outcome(conversation_history)
    return conversation_history, current_offer, total_tokens_used, prompt_tokens, completion_tokens, agreement_status





#Negotiation Loop
for run in range(num_negotiations):
    scenario, current_salary, desired_salary = generate_random_scenario()
    conversation_id = generate_conversation_id()

    #Updated to unpack all returned values
    conversation, current_offer, total_tokens_used, prompt_tokens, completion_tokens, agreement_reached = run_negotiation(
        model_employee=model_employee,
        model_manager=model_manager,
        scenario=scenario,
        current_salary=current_salary,
        desired_salary=desired_salary,
        max_turns=max_turns,
        sim_token_limit=sim_token_limit)


    #Save negotiation details
    save_conversation_to_jsonl(conversation, conversation_id, file_name="negotiation_chats_selfplay_FT.jsonl")
    final_salary = current_offer if agreement_reached == "Agreement" else current_salary
    
    model_1_metrics = evaluate_individual_model(
        conversation_history=conversation,
        model_role="Employee",
        references=employee_references,
        model_name="Model 1",
        run_number=run,
        current_salary=current_salary,
        final_salary = current_offer if agreement_reached == "Agreement" else current_salary,
        model_personality=scenario["employee_personality"],
        total_tokens_used=total_tokens_used,
        prompt_tokens=prompt_tokens,  
        completion_tokens=completion_tokens,  
        agreement_status=agreement_reached)
    
    model_2_metrics = evaluate_individual_model(
        conversation_history=conversation,
        model_role="HR Manager",
        references=hr_manager_references,
        model_name="Model 2",
        run_number=run,
        current_salary=current_salary,
        final_salary= current_offer if agreement_reached == "Agreement" else current_salary,  
        model_personality=scenario["hr_manager_personality"],
        total_tokens_used=total_tokens_used,
        prompt_tokens=prompt_tokens,  
        completion_tokens=completion_tokens,  
        agreement_status=agreement_reached)


    #Save metrics for the run
    save_metrics_to_csv(model_1_metrics, conversation_id, file_name="model_1_metrics_FT.csv")
    save_metrics_to_csv(model_2_metrics, conversation_id, file_name="model_2_metrics_FT.csv")
    save_scenario_to_csv(scenario, conversation_id, file_name="negotiation_scenarios_FT.csv")

    print(f"Run {run} Summary:")
    print(f"Initial Salary: ${current_salary}")
    print(f"Desired Salary: ${desired_salary}")
    print(f"Final Offer: ${current_offer}")

Processed file: 62 Business English Negotiation Phrases _ FluentU.pdf with 23 chunks.
Processed file: business-english-negotiations-jigsaw-dialogues-and-useful-phrases.pdf with 6 chunks.
Processed file: main_negotiations.pdf with 93 chunks.
Processed file: Transcript Example.pdf with 9 chunks.
Turn 1 (HR Manager):
While aligning compensation with market standards is important, our current offer of $126,200 is competitive based on our internal benchmarks and budget constraints. However, I’m open to exploring other forms of recognition. How about a performancebased bonus or additional nonmonetary benefits like professional development opportunities?
Extracted Matches: ['$126,200']
No keywords matched. Using first extracted match as fallback: $126,200
Turn 2 (Employee):
These are valuable, but I still believe aligning my salary closer to $145,400 would better reflect my contributions and market standards. Could we consider a phased approach to reach this salary over time? This could balan

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
Salary change: 14.50079239302694%


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.9
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
Salary change: 14.50079239302694%
Metrics saved to model_1_metrics_FT.csv for ConversationID 4f72b9de-df84-42d8-a190-7b58b3cc69e2
Metrics saved to model_2_metrics_FT.csv for ConversationID 4f72b9de-df84-42d8-a190-7b58b3cc69e2
Successfully saved scenario for ConversationID 4f72b9de-df84-42d8-a190-7b58b3cc69e2 to negotiation_scenarios_FT.csv
Run 0 Summary:
Initial Salary: $126200
Desired Salary: $145400.0
Final Offer: $144500.0
Turn 1 (HR Manager):
While your current salary is $128,400, and we recognize your desire for an increase to $135,900, we're currently at our offer of $128,400. However, we're open to finding a middle ground. How about we explore a performancebased bonus that could bring your total compensation closer to what you're aiming for? Alternatively, we could discuss nonmonetary benefits that might enhance your role. What are your thoughts on these options?
Extracted Matches: ['

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
Salary change: 3.894080996884735%


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.70
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.2
GEval (Relevance) Raw Response: 0.30
GEval (Coherence) Raw Response: 0.95
Salary change: 3.894080996884735%
Metrics saved to model_1_metrics_FT.csv for ConversationID b8be503b-92b1-4141-912a-e052eddebcf2
Metrics saved to model_2_metrics_FT.csv for ConversationID b8be503b-92b1-4141-912a-e052eddebcf2
Successfully saved scenario for ConversationID b8be503b-92b1-4141-912a-e052eddebcf2 to negotiation_scenarios_FT.csv
Run 1 Summary:
Initial Salary: $128400
Desired Salary: $135900.0
Final Offer: $133400.0
Turn 1 (HR Manager):
The company has already offered $95,900, which aligns with your current salary. Considering the budget constraints and market standards, a jump to $118,500 seems quite ambitious. However, let's explore some creative solutions. How about a onetime performance bonus to recognize your contributions? Alternatively, we could discuss a phased raise over the next year, contingent on specific performance targets. Nonmonetary benefits like additional vacat

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
No change in salary: Initial = 95900, Final = 95900


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.9
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.9
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.9
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.70
GEval (Coherence) Raw Response: 0.70
No change in salary: Initial = 95900, Final = 95900
Metrics saved to model_1_metrics_FT.csv for ConversationID c9a03cb0-b1f7-4467-8c6a-b5f6cc8da74e
Metrics saved to model_2_metrics_FT.csv for ConversationID c9a03cb0-b1f7-4467-8c6a-b5f6cc8da74e
Successfully saved scenario for ConversationID c9a03cb0-b1f7-4467-8c6a-b5f6cc8da74e to negotiation_scenarios_FT.csv
Run 2 Summary:
Initial Salary: $95900
Desired Salary: $118500.0
Final Offer: $118500.0
Turn 1 (Employee):
While my desired salary is $113,300, I understand we're currently at $97,200. To bridge this gap, I'm open to creative solutions. How about a phased raise to $113,300 over the next year, starting with an immediate increase to $105,000? Alternatively, a onetime performance bonus could help recognize my contributions while aligning with budget constraints. Nonmonetary benefits like additional vacation days or professional development 

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
No change in salary: Initial = 97200, Final = 97200


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.9
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 1.00


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.8
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.9
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.9
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
No change in salary: Initial = 97200, Final = 97200
Metrics saved to model_1_metrics_FT.csv for ConversationID ce9c547f-a3a9-4353-b8f1-fb2b94f3ec93
Metrics saved to model_2_metrics_FT.csv for ConversationID ce9c547f-a3a9-4353-b8f1-fb2b94f3ec93
Successfully saved scenario for ConversationID ce9c547f-a3a9-4353-b8f1-fb2b94f3ec93 to negotiation_scenarios_FT.csv
Run 3 Summary:
Initial Salary: $97200
Desired Salary: $113300.0
Final Offer: $108300.0
Turn 1 (HR Manager):
We appreciate your dedication and the contributions you’ve made to the company. While we value your request to align your salary more closely with the market standards, we are currently at the $62,700 mark. Given this situation, I'd like to explore some creative solutions that could bridge the gap between your current compensation and your desired salary of $74,900. One possibility is to consider a performancebased bonus that reco

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.0
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.9
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.80


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.9
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
Salary change: 11.483253588516746%


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.2
GEval (Relevance) Raw Response: 0.20
GEval (Coherence) Raw Response: 0.90
Salary change: 11.483253588516746%
Metrics saved to model_1_metrics_FT.csv for ConversationID 88a92cc5-116d-45a2-9a5d-8232c0b31bbc
Metrics saved to model_2_metrics_FT.csv for ConversationID 88a92cc5-116d-45a2-9a5d-8232c0b31bbc
Successfully saved scenario for ConversationID 88a92cc5-116d-45a2-9a5d-8232c0b31bbc to negotiation_scenarios_FT.csv
Run 4 Summary:
Initial Salary: $62700
Desired Salary: $74900.0
Final Offer: $69900.0
Turn 1 (Employee):
To bridge this gap, how about a phased raise? We could start with $165,000 and plan to reach my desired salary over time. Alternatively, a performancebased bonus or additional stock options could be a creative way to align our interests. What are your thoughts?
Extracted Matches: ['$165,000']
No keywords matched. Using first extracted match as fallback: $165,000
Turn 2 (HR Manager):
It seems a bit speculative given our current budget constraints. Whil

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.8
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.80


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
No change in salary: Initial = 156300, Final = 156300


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.90


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.7
GEval (Relevance) Raw Response: 0.80
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.8
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.80


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.7
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95
No change in salary: Initial = 156300, Final = 156300
Metrics saved to model_1_metrics_FT.csv for ConversationID 8f5a6dd5-a99a-4a77-bc52-85468b84f763
Metrics saved to model_2_metrics_FT.csv for ConversationID 8f5a6dd5-a99a-4a77-bc52-85468b84f763
Successfully saved scenario for ConversationID 8f5a6dd5-a99a-4a77-bc52-85468b84f763 to negotiation_scenarios_FT.csv
Run 5 Summary:
Initial Salary: $156300
Desired Salary: $187000.0
Final Offer: $170000.0
Turn 1 (HR Manager):
We value your contributions and want to ensure you're fairly compensated. While our current offer is $118,400, I understand you're looking for $139,800. Let's explore some creative solutions. How about we discuss a performancebased bonus structure that could bring you closer to your desired salary? Additionally, we could look into nonmonetary benefits like professional development opportunities or increased flexibility that mig

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: Relevance: 1.00
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.90
No change in salary: Initial = 118400, Final = 118400


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.70
GEval (Coherence) Raw Response: 0.70
No change in salary: Initial = 118400, Final = 118400
Metrics saved to model_1_metrics_FT.csv for ConversationID f5971208-da9c-43c7-8377-dfb8722309a9
Metrics saved to model_2_metrics_FT.csv for ConversationID f5971208-da9c-43c7-8377-dfb8722309a9
Successfully saved scenario for ConversationID f5971208-da9c-43c7-8377-dfb8722309a9 to negotiation_scenarios_FT.csv
Run 6 Summary:
Initial Salary: $118400
Desired Salary: $139800.0
Final Offer: $123400.0
Turn 1 (HR Manager):
We appreciate your commitment and contributions to the company. While our current offer is $104,900, we recognize your desire for alignment with market standards. Given our budget constraints, we can explore creative solutions. How about considering a performancebased bonus or phased salary raise? Additionally, we can discuss enhancing nonmonetary benefits like professional development opportunities. Let's find a way to acknowl

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.0
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95
No change in salary: Initial = 104900, Final = 104900


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.2
GEval (Relevance) Raw Response: 0.60
GEval (Coherence) Raw Response: 0.90
No change in salary: Initial = 104900, Final = 104900
Metrics saved to model_1_metrics_FT.csv for ConversationID 5c2ff321-fc00-4337-b498-af07bdb8c2d5
Metrics saved to model_2_metrics_FT.csv for ConversationID 5c2ff321-fc00-4337-b498-af07bdb8c2d5
Successfully saved scenario for ConversationID 5c2ff321-fc00-4337-b498-af07bdb8c2d5 to negotiation_scenarios_FT.csv
Run 7 Summary:
Initial Salary: $104900
Desired Salary: $126400.0
Final Offer: $104900
Turn 1 (Employee):
I understand our current offer is $66,000, and I'm hoping we can find a path toward my desired salary of $77,100. Given our current difference, perhaps we could explore a phased approach to reach this target. For instance, an immediate adjustment to $70,000, with a review in six months based on performance. Alternatively, we could consider a onetime performance bonus or additional nonmonetary benefits, such as extra vacation days o

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.7
GEval (Relevance) Raw Response: 0.90
GEval (Coherence) Raw Response: 0.95
Salary change: 9.242424242424242%


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
Salary change: 9.242424242424242%
Metrics saved to model_1_metrics_FT.csv for ConversationID 09d6cc14-ea06-4aad-aafc-9357dfb6a22c
Metrics saved to model_2_metrics_FT.csv for ConversationID 09d6cc14-ea06-4aad-aafc-9357dfb6a22c
Successfully saved scenario for ConversationID 09d6cc14-ea06-4aad-aafc-9357dfb6a22c to negotiation_scenarios_FT.csv
Run 8 Summary:
Initial Salary: $66000
Desired Salary: $77100.0
Final Offer: $72100.0
Turn 1 (Employee):
Given my contributions and the current market trends, I believe a salary closer to $142,000 would better reflect my value to the company. However, I'm open to creative solutions. Perhaps we could consider a phased raise, starting with an increase to $130,000, with a performance review in six months to discuss further adjustments. Alternatively, a onetime bonus could bridge the gap while we work towards my desired salary. I'm confident we can find a solut

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.5
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
Salary change: 13.316790736145576%


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 0.0
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


MAUDE Raw Response: 1
GEval (Relevance) Raw Response: 0.95
GEval (Coherence) Raw Response: 0.95
Salary change: 13.316790736145576%
Metrics saved to model_1_metrics_FT.csv for ConversationID b21048bd-641c-41f0-ab48-c903bfd82e7b
Metrics saved to model_2_metrics_FT.csv for ConversationID b21048bd-641c-41f0-ab48-c903bfd82e7b
Successfully saved scenario for ConversationID b21048bd-641c-41f0-ab48-c903bfd82e7b to negotiation_scenarios_FT.csv
Run 9 Summary:
Initial Salary: $120900
Desired Salary: $142000.0
Final Offer: $137000.0
