In [25]:
import time
import pandas as pd
from typing import List, Dict, Any
import os
from datasets import load_dataset
from groq import Groq
from datasets import load_dataset

from datasets import concatenate_datasets
from transformers import AutoTokenizer, AutoModel
import torch

from tqdm import tqdm



In [18]:
from datasets import load_dataset
 
ds = load_dataset("routellm/gpt4_dataset")

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train.jsonl:   0%|          | 0.00/290M [00:00<?, ?B/s]

valid.jsonl:   0%|          | 0.00/26.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/109101 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [19]:
ds

DatasetDict({
    train: Dataset({
        features: ['prompt', 'source', 'gpt4_response', 'mixtral_response', 'mixtral_score'],
        num_rows: 109101
    })
    validation: Dataset({
        features: ['prompt', 'source', 'gpt4_response', 'mixtral_response', 'mixtral_score'],
        num_rows: 10000
    })
})

In [20]:
ds['train'][0]

{'prompt': "I'll give you a review, can you extract the food aspects and the opinion words of these aspects and analyze the sentiment of these opinion from this review? the review is:They tore the old NAME_1 down then built another one...? Anyway, they sell wine and beer and snacks and have a seating area inside and outside to eat. Besides gas, the big draw is the Made to Order food. I ordered some tacos and French toast sticks both were pretty good. I think I'd like to try more snacks.And they're open 24/7.",
 'source': ['lmsys-chat-1m'],
 'gpt4_response': "Sure, here's the analysis of the review:\n\n1. Aspect: Wine and beer\n   Opinion Words: sell\n   Sentiment: Neutral\n\n2. Aspect: Snacks\n   Opinion Words: sell, try more\n   Sentiment: Positive\n\n3. Aspect: Seating area (inside and outside)\n   Opinion Words: have\n   Sentiment: Neutral\n\n4. Aspect: Gas\n   Opinion Words: sell\n   Sentiment: Neutral\n\n5. Aspect: Made to Order food\n   Opinion Words: big draw\n   Sentiment: Posi

In [21]:
num_queries = len(ds['train'])
print(f"Number of queries in the training set: {num_queries}")


Number of queries in the training set: 109101


In [26]:


# Load the embedding model (adjust model path if needed)
model_name = "sentence-transformers/all-MiniLM-L6-v2"  # Replace with your embedding model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Function to compute embeddings
def get_embeddings(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)  # Use mean pooling
    return embeddings.cpu().numpy()


# Step 1: Subset the dataset
train_subset = ds['train'].select(range(2000))
validation_subset = ds['validation'].select(range(200))

# Combine the subsets
subset_ds = concatenate_datasets([train_subset, validation_subset])

# Now you can proceed with embedding and saving

# Step 2: Embed the queries
query_embeddings = []
for batch in tqdm(subset_ds['prompt'], desc="Embedding queries"):
    embeddings = get_embeddings([batch])
    query_embeddings.append(embeddings[0])  # Append the embedding of the current query

# Step 3: Add embeddings to the dataset
subset_ds = subset_ds.add_column("query_embedding", query_embeddings)

# Step 4: Convert to pandas DataFrame and save as CSV
df = subset_ds.to_pandas()
csv_path = "subset_dataset_with_embeddings.csv"
df.to_csv(csv_path, index=False)

print(f"Dataset with embeddings saved to {csv_path}")


Embedding queries: 100%|██████████| 2200/2200 [01:13<00:00, 29.87it/s]


Dataset with embeddings saved to subset_dataset_with_embeddings.csv


In [None]:

class QueryEvaluator:
    def __init__(self, api_key: str):
        """
        Initialize the evaluator with API key.
        """
        self.client = Groq(api_key=api_key)
        self.models = {
            "mixtral-8x7b-32768": "mixtral",
            "llama-3.1-70b-versatile": "llama"
        }

    def evaluate_query(self, query: str, model: str, max_tokens: int = 200) -> dict:
        """
        Evaluate a single query using the specified model.
        """
        start_time = time.time()
        try:
            completion = self.client.chat.completions.create(
                messages=[{"role": "user", "content": query}],
                model=model,
                max_tokens=max_tokens,
            )
            response_time = time.time() - start_time
            response = completion.choices[0].message.content
            estimated_tokens = len(query.split()) + len(response.split())
            
            return {
                "success": True,
                "response": response,
                "latency": response_time,
                "estimated_tokens": estimated_tokens,
                "error": None,
            }
        except Exception as e:
            return {
                "success": False,
                "response": None,
                "latency": time.time() - start_time,
                "estimated_tokens": 0,
                "error": str(e),
            }

    def process_queries(self, queries: list, max_tokens: int = 200) -> pd.DataFrame:
        """
        Process a list of queries and capture results from both models.
        """
        results = []
        for i, query in enumerate(queries, 1):
            print(f"Processing query {i}/{len(queries)}...")

            # Store responses for both models
            model_responses = {"query": query}
            
            for model, model_name in self.models.items():
                print(f"  Querying model: {model}")
                result = self.evaluate_query(query, model, max_tokens)
                
                model_responses[f"{model_name}_response"] = result["response"]
                model_responses[f"{model_name}_latency"] = result["latency"]
                model_responses[f"{model_name}_success"] = result["success"]
                model_responses[f"{model_name}_error"] = result["error"]
            
            results.append(model_responses)
        
        return pd.DataFrame(results)

def main():
    # Your Groq API key
    api_key = "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"
    
    # Load the prepared dataset with 2200 queries
    queries = subset_ds["prompt"]  # Assuming `subset_ds` is already prepared

    # Initialize the evaluator
    evaluator = QueryEvaluator(api_key)
    
    # Process queries with both models
    results_df = evaluator.process_queries(queries)

    # Save results to CSV for later analysis
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    results_df.to_csv(f"query_comparisons_{timestamp}.csv", index=False)

    # Print summary of captured data
    print("\nSample Data:")
    print("===================")
    print(results_df.head())

if __name__ == "__main__":
    main()


Processing prompt 1/1000...
Processing prompt 2/1000...
Processing prompt 3/1000...
Processing prompt 4/1000...
Processing prompt 5/1000...
Processing prompt 6/1000...
Processing prompt 7/1000...
Processing prompt 8/1000...
Processing prompt 9/1000...
Processing prompt 10/1000...
Processing prompt 11/1000...
Processing prompt 12/1000...
Processing prompt 13/1000...
Processing prompt 14/1000...
Processing prompt 15/1000...
Processing prompt 16/1000...
Processing prompt 17/1000...
Processing prompt 18/1000...
Processing prompt 19/1000...
Processing prompt 20/1000...
Processing prompt 21/1000...
Processing prompt 22/1000...
Processing prompt 23/1000...
Processing prompt 24/1000...
Processing prompt 25/1000...
Processing prompt 26/1000...
Processing prompt 27/1000...
Processing prompt 28/1000...
Processing prompt 29/1000...
Processing prompt 30/1000...
Processing prompt 31/1000...
Processing prompt 32/1000...
Processing prompt 33/1000...
Processing prompt 34/1000...
Processing prompt 35/10

KeyboardInterrupt: 

In [30]:
results_df=pd.read_csv("query_comparisons_20241210_021314.csv")

In [34]:
results_df.shape

(2200, 9)

In [36]:
results_df.head()

Unnamed: 0,query,mixtral_response,mixtral_latency,mixtral_success,mixtral_error,llama_response,llama_latency,llama_success,llama_error
0,"I'll give you a review, can you extract the fo...",Food aspects:\n\n1. Made to Order food\n2. Tac...,0.980199,True,,The food aspects mentioned in the review with ...,1.268872,True,
1,"Answer the following question: Claim: ""Joker m...",The answer is yes.\n\nChain of thoughts: Stan ...,0.537827,True,,My answer is: yes.\n\nChain of thoughts: Stan ...,0.953742,True,
2,TASK DEFINITION: In this task you will be give...,"ZdoublexpropheciesS\n\nIn this task, you are g...",0.680184,True,,"The solution for the problem ['Z', 'double', '...",0.504239,True,
3,"Detailed Instructions: In this task, you need ...","Yes, 'station' is one of the longest words in ...",0.462744,True,,"To solve this task, we need to count the numbe...",1.238112,True,
4,A guy pick up a woman Then he puts her down Ex...,This joke is a play on words and relies on the...,0.70255,True,,This joke is a play on words. When someone 'pi...,1.235937,True,


In [37]:

#
embeddings_df = pd.read_csv("subset_dataset_with_embeddings.csv")  # Replace with your embeddings file path
embeddings_df.head()


Unnamed: 0,prompt,source,gpt4_response,mixtral_response,mixtral_score,query_embedding
0,"I'll give you a review, can you extract the fo...",['lmsys-chat-1m'],"Sure, here's the analysis of the review:\n\n1....",Food aspects and opinion words:\n\n1. Made to ...,4,[-1.65989585e-02 4.53538746e-02 3.10476087e-...
1,"Answer the following question: Claim: ""Joker m...",['flan_v2_cot'],The answer is no.\nChain of thoughts: Stan Lee...,The answer is no.\n\nChain of thoughts: While ...,5,[-1.00538790e-01 -3.46924104e-02 -5.86108454e-...
2,TASK DEFINITION: In this task you will be give...,['flan_v2_niv2'],ZdoublexpropheciesS,"ZdoublexpropheciesS\n\nIn this task, you are a...",5,[-4.94029261e-02 1.42426789e-01 6.33770302e-...
3,"Detailed Instructions: In this task, you need ...",['flan_v2_niv2'],Yes,"No, 'station' is not the longest word in the s...",5,[ 1.41500324e-01 -8.15508328e-03 -3.68737467e-...
4,A guy pick up a woman Then he puts her down Ex...,['sharegpt'],This phrase could be interpreted as a joke bec...,This joke is a play on words and relies on the...,5,[-1.86245084e-01 1.05034925e-01 -7.69985616e-...


In [41]:
# Rename the 'prompt' column to 'query' and select specific columns
df = embeddings_df.rename(columns={"prompt": "query"})[["query", "gpt4_response", "query_embedding"]]

print(df.head())


                                               query  \
0  I'll give you a review, can you extract the fo...   
1  Answer the following question: Claim: "Joker m...   
2  TASK DEFINITION: In this task you will be give...   
3  Detailed Instructions: In this task, you need ...   
4  A guy pick up a woman Then he puts her down Ex...   

                                       gpt4_response  \
0  Sure, here's the analysis of the review:\n\n1....   
1  The answer is no.\nChain of thoughts: Stan Lee...   
2                                ZdoublexpropheciesS   
3                                                Yes   
4  This phrase could be interpreted as a joke bec...   

                                     query_embedding  
0  [-1.65989585e-02  4.53538746e-02  3.10476087e-...  
1  [-1.00538790e-01 -3.46924104e-02 -5.86108454e-...  
2  [-4.94029261e-02  1.42426789e-01  6.33770302e-...  
3  [ 1.41500324e-01 -8.15508328e-03 -3.68737467e-...  
4  [-1.86245084e-01  1.05034925e-01 -7.69985616e-..

In [42]:
 #Step 2: Ensure the embedding dataset has a common column (e.g., 'query')
# Assuming embeddings_df contains 'query' and 'query_embedding' columns

# Step 3: Merge the embeddings into the result dataframe
merged_df = pd.merge(results_df, df, on="query", how="left")

# Step 4: Save the merged dataframe
merged_df.to_csv("merged_df.csv", index=False)

print("Merged dataframe saved as 'result_with_embeddings.csv'.")


Merged dataframe saved as 'result_with_embeddings.csv'.


In [43]:
merged_df.head()

Unnamed: 0,query,mixtral_response,mixtral_latency,mixtral_success,mixtral_error,llama_response,llama_latency,llama_success,llama_error,gpt4_response,query_embedding
0,"I'll give you a review, can you extract the fo...",Food aspects:\n\n1. Made to Order food\n2. Tac...,0.980199,True,,The food aspects mentioned in the review with ...,1.268872,True,,"Sure, here's the analysis of the review:\n\n1....",[-1.65989585e-02 4.53538746e-02 3.10476087e-...
1,"Answer the following question: Claim: ""Joker m...",The answer is yes.\n\nChain of thoughts: Stan ...,0.537827,True,,My answer is: yes.\n\nChain of thoughts: Stan ...,0.953742,True,,The answer is no.\nChain of thoughts: Stan Lee...,[-1.00538790e-01 -3.46924104e-02 -5.86108454e-...
2,TASK DEFINITION: In this task you will be give...,"ZdoublexpropheciesS\n\nIn this task, you are g...",0.680184,True,,"The solution for the problem ['Z', 'double', '...",0.504239,True,,ZdoublexpropheciesS,[-4.94029261e-02 1.42426789e-01 6.33770302e-...
3,"Detailed Instructions: In this task, you need ...","Yes, 'station' is one of the longest words in ...",0.462744,True,,"To solve this task, we need to count the numbe...",1.238112,True,,Yes,[ 1.41500324e-01 -8.15508328e-03 -3.68737467e-...
4,A guy pick up a woman Then he puts her down Ex...,This joke is a play on words and relies on the...,0.70255,True,,This joke is a play on words. When someone 'pi...,1.235937,True,,This phrase could be interpreted as a joke bec...,[-1.86245084e-01 1.05034925e-01 -7.69985616e-...


In [45]:
import pandas as pd
import time
from groq import Groq  # Assuming this is your LLM API library

class JudgeLLM:
    def __init__(self, api_key: str, judge_model: str = "llama-3.3-70b-versatile"):
        """
        Initialize the judge model with the API key.
        """
        self.client = Groq(api_key=api_key)
        self.judge_model = judge_model

    def evaluate_responses(self, query: str, mixtral_response: str, llama_response: str, max_tokens: int = 200) -> dict:
        """
        Evaluate the quality of two model responses using the judge model.
        """
        prompt = (
            f"You are an impartial judge tasked with evaluating two responses to the same query.\n\n"
            f"Query: {query}\n\n"
            f"Response 1 (Mixtral): {mixtral_response}\n\n"
            f"Response 2 (Llama): {llama_response}\n\n"
            f"Your task is to compare the two responses based on clarity, relevance, and completeness. "
            f"Assign a score between 0 and 10 for each response and provide reasoning for your scores.\n\n"
            f"Format:\n"
            f"Response 1 score: [score]\nResponse 2 score: [score]\nReasoning: [reasoning]\n"
        )

        try:
            completion = self.client.chat.completions.create(
                messages=[{"role": "user", "content": prompt}],
                model=self.judge_model,
                max_tokens=max_tokens,
            )
            response_content = completion.choices[0].message.content

            # Extract scores and reasoning
            response_1_score = None
            response_2_score = None
            reasoning = None

            for line in response_content.split("\n"):
                if line.startswith("Response 1 score:"):
                    response_1_score = float(line.split(":")[1].strip())
                elif line.startswith("Response 2 score:"):
                    response_2_score = float(line.split(":")[1].strip())
                elif line.startswith("Reasoning:"):
                    reasoning = line.split("Reasoning:")[1].strip()

            return {
                "success": True,
                "mixtral_score": response_1_score,
                "llama_score": response_2_score,
                "reasoning": reasoning,
                "error": None,
            }
        except Exception as e:
            print(f"Error evaluating responses: {e}")
            return {
                "success": False,
                "mixtral_score": None,
                "llama_score": None,
                "reasoning": None,
                "error": str(e),
            }

    def process_data(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Process a DataFrame of queries and responses, evaluate them, and compute final scores.
        """
        # Normalize latencies for scoring
        mixtral_latencies = data["mixtral_latency"]
        llama_latencies = data["llama_latency"]
        min_latency = min(mixtral_latencies.min(), llama_latencies.min())
        max_latency = max(mixtral_latencies.max(), llama_latencies.max())

        data["mixtral_latency_norm"] = (mixtral_latencies - min_latency) / (max_latency - min_latency)
        data["llama_latency_norm"] = (llama_latencies - min_latency) / (max_latency - min_latency)

        # List to store results
        results = []

        for i, row in data.iterrows():
            print(f"Evaluating row {i + 1}/{len(data)}...")

            result = self.evaluate_responses(
                query=row["query"],
                mixtral_response=row["mixtral_response"],
                llama_response=row["llama_response"],
            )

            if result["success"]:
                # Final scores: Quality - (weight * normalized latency)
                mixtral_final_score = result["mixtral_score"] - (0.2 * row["mixtral_latency_norm"])
                llama_final_score = result["llama_score"] - (0.2 * row["llama_latency_norm"])
            else:
                mixtral_final_score = None
                llama_final_score = None

            results.append({
                "mixtral_score": mixtral_final_score,
                "llama_score": llama_final_score,
                "reasoning": result["reasoning"],
                "error": result["error"],
            })

        # Append results back to the original DataFrame
        results_df = pd.DataFrame(results)
        return pd.concat([data, results_df], axis=1)

# Main Execution
def main():
    # Your API key
    api_key = "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"

    # Load your merged DataFrame
    merged_df = pd.read_csv("merged_df.csv")  # Replace with your actual file

    # Initialize the Judge model
    judge = JudgeLLM(api_key)

    # Process the data to generate scores
    scored_df = judge.process_data(merged_df)

    # Save the results
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    scored_df.to_csv(f"scored_results_{timestamp}.csv", index=False)

    print("Scoring complete! Results saved.")

if __name__ == "__main__":
    main()


Evaluating row 1/2200...
Evaluating row 2/2200...
Evaluating row 3/2200...
Evaluating row 4/2200...
Evaluating row 5/2200...
Evaluating row 6/2200...
Evaluating row 7/2200...
Evaluating row 8/2200...
Evaluating row 9/2200...
Evaluating row 10/2200...
Evaluating row 11/2200...
Evaluating row 12/2200...
Evaluating row 13/2200...
Evaluating row 14/2200...
Evaluating row 15/2200...
Evaluating row 16/2200...
Evaluating row 17/2200...
Evaluating row 18/2200...
Evaluating row 19/2200...
Evaluating row 20/2200...
Evaluating row 21/2200...
Evaluating row 22/2200...
Evaluating row 23/2200...
Evaluating row 24/2200...
Evaluating row 25/2200...
Evaluating row 26/2200...
Evaluating row 27/2200...
Evaluating row 28/2200...
Evaluating row 29/2200...
Evaluating row 30/2200...
Evaluating row 31/2200...
Evaluating row 32/2200...
Evaluating row 33/2200...
Evaluating row 34/2200...
Evaluating row 35/2200...
Evaluating row 36/2200...
Evaluating row 37/2200...
Evaluating row 38/2200...
Evaluating row 39/220

In [49]:
scored_df=pd.read_csv("scored_results_20241210_114235.csv")

In [51]:
scored_df.head()

Unnamed: 0,query,mixtral_response,mixtral_latency,mixtral_success,mixtral_error,llama_response,llama_latency,llama_success,llama_error,gpt4_response,query_embedding,mixtral_latency_norm,llama_latency_norm,mixtral_score,llama_score,reasoning,error
0,"I'll give you a review, can you extract the fo...",Food aspects:\n\n1. Made to Order food\n2. Tac...,0.980199,True,,The food aspects mentioned in the review with ...,1.268872,True,,"Sure, here's the analysis of the review:\n\n1....",[-1.65989585e-02 4.53538746e-02 3.10476087e-...,0.000194,0.000273,7.999961,6.999945,Both responses effectively identify the food a...,
1,"Answer the following question: Claim: ""Joker m...",The answer is yes.\n\nChain of thoughts: Stan ...,0.537827,True,,My answer is: yes.\n\nChain of thoughts: Stan ...,0.953742,True,,The answer is no.\nChain of thoughts: Stan Lee...,[-1.00538790e-01 -3.46924104e-02 -5.86108454e-...,7.2e-05,0.000187,7.999986,8.999963,"Both responses accurately answer the question,...",
2,TASK DEFINITION: In this task you will be give...,"ZdoublexpropheciesS\n\nIn this task, you are g...",0.680184,True,,"The solution for the problem ['Z', 'double', '...",0.504239,True,,ZdoublexpropheciesS,[-4.94029261e-02 1.42426789e-01 6.33770302e-...,0.000111,6.3e-05,7.999978,1.999987,Response 1 provides a clear explanation of the...,
3,"Detailed Instructions: In this task, you need ...","Yes, 'station' is one of the longest words in ...",0.462744,True,,"To solve this task, we need to count the numbe...",1.238112,True,,Yes,[ 1.41500324e-01 -8.15508328e-03 -3.68737467e-...,5.1e-05,0.000265,7.99999,8.999947,Both responses are clear and relevant to the q...,
4,A guy pick up a woman Then he puts her down Ex...,This joke is a play on words and relies on the...,0.70255,True,,This joke is a play on words. When someone 'pi...,1.235937,True,,This phrase could be interpreted as a joke bec...,[-1.86245084e-01 1.05034925e-01 -7.69985616e-...,0.000118,0.000264,7.999976,8.999947,Both responses effectively explain the joke an...,


In [52]:
# Add a winning_model column
scored_df["winning_model"] = scored_df.apply(
    lambda row: 0 if row["mixtral_score"] > row["llama_score"] else 1,
    axis=1
)

# Retain only required columns
final_df = scored_df[["query", "query_embedding", "winning_model"]]

# Save the final dataset
final_df.to_csv("final_results.csv", index=False)

print("Winning model column added! Final dataset saved as 'final_results.csv'.")


Winning model column added! Final dataset saved as 'final_results.csv'.


In [53]:
final_df

Unnamed: 0,query,query_embedding,winning_model
0,"I'll give you a review, can you extract the fo...",[-1.65989585e-02 4.53538746e-02 3.10476087e-...,0
1,"Answer the following question: Claim: ""Joker m...",[-1.00538790e-01 -3.46924104e-02 -5.86108454e-...,1
2,TASK DEFINITION: In this task you will be give...,[-4.94029261e-02 1.42426789e-01 6.33770302e-...,0
3,"Detailed Instructions: In this task, you need ...",[ 1.41500324e-01 -8.15508328e-03 -3.68737467e-...,1
4,A guy pick up a woman Then he puts her down Ex...,[-1.86245084e-01 1.05034925e-01 -7.69985616e-...,1
...,...,...,...
2195,"Does wearing a tightly curled permanent wave, ...",[-2.13785380e-01 -3.14125307e-02 2.85526335e-...,0
2196,Give me an introduction over 200 words for Adv...,[-2.79427171e-02 -6.70431107e-02 1.10476077e-...,1
2197,How do I prevent my child from being so moody?,[ 2.69970357e-01 1.72942981e-01 -8.17432851e-...,0
2198,Here is a bag filled with popcorn. There is no...,[ 3.59903835e-02 -7.45636299e-02 1.49505854e-...,0


In [89]:

# Assuming final_df is your DataFrame and it has a 'winning_model' column with 1's and 0's
winning_model_counts = final_df['winning_model'].value_counts()

# Display the count of 1's and 0's
print("Count of 1's and 0's in 'winning_model':")
print(winning_model_counts)


Count of 1's and 0's in 'winning_model':
winning_model
0    1527
1     673
Name: count, dtype: int64


In [66]:
final_df["query_embedding"][0]

'[-1.65989585e-02  4.53538746e-02  3.10476087e-02  1.64297804e-01\n -6.24837019e-02  6.71825279e-03  6.19063564e-02 -9.99638289e-02\n -1.23116411e-01 -1.30120382e-01  6.45101219e-02  6.84042498e-02\n -7.42618814e-02 -9.40614566e-02  6.01368323e-02 -1.14839844e-01\n  3.64858747e-01 -2.12769061e-01 -5.86363906e-03 -4.93275560e-02\n -6.28197864e-02 -5.06482534e-02  1.90066174e-01  2.74788812e-02\n  4.10084352e-02  1.86575344e-04 -4.81429473e-02  1.56560794e-01\n -8.38844329e-02 -1.43647015e-01  5.78125529e-02  1.18715018e-01\n  2.79987440e-03  1.40256863e-02 -5.38165607e-02 -2.35358514e-02\n  2.04037070e-01 -1.21434882e-01  6.33009002e-02  1.94237903e-02\n -6.33556675e-03 -7.46783391e-02 -6.97223376e-03 -5.80873201e-03\n  2.37664394e-02 -1.08556695e-01 -1.17531545e-01 -1.02338437e-02\n  8.12104568e-02  5.10205515e-02 -1.09363981e-01 -1.90825742e-02\n  5.53605370e-02 -1.61769494e-01  1.01139508e-01 -1.18601797e-02\n -1.03202946e-01 -9.76015851e-02 -1.17213167e-02  5.58860824e-02\n  2.30589

In [56]:
print(train_df['query_embedding'].head())


1258    [ 3.15726280e-01 -1.20050713e-01 -8.99756029e-...
1270    [ 8.83980468e-03  9.72204730e-02 -9.86268744e-...
1057    [ 2.23503172e-01 -5.24089206e-04  1.04839273e-...
1950    [-2.10097358e-02  1.41136214e-01 -8.60385820e-...
48      [-6.42807782e-02  4.69430014e-02 -2.19517410e-...
Name: query_embedding, dtype: object


## Model training

In [75]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, pairwise_distances
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [72]:


# Assuming train_df and test_df are already loaded with 'query' and 'winning_model' columns

# Step 1: Data Preparation
# Split the dataset into train (2000 queries) and test (200 queries)
train_queries, test_queries, train_labels, test_labels = train_test_split(
    train_df['query'], train_df['winning_model'], test_size=0.1, random_state=42)



In [73]:
# Step 2: Feature Engineering
# Use TfidfVectorizer to convert queries into numerical features
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust max_features as needed
train_features = vectorizer.fit_transform(train_queries)
test_features = vectorizer.transform(test_queries)



In [76]:
# Step 3: Train the Random Forest Classifier
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_classifier.fit(train_features, train_labels)



In [77]:
# Step 4: Evaluate the Model
# Make predictions on the test set
test_predictions = random_forest_classifier.predict(test_features)

# Evaluate the classification accuracy
classification_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Random Forest Classification Accuracy: {classification_accuracy * 100:.2f}%")

# Step 5: Test with Query Similarity (using embeddings for testing only)
# Use cosine similarity on query embeddings to find the most similar query in the training set



Random Forest Classification Accuracy: 69.50%


In [78]:
scored_df.head()from sklearn.metrics.pairwise import cosine_similarity

def predict_with_similarity(test_query, train_queries, train_embeddings, train_labels):
    # Compute embeddings for test query (using the same vectorizer)
    test_query_embedding = vectorizer.transform([test_query])
    
    # Compute cosine similarity between test query embedding and all train query embeddings
    similarities = cosine_similarity(test_query_embedding, train_embeddings)
    
    # Get the index of the most similar training query
    most_similar_index = similarities.argmax()
    
    # Return the predicted model based on the most similar training query
    return train_labels.iloc[most_similar_index]

# Convert train queries to embeddings for similarity-based testing
train_embeddings = vectorizer.transform(train_queries)

# Test similarity-based prediction accuracy
similarity_predictions = [
    predict_with_similarity(test_query, train_queries, train_embeddings, train_labels)
    for test_query in test_queries
]
similarity_accuracy = accuracy_score(test_labels, similarity_predictions)

print(f"Similarity-based Prediction Accuracy: {similarity_accuracy * 100:.2f}%")


SyntaxError: invalid syntax (784198224.py, line 1)

In [86]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorf

## other approach

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Assuming train_df and test_df are already loaded with 'query' and 'winning_model' columns

# Step 1: Data Preparation
train_queries, test_queries, train_labels, test_labels = train_test_split(
    train_df['query'], train_df['winning_model'], test_size=0.1, random_state=42)

# Step 2: Feature Engineering
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
train_features = vectorizer.fit_transform(train_queries).toarray()
test_features = vectorizer.transform(test_queries).toarray()

# Step 3: Neural Network Model
model = Sequential()

# Input layer (size matches the number of features from TF-IDF)
model.add(Dense(512, input_dim=train_features.shape[1], activation='relu'))
model.add(Dropout(0.5))  # Dropout layer to prevent overfitting

# Hidden layer
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))

# Output layer (assuming binary classification; if more classes, adjust accordingly)
model.add(Dense(len(train_labels.unique()), activation='softmax'))  # 'softmax' for multi-class

# Compile the model
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Step 4: Train the Model
model.fit(train_features, train_labels, epochs=100, batch_size=32, validation_split=0.2)

# Step 5: Evaluate the Model
test_predictions = model.predict(test_features)
test_predictions = np.argmax(test_predictions, axis=1)  # Get the class with the highest probability

# Convert test_labels to numpy array for compatibility
test_labels = np.array(test_labels)

# Accuracy Evaluation
nn_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Neural Network Classification Accuracy: {nn_accuracy * 100:.2f}%")



Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.6571 - loss: 0.6489 - val_accuracy: 0.7056 - val_loss: 0.6033
Epoch 2/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.6888 - loss: 0.5736 - val_accuracy: 0.7056 - val_loss: 0.6213
Epoch 3/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.7532 - loss: 0.4248 - val_accuracy: 0.6333 - val_loss: 0.7871
Epoch 4/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.9418 - loss: 0.1926 - val_accuracy: 0.6167 - val_loss: 1.0689
Epoch 5/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.9695 - loss: 0.0902 - val_accuracy: 0.6111 - val_loss: 1.3212
Epoch 6/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.9867 - loss: 0.0455 - val_accuracy: 0.6056 - val_loss: 1.4717
Epoch 7/50
[1m51/51[0m [32m━━━━━━━━━━━━━━━

In [91]:
pip install TextBlob

Collecting TextBlob
  Using cached textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Using cached textblob-0.18.0.post0-py3-none-any.whl (626 kB)
Installing collected packages: TextBlob
Successfully installed TextBlob-0.18.0.post0
Note: you may need to restart the kernel to use updated packages.


In [90]:
import random
import nltk
from nltk.corpus import wordnet
from textblob import TextBlob
import numpy as np

# Download NLTK WordNet data
nltk.download('wordnet')
nltk.download('omw-1.4')

# Function to get synonyms for a word
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

# Function to augment query by replacing a word with a synonym
def synonym_replacement(query):
    words = query.split()
    new_query = words[:]
    
    # Randomly choose a word to replace
    word_to_replace = random.choice(words)
    synonyms = get_synonyms(word_to_replace)
    
    if synonyms:
        new_word = random.choice(synonyms)
        new_query = [new_word if word == word_to_replace else word for word in words]
    
    return " ".join(new_query)

# Function to introduce random noise by replacing a word
def add_random_noise(query, noise_level=0.1):
    words = query.split()
    num_words = len(words)
    num_changes = int(noise_level * num_words)
    
    for _ in range(num_changes):
        word_to_replace = random.choice(words)
        words[words.index(word_to_replace)] = random.choice(['...', '?', '!', '@', '#', ''])  # Add noise
        
    return " ".join(words)

# Function to paraphrase a query using TextBlob
def paraphrase_query(query):
    blob = TextBlob(query)
    # TextBlob’s paraphrasing feature is limited, so we can just use the noun phrases to paraphrase
    return blob.correct()  # A very basic form of paraphrasing (correction)

# Data augmentation function
def augment_data(query, num_augments=3):
    augmented_queries = []
    
    for _ in range(num_augments):
        # Randomly choose a method of augmentation
        aug_method = random.choice([synonym_replacement, add_random_noise, paraphrase_query])
        
        if aug_method == synonym_replacement:
            augmented_queries.append(synonym_replacement(query))
        elif aug_method == add_random_noise:
            augmented_queries.append(add_random_noise(query))
        elif aug_method == paraphrase_query:
            augmented_queries.append(paraphrase_query(query))
    
    return augmented_queries

# Example usage
original_query = "How do I improve my insurance claim process?"

# Augment the query
augmented_queries = augment_data(original_query, num_augments=5)

# Print augmented queries
for i, aug_query in enumerate(augmented_queries):
    print(f"Augmented Query {i+1}: {aug_query}")


ModuleNotFoundError: No module named 'textblob'