In [1]:
# import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
import torch

In [2]:
# Read the validated test-case CSV; the path is relative to the notebook's working directory.
VALIDATED_CSV_PATH = 'OPA_Test_Cases/ChatGPT_Validated/chatgpt_validated4.csv'
df = pd.read_csv(VALIDATED_CSV_PATH)

In [3]:
# Row and column counts of the loaded frame.
df.shape

(117, 5)

In [4]:
# Keep only the rows whose 'Validation_Passed' value is anything other than True.
not_passed_mask = df['Validation_Passed'].ne(True)
df_invalid = df.loc[not_passed_mask]
df_invalid.shape

(7, 5)

In [5]:
# Tokenizers already loaded in this kernel, keyed by model name, so repeated
# calls (e.g. once per row in a loop) do not reload the same tokenizer.
_TOKENIZER_CACHE = {}


def _get_tokenizer(tokenizer_name):
    """Return the tokenizer for `tokenizer_name`, loading it only on first use."""
    if tokenizer_name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[tokenizer_name] = AutoTokenizer.from_pretrained(tokenizer_name)
    return _TOKENIZER_CACHE[tokenizer_name]


# Function to calculate cosine similarity
def cosine_similarity_tokenized_strings(string1, string2, tokenizer_name):
    """Cosine similarity between the token-ID sequences of two strings.

    Both strings are tokenized with the named tokenizer, the shorter ID
    sequence is right-padded with the tokenizer's pad token ID to the length
    of the longer one, and the two integer ID vectors are compared with
    sklearn's `cosine_similarity`.

    Note: the score is computed on the raw integer token IDs, not on model
    embeddings, so it depends on the position and numeric ID of each token.

    Args:
        string1: First text to compare.
        string2: Second text to compare.
        tokenizer_name: Name or path passed to `AutoTokenizer.from_pretrained`.

    Returns:
        The single entry of the 1x1 similarity matrix returned by
        `cosine_similarity`.
    """
    tokenizer = _get_tokenizer(tokenizer_name)

    # Tokenize the strings
    tokens1 = tokenizer(string1, return_tensors='pt', padding=True, truncation=True)
    tokens2 = tokenizer(string2, return_tensors='pt', padding=True, truncation=True)

    # Get token IDs
    input_ids1 = tokens1['input_ids']
    input_ids2 = tokens2['input_ids']

    # Right-pad the shorter sequence so both vectors have the same length.
    max_length = max(input_ids1.shape[1], input_ids2.shape[1])
    input_ids1 = torch.nn.functional.pad(input_ids1, (0, max_length - input_ids1.shape[1]), value=tokenizer.pad_token_id)
    input_ids2 = torch.nn.functional.pad(input_ids2, (0, max_length - input_ids2.shape[1]), value=tokenizer.pad_token_id)

    # Flatten token IDs
    flat_input_ids1 = input_ids1.flatten()
    flat_input_ids2 = input_ids2.flatten()

    # cosine_similarity expects 2-D inputs: one row per sample.
    similarity = cosine_similarity(flat_input_ids1.unsqueeze(0), flat_input_ids2.unsqueeze(0))

    return similarity[0][0]

In [6]:
# Start with an empty results table that already carries the expected columns.
similarity_score_columns = [
    'Index',
    'Directory',
    'Query',
    'PrivateGPT_Rule',
    'Actual_Rule',
    'Similarity_Score',
]
df_similarity_scores = pd.DataFrame(columns=similarity_score_columns)

In [7]:
# Each case's reference rules are read from <root><directory>/<directory>output_test.csv.
ACTUAL_RULES_ROOT = '/home/tanmoy/OPA/'
SIMILARITY_TOKENIZER_NAME = 'Salesforce/codet5-small'

# Collect one record per failed case and build the table once at the end.
# Rebuilding the table from scratch also makes this cell safe to re-run:
# it no longer appends duplicate rows to a table left over from a previous run.
similarity_records = []

# Columns are read by position, as before: 1 = directory, 2 = prompt, 3 = generated rule.
for directory, prompt, invalid_rule in df_invalid.iloc[:, [1, 2, 3]].itertuples(index=False, name=None):
    df_actual_rules = pd.read_csv(ACTUAL_RULES_ROOT + directory + "/" + directory + "output_test.csv")
    # First row whose 'prompt' matches; the rule text is taken from the third column.
    valid_rule = df_actual_rules[df_actual_rules['prompt'] == prompt].iloc[0, 2]
    # Drop everything up to and including the first newline of the reference rule.
    valid_rule = valid_rule[valid_rule.index('\n') + 1:]
    similarity_score = cosine_similarity_tokenized_strings(invalid_rule, valid_rule, SIMILARITY_TOKENIZER_NAME)
    similarity_records.append(
        [len(similarity_records), directory, prompt, invalid_rule, valid_rule, similarity_score]
    )

df_similarity_scores = pd.DataFrame(
    similarity_records,
    columns=['Index', 'Directory', 'Query', 'PrivateGPT_Rule', 'Actual_Rule', 'Similarity_Score'],
)

In [26]:
# Row and column counts of the similarity table.
# NOTE(review): the recorded output for this cell carries execution count 26 and reports 4 rows,
# while the next cell's recorded output shows 7 rows; re-run with Restart & Run All to refresh it.
df_similarity_scores.shape

(4, 6)

In [8]:
# Show up to the first 8 rows of the similarity table.
df_similarity_scores.head(8)

Unnamed: 0,Index,Directory,Query,PrivateGPT_Rule,Actual_Rule,Similarity_Score
0,0,KarlatIwoca,"Team admin, maintainer, observer_plus and obse...",allow { },"allow { \nobject.type == ""targeted_query"" \nob...",0.132475
1,1,weswhet,If role is maintainer on any team,"team_role(subject, subject.teams[_].id) == mai...","team_role(subject, subject.teams[_].id) == adm...",0.947022
2,2,y0zg,If role is maintainer on any team,"team_role(subject, subject.teams[_].id) == mai...","team_role(subject, subject.teams[_].id) == adm...",0.947022
3,3,stephanmiehe,If role is admin or maintainer on any team,"team_role(subject, team_id) == [admin, maintai...","team_role(subject, subject.teams[_].id) == [ad...",0.361995
4,4,kyle-humane,If role is admin or maintainer on any team,"team_role(subject, team_id) == [admin, maintai...","team_role(subject, subject.teams[_].id) == [ad...",0.361995
5,5,kapawit,"Team admin, maintainer, observer_plus and obse...",allow { },"allow { \nobject.type == ""targeted_query"" \nob...",0.132475
6,6,blazman,"Team admin, maintainer, observer_plus and obse...",allow { },"allow { \nobject.type == ""targeted_query"" \nob...",0.132475


In [9]:
# Mean of the 'Similarity_Score' column across all scored rows.
df_similarity_scores['Similarity_Score'].mean()

0.4307796714146077

In [10]:
# Save the similarity table next to the validated input CSV.
# NOTE(review): with the default index=True the DataFrame index is written as an unnamed first
# column in addition to the explicit 'Index' column; pass index=False if downstream readers do not need it.
df_similarity_scores.to_csv('OPA_Test_Cases/ChatGPT_Validated/chatgpt_invalid_similarity_scores.csv')

In [13]:
# FIXME(review): `app` is not defined in any cell shown in this notebook, so this raises NameError
# on a fresh kernel. The recorded output below is a tuple, which looks like a `.shape` result from
# a different expression. Restore the intended expression or delete this cell.
df[app]

(87, 6)