In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the paths configured further down are relative
# ('../Arg_similarity/'), so the working directory presumably has to sit
# inside the mounted drive for them to resolve - confirm.
drive.mount('/content/drive')

In [None]:
# Project root folder and the embedding model used for similarity scoring.
path_root = '../Arg_similarity/'
similarity_model = 'sentence-transformers/all-MiniLM-L6-v2'

# Identifiers of the ECHR texts that were already processed
# (i.e. pipeline output exists for them).
texts_processed = [
    '00', '04', '05', '06', '10', '13', '16', '20', '21',
]
texts_processed_with_manual_matches = [
    '04', '06', '10', '13', '16',
]
texts_processed_long = [
    '01', '02', '17', '19', '27', '29',
]

# Sheet-name prefixes, as defined in the input xlsx.
input_sheetname_original = 'original_'
input_sheetname_llm = 'llm_'
input_sheetname_manual = 'manual_'
input_sheetname_llm_reworked = 'llm1_'

# Output subfolder for each LLM sheet prefix (reworked prompts get their own).
dict_output_llm = {
    input_sheetname_llm: '',
    input_sheetname_llm_reworked: 'reworked1/',
}

# NOTE(review): both folders are assigned again in a later cell; confirm
# which values are the intended ones.
folder_output = f'{path_root}output_arg_similarity_argBuf'
folder_feedback = f'{path_root}feedback_arg_similarity/'

In [None]:
path_root = '../Arg_similarity/'
path_input = path_root + 'input_arg_similarity_argBuf.xlsx' # TODO change back

# Which argument parts to score: '' (claims and premises), 'claims' or 'premises'.
type_adu = ''

# The type-specific folders (f'output_arg_similarity_{type_adu}' and
# f'feedback_arg_similarity_{type_adu}') used to be assigned here and were
# overwritten on the very next lines, so they never took effect. Only the
# effective values are kept.
# NOTE(review): confirm the fixed folders below are the intended ones.
folder_output = path_root + 'output_arg_similarity_argueBuf'
folder_feedback = path_root + 'feedback_arg_similarity/'

In [5]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings.
        attention_mask: Mask tensor; it receives a trailing axis and is
            expanded to the size of the token embeddings before use.

    Returns:
        The mask-weighted sum of the token embeddings over dimension 1,
        divided by the mask sum over dimension 1 (clamped to at least 1e-9).
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()

    weighted_sum = (embeddings * mask).sum(1)
    # Clamp so that a fully masked sequence does not divide by zero.
    mask_total = mask.sum(1).clamp(min=1e-9)

    return weighted_sum / mask_total


# Loaded (tokenizer, model) pairs keyed by model name, so the weights are read
# once per kernel session instead of once per sentence pair.
_SIMILARITY_MODEL_CACHE = {}


def _load_similarity_model(model_name):
    """Return the (tokenizer, model) pair for model_name, loading it on first use."""
    if model_name not in _SIMILARITY_MODEL_CACHE:
        _SIMILARITY_MODEL_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _SIMILARITY_MODEL_CACHE[model_name]


def get_sentence_similarity(sentence1, sentence2):
    """Return the cosine similarity of the embeddings of two sentences.

    Both sentences are tokenized together (padded, truncated), run through
    the model named by the module-level `similarity_model`, mean-pooled with
    `mean_pooling`, and L2-normalized before the cosine is taken.

    Args:
        sentence1: First text to compare.
        sentence2: Second text to compare.

    Returns:
        The similarity as a Python float.
    """
    # Reuse the already loaded model: this function is called for every
    # spreadsheet row, and reloading the weights each time dominates runtime.
    tokenizer, model = _load_similarity_model(similarity_model)

    # Tokenize both sentences as one padded batch
    encoded_input = tokenizer([sentence1, sentence2], padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings without tracking gradients
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool token embeddings into one vector per sentence, then normalize
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

    cos = torch.nn.CosineSimilarity(dim=0)
    sim = cos(sentence_embeddings[0], sentence_embeddings[1])

    return sim.item()

In [6]:
def _as_text(value):
    """Return value as a string; None and NaN (e.g. empty spreadsheet cells) become ''."""
    if isinstance(value, str):
        return value
    # NaN is the only float that does not compare equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value)


def get_sentences(row, type_adu):
    """Score claim and premise similarity between an argument and its match.

    Each text is split at its last newline: the part after it is treated as
    the claim and everything before it as the premises. A text without a
    newline is all claim and has no premises.

    Args:
        row: Mapping-like record with 'arg_text' and 'matched_text' entries.
            Non-string entries are converted to text; None and NaN count as
            empty instead of raising a TypeError.
        type_adu: '' scores both parts, 'claims' only the claims, 'premises'
            only the premises. Any other value scores nothing.

    Returns:
        Tuple (claim_sim, premise_sim). A score stays 0.0 when it was not
        requested or when either side of the comparison is empty.
    """
    # rpartition yields ('', '', text) when there is no newline, which gives
    # exactly "no premises, whole text is the claim".
    premises1, _, claim1 = _as_text(row['arg_text']).rpartition('\n')
    premises2, _, claim2 = _as_text(row['matched_text']).rpartition('\n')

    claim_sim = 0.0
    premise_sim = 0.0

    # Calculate claim similarity if needed or not specified
    if type_adu in ('', 'claims') and claim1 and claim2:
        claim_sim = get_sentence_similarity(claim1, claim2)

    # Calculate premises similarity if needed or not specified
    if type_adu in ('', 'premises') and premises1 and premises2:
        premise_sim = get_sentence_similarity(premises1, premises2)

    return claim_sim, premise_sim

In [7]:
def convert_to_percent_str(value):
  """Format a fraction as a percentage string, e.g. 0.5 -> '50.0%'.

  The value is multiplied by 100 and rounded to four decimal places before
  the '%' sign is appended.
  """
  percentage = round(value * 100, 4)
  return f'{percentage}%'

In [8]:
def get_similarity_alpha(claim_sim, premise_sim, alpha):
  """Blend two scores: alpha weights the premise score, 1 - alpha the claim score."""
  premise_part = alpha * premise_sim
  claim_part = (1 - alpha) * claim_sim
  return premise_part + claim_part


def get_similarity_alpha_all(texts, llm_sheets, threshold, alphas, type_adu):
  """Score the stored matches of every text/sheet pair and export the best match per argument.

  For each text and LLM sheet, the '<text>_original_matches.xlsx' workbook
  under '<folder_output>/<threshold>/<sheet subfolder>' is read, claim and
  premise similarities are computed per row with `get_sentences`, and for
  each alpha a blended 'similarity_alpha_<alpha>' column is added. For every
  alpha the mean of the best blended score per 'arg_num' is printed.

  The exported '<text>_original_matches_unique.xlsx' holds one row per
  'arg_num', selected by the LAST value in `alphas`; it still contains the
  blended columns of all alphas.

  Args:
    texts: Text identifiers; each is used as the file-name prefix.
    llm_sheets: Sheet prefixes; each must be a key of `dict_output_llm`.
    threshold: Value used as the subfolder name under `folder_output`.
    alphas: Weights given to the premise similarity (see `get_similarity_alpha`).
    type_adu: Passed through to `get_sentences` ('' / 'claims' / 'premises').

  Raises:
    ValueError: If `alphas` is empty. Without an alpha there is no selection
      to export, and this used to surface only as an unbound-variable error
      after all the scoring work had been done.
  """
  # Materialize once so a one-shot iterable is not exhausted after the first sheet.
  alphas = list(alphas)
  if not alphas:
    raise ValueError('alphas must contain at least one value')

  export_path_end = '.xlsx'
  type_comp = '_original'#'_original  _llm'

  for t in texts:

    print(f'Text: {t}')

    for llm_sheet in llm_sheets:

      print(" > > > > > > > > ")
      print(f'LLM sheet: {llm_sheet}')

      output_path = f'{folder_output}/{threshold}/{dict_output_llm[llm_sheet]}'
      llm_matches = pd.read_excel(output_path + t + type_comp + '_matches' + export_path_end)

      # An empty workbook cannot be expanded into the two score columns below.
      if llm_matches.empty:
        print('No matches found, skipping')
        continue

      llm_matches[['claim', 'premise']] = llm_matches.apply(lambda x: get_sentences(x, type_adu), axis=1, result_type='expand')

      llm_matches_unique = None
      for alpha in alphas:
        print(f'Alpha: {alpha}')
        column = f'similarity_alpha_{alpha}'
        # The blend is plain arithmetic, so it applies to whole columns at once.
        llm_matches[column] = get_similarity_alpha(llm_matches['claim'], llm_matches['premise'], alpha)
        # Keep, per argument, the row with the highest blended score.
        best_rows = llm_matches.groupby('arg_num')[column].idxmax()
        llm_matches_unique = llm_matches.loc[best_rows].reset_index(drop=True)
        print(f'Mean: {convert_to_percent_str(llm_matches_unique[column].mean())}')

      # Exports the selection made with the last alpha.
      llm_matches_unique.to_excel(output_path + t + type_comp + '_matches_unique' + export_path_end, index=False)

In [None]:
# Alternative configuration (reworked sheet, threshold 0.60, several alphas):
#get_similarity_alpha_all(texts_processed, [input_sheetname_llm_reworked], 0.60, [0.3, 0.4, 0.5, 0.6, 0.7], "")
get_similarity_alpha_all(
    texts=texts_processed,
    llm_sheets=[input_sheetname_llm],
    threshold=0.75,
    alphas=[0.7],
    type_adu="",
)