In [1]:
import pandas as pd
import configparser
import sys
import os
import inspect
import evaluation as eval

# access parent directory from notebooks directory
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
# Load API credentials and the transcript location from the local configuration file.
config_file = 'config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files it
# actually parsed, so an empty result means the file is missing or unreadable.
# Fail here with a clear message instead of a later NoSectionError.
if not config.read(config_file):
    raise FileNotFoundError(
        "Configuration file '{}' could not be read from the working directory".format(config_file)
    )
api_key_openai = config.get('credentials', 'api_key_openai')
api_key_mistral = config.get('credentials', 'api_key_mistral')
surfdrive_url_transcript_sentences = config.get('credentials', 'surfdrive_url_transcript_sentences')

# Folder (relative to the working directory) that holds the LLM output CSV files.
output_evaluation_folder_path = 'output_evaluation/'

### Input

In [3]:
# LLM outputs for the two prompt variants, read from the evaluation folder.
fileNoContext = 'Prompts_NoContext_complete_output.csv'
fileContext = 'Prompts_Context_complete_output.csv'
outputNoContext = pd.read_csv(output_evaluation_folder_path + fileNoContext)
outputContext = pd.read_csv(output_evaluation_folder_path + fileContext)

# Sentences rewritten by the participants; only the two join keys and the
# participant's text are kept.
transcriptSentences = pd.read_csv(surfdrive_url_transcript_sentences).reset_index()
transcriptSentences = transcriptSentences[['user', 'original', 'your_text']]

# Attach the participant text to each LLM output row. pd.merge defaults to an
# inner join, so rows without a match on both keys are dropped.
merge_keys = ['user', 'original']
data_for_evaluation_noContext = pd.merge(outputNoContext, transcriptSentences, on=merge_keys)
data_for_evaluation_context = pd.merge(outputContext, transcriptSentences, on=merge_keys)

# Report the number of merged rows: no-context first, then context.
for merged_frame in (data_for_evaluation_noContext, data_for_evaluation_context):
    print(len(merged_frame))

2400
2400


In [4]:
# Count the rows whose 'rewritten_sentence' is missing, drop them, and print how
# many rows remain - first for the no-context frame, then for the context frame.
# The frames are rebound to their filtered versions, so re-running this cell
# reports 0 missing rows.

missing_noContext = data_for_evaluation_noContext['rewritten_sentence'].isna()
print(missing_noContext.sum())
data_for_evaluation_noContext = data_for_evaluation_noContext.dropna(subset=['rewritten_sentence'])
print(len(data_for_evaluation_noContext))

missing_context = data_for_evaluation_context['rewritten_sentence'].isna()
print(missing_context.sum())
data_for_evaluation_context = data_for_evaluation_context.dropna(subset=['rewritten_sentence'])
print(len(data_for_evaluation_context))


41
2359
91
2309


### Evaluation Prompts


In [5]:
# Prompt fragments handed to the evaluation helpers as one dictionary.
# The '{}' markers are placeholders - presumably filled with the paragraphs by the
# evaluation module; confirm there.
# NOTE(review): spacing and punctuation around 'Result =' differ between the three
# inference prompts, and the fluency prompt starts mid-sentence. The text is kept
# exactly as it was because it is runtime data.
prompts_dict = dict(
    # Generic opening shared by the prompts; introduces paragraph S1.
    prompt_llm='You are an expert in text style transfer. Here is paragraph S1: {} ',

    # Accuracy: introduces S2, then asks how different the two styles are.
    prompt_accuracy_s2='and paragraph S2: {} ',
    prompt_accuracy_inference=(
        'How different is the conversational style of paragraph S2 compared to S1 '
        'on a continuous scale from 1 (completely identical styles) to 10 (completely different styles)? '
        'Result = . Format result as "score" and "explanation".'
    ),

    # Content preservation: introduces S2, then asks how much content is preserved.
    prompt_content_preservation_s2='and paragraph S2: {} ',
    prompt_content_preservation_inference=(
        'How much does S1 preserve the content of S2 '
        'on a continuous scale from 1 (completely different topic) to 10 (identical topic)? '
        'Result = .Format result as "score" and "explanation".'
    ),

    # Fluency: coherence rating on a 1-10 scale.
    prompt_fluency_inference=(
        'on a scale from 1 to 10 where 1 (lowest coherent) and 10 (highest coherent)? '
        'Result = Format result as "score" and "explanation".'
    ),
)




### Evaluation using Mistral

In [6]:
# Mistral evaluation of the no-context outputs (Run00).
# Every row is passed to eval.get_evaluation_mistral together with the prompt
# dictionary and a name of the form 'NoContext_<row label>'; the row label is
# printed first as a progress trace.
# NOTE(review): the result variable is rebound on every iteration, so only the last
# return value survives the loop - presumably the helper persists its own output; confirm.
for index, row in data_for_evaluation_noContext.iterrows():
    print(index)
    name = 'NoContext_{}'.format(index)
    evaluation_noContext_mistral = eval.get_evaluation_mistral(prompts_dict, row, name)

1078


KeyboardInterrupt: 

In [ ]:
# Mistral evaluation of the context outputs (Run00).
# Every row is passed to eval.get_evaluation_mistral together with the prompt
# dictionary and a name of the form 'Context_<row label>'; the row label is
# printed first as a progress trace.
# NOTE(review): the result variable is rebound on every iteration, so only the last
# return value survives the loop - presumably the helper persists its own output; confirm.
for index, row in data_for_evaluation_context.iterrows():
    print(index)
    name = 'Context_{}'.format(index)
    evaluation_context_mistral = eval.get_evaluation_mistral(prompts_dict, row, name)

### Evaluation using GPT

In [None]:
# GPT evaluation of the no-context outputs (Run00).
# Every row is passed to eval.get_evaluation_gpt together with the prompt
# dictionary and a name of the form 'NoContext_<row label>'; the row label is
# printed first as a progress trace.
# NOTE(review): the result variable is rebound on every iteration, so only the last
# return value survives the loop - presumably the helper persists its own output; confirm.
for index, row in data_for_evaluation_noContext.iterrows():
    print(index)
    name = 'NoContext_{}'.format(index)
    evaluation_noContext_gpt = eval.get_evaluation_gpt(prompts_dict, row, name)

In [7]:
# GPT evaluation of a batch of the context outputs (Run00).
# The batch is selected by row POSITION (2200 up to, not including, 2400), written
# with .iloc to make that explicit.
# NOTE(review): rows with a missing 'rewritten_sentence' were dropped earlier without
# resetting the index, so positions no longer equal the labels used in the names.
# The recorded output of this cell starts at label 2280, not 2200. Confirm that a
# positional batch is what was intended.
# NOTE(review): the result variable is rebound on every iteration, so only the last
# return value survives the loop - presumably the helper persists its own output; confirm.
for index, row in data_for_evaluation_context.iloc[2200:2400].iterrows():
    print(index)
    name = 'Context_{}'.format(index)
    evaluation_context_gpt = eval.get_evaluation_gpt(prompts_dict, row, name)

2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2311
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2325
2326
2327
2328
2329
2330
2331
2333
2335
2336
2337
2338
2339
2341
2342
2343
2344
2345
2346
2347
2349
2350
2351
2352
2353
2354
2355
2357
2358
2359
2360
2361
2362
2363
2364
2365
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
