In [3]:
import pandas as pd
from openai import OpenAI
import glob
import os
from wandb.sdk.data_types.trace_tree import Trace
import wandb
import configparser
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage
import sys
import os
import inspect
import json
import random
# access parent directory from notebooks directory
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [4]:
config = configparser.ConfigParser()
# Read the configuration file.
# ConfigParser.read() skips files it cannot open instead of raising, and
# returns the list of files it actually parsed. An empty result therefore means
# config.ini was not found; fail here instead of with a "No section" error below.
if not config.read('config.ini'):
    raise FileNotFoundError("Could not read 'config.ini' from the current working directory")
# API keys and data locations, all stored under the [credentials] section.
api_key_openai = config.get('credentials', 'api_key_openai')
api_key_mistral = config.get('credentials', 'api_key_mistral')
surfdrive_url_input_sentences = config.get('credentials', 'surfdrive_url_input_sentences')
surfdrive_url_prompts = config.get('credentials', 'surfdrive_url_prompts')
# Folders used by the run cells: parallel-data CSVs are read from the first,
# model outputs are written to the second.
output_parallel_data = 'output_parallel_data/'
output_llm_folder_path = 'output_llm_data/'

In [5]:
# Load the prompt definitions (semicolon-separated file), one row per prompt.
df_prompts = pd.read_csv(surfdrive_url_prompts, sep=';')
# reset_index() keeps the previous index as an extra "index" column.
df_prompts = df_prompts.reset_index()
df_prompts

Unnamed: 0,index,promptID,prompt_system_content,prompt_x_shot_template,prompt_content_addition,model
0,0,0,\n[INST]You are an expert in text style transf...,Here is a sentence written without any style: ...,\n#####\n\nHere is a sentence written without ...,mistral
1,1,1,You are an expert in text style transfer. You ...,Here is a sentence written without any style: ...,\n#####\n\nHere is a sentence written without ...,gpt


In [6]:
# Load the sentences to rewrite (semicolon-separated file, "sentences" column)
# and keep only the first six of them for the runs below.
input_sentences = pd.read_csv(surfdrive_url_input_sentences, sep=';')['sentences'].iloc[:6]
input_sentences

0    I’m all about that food. I usually kick off th...
1    Just getting my vitamins in at the school cant...
2    De Pizzabakkers sell this vegan pizza with che...
3    I entered the world of vegan foods lately. Nex...
4    This vegan fried chicken from KFC is on the sp...
5    Just having this vegan hotdog from the school ...
Name: sentences, dtype: object

In [15]:
def extract_info(row):
    """Pull ``rewritten_sentence`` and ``explanation`` out of a model reply.

    The reply is expected to be a JSON object with those two fields. The text
    is first parsed exactly as received, so valid JSON escapes such as ``\\"``
    survive. Only if that fails is it parsed again with every backslash
    removed, which rescues replies containing stray or invalid escapes.

    Parameters
    ----------
    row : str
        Raw text returned by the model.

    Returns
    -------
    pandas.Series
        Series with the keys ``rewritten_sentence`` and ``explanation``.
        Both are empty strings when ``row`` is not a string, is not valid
        JSON, or does not decode to a JSON object (e.g. a bare JSON string).
    """
    empty_result = {'rewritten_sentence': '', 'explanation': ''}
    if not isinstance(row, str):
        return pd.Series(empty_result)

    for candidate in (row, row.replace('\\', '')):
        try:
            json_data = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Replies like '"some sentence"' are valid JSON but not an object,
        # so they have no fields to read.
        if isinstance(json_data, dict):
            return pd.Series({
                'rewritten_sentence': json_data.get('rewritten_sentence', ''),
                'explanation': json_data.get('explanation', '')
            })

    return pd.Series(empty_result)

### Mistral Run

In [None]:
# Few-shot style-transfer run with the Mistral models.
# For every user's parallel-data file: sample 3, 5 and 10 example pairs, build
# one prompt per sample, query each model for every input sentence, write the
# answers to CSV and log them to Weights & Biases.
csv_files = glob.glob(output_parallel_data + '*')
usernames = []
num_sentences = []

# The API client has to be created here so the cell also works on a fresh kernel.
mistral_client = MistralClient(api_key=api_key_mistral)
mistral_models = ["mistral-small", "mistral-medium"]
prompt_id = str(df_prompts['promptID'].iloc[0])
mistral_prompt_system_content = df_prompts['prompt_system_content'].iloc[0]
mistral_prompt_x_shot_template = df_prompts['prompt_x_shot_template'].iloc[0]
mistral_prompt_content_addition = df_prompts['prompt_content_addition'].iloc[0]

# to_csv() does not create missing folders.
os.makedirs(output_llm_folder_path, exist_ok=True)

for file in csv_files:
    if not file.endswith('parallel_data_mistral_medium.csv'):
        continue
    # The first two characters of the file name are the user id (e.g. "U3").
    # Taking them from the base name avoids depending on the folder-name length.
    username = os.path.basename(file)[:2]
    # prepare input data
    df = pd.read_csv(file)
    lst_x_shots = [df.sample(n=3), df.sample(n=5), df.sample(n=10)]
    for df_shots in lst_x_shots:
        n_shots = str(df_shots.shape[0])
        # save the shots dataset to csv
        df_shots[['messageID', 'rewritten_sentence', 'original']].to_csv(
            output_llm_folder_path + "input_dataframe_user_" + username + "_promptID_" + prompt_id + '_shots_' + n_shots + '.csv',
            index=False)

        # Build the few-shot prompt once per sample. Building it inside the
        # model loop would append the same shots again for every further model.
        messages_id = df_shots['messageID'].tolist()  # collected only; not used further in this cell
        formatted_string = mistral_prompt_system_content + '\n'
        for _, shot in df_shots.iterrows():
            formatted_string += mistral_prompt_x_shot_template.format(shot['rewritten_sentence'], shot['original']) + "\n\n"
        formatted_string += mistral_prompt_content_addition

        for mistral_m in mistral_models:
            run_id = str(random.randint(100000, 999999))
            run_name = "run_id_" + run_id + "_user_" + username + "_promptID_" + prompt_id + '_model_' + mistral_m + '_shots_' + n_shots
            output_csv_path = output_llm_folder_path + run_name + '_output.csv'
            print(run_name)
            final_output = []

            # Iterate over every input sentence (range(len - 1) skipped the last one).
            for sentence in input_sentences:
                # The remaining "{}" placeholder is filled with the sentence wrapped in braces.
                query = formatted_string.replace('{}', f'{{{sentence}}}')
                messages = [ChatMessage(role="user", content=query)]

                # No streaming
                chat_response = mistral_client.chat(
                    model=mistral_m,
                    messages=messages,
                )

                reply_text = chat_response.choices[0].message.content
                parsed_reply = extract_info(reply_text)  # parse once, reuse both fields
                final_output.append({
                    'original': sentence,
                    'rewritten_sentence': parsed_reply['rewritten_sentence'],
                    'explanation': parsed_reply['explanation'],
                    'output': reply_text,
                    "query": query,
                    "model": chat_response.model,
                    "prompt_tokens": chat_response.usage.prompt_tokens,
                    "completion_tokens": chat_response.usage.completion_tokens,
                    "object": chat_response.object,
                    "promptID": prompt_id,
                })

            df_mistral_output = pd.DataFrame(final_output)
            df_mistral_output.to_csv(output_csv_path, index=False)

            wandb.init(project="lmm-evaluate", name=run_name)
            try:
                # log df as a table to W&B for interactive exploration
                wandb.log({"run_id_" + run_id + "promptID_" + prompt_id + '_model' + mistral_m: wandb.Table(dataframe=df_mistral_output)})
                # log csv file as a dataset artifact to W&B for later use
                artifact = wandb.Artifact('df_' + "run_id_" + run_id + "promptID_" + prompt_id + '_model_' + mistral_m + '_shots_' + n_shots + '_output', type="dataset")
                artifact.add_file(output_csv_path)
                wandb.log_artifact(artifact)
            finally:
                # Always close the W&B run, even if logging fails.
                wandb.finish()

### GPT Run

In [8]:
# Take the GPT prompt from the second row of the prompts table and split it
# into its id, system message, per-shot template and closing query part.
(prompt_id,
 gpt_prompt_system_content,
 gpt_prompt_x_shot_template,
 gpt_prompt_content_addition) = (
    df_prompts[column].iloc[1]
    for column in ('promptID', 'prompt_system_content', 'prompt_x_shot_template', 'prompt_content_addition')
)
prompt_id = str(prompt_id)
prompt_id

'1'

In [None]:
# Few-shot style-transfer run with the GPT models.
# For every user's parallel-data file: sample 3 example pairs, build the
# few-shot user prompt, query each model for every input sentence, write the
# answers to CSV and log them to Weights & Biases.
csv_files = glob.glob(output_parallel_data + '*')

gpt_client = OpenAI(api_key=api_key_openai)
gpt_models = ["gpt-4"]
gpt_temperature = 0.2
gpt_max_tokens = 256
gpt_frequency_penalty = 0.0

prompt_id = str(df_prompts['promptID'].iloc[1])
gpt_prompt_system_content = df_prompts['prompt_system_content'].iloc[1]
gpt_prompt_x_shot_template = df_prompts['prompt_x_shot_template'].iloc[1]
gpt_prompt_content_addition = df_prompts['prompt_content_addition'].iloc[1]

# to_csv() does not create missing folders.
os.makedirs(output_llm_folder_path, exist_ok=True)

for file in csv_files:
    if not file.endswith('parallel_data_mistral_medium.csv'):
        continue
    # The first two characters of the file name are the user id (e.g. "U3").
    # Taking them from the base name avoids depending on the folder-name length.
    username = os.path.basename(file)[:2]
    # prepare input data
    df = pd.read_csv(file)
    # other sample sizes used elsewhere: df.sample(n=5), df.sample(n=10)
    lst_x_shots = [df.sample(n=3)]
    for df_shots in lst_x_shots:
        n_shots = str(df_shots.shape[0])
        # save the shots dataset to csv
        df_shots[['messageID', 'rewritten_sentence', 'original']].to_csv(
            output_llm_folder_path + "input_dataframe_user_" + username + "_promptID_" + prompt_id + '_shots_' + n_shots + '.csv',
            index=False)

        # Build the few-shot part of the user message once per sample. Building
        # it inside the model loop would repeat the shots for every further model.
        messages_id = df_shots['messageID'].tolist()  # collected only; not used further in this cell
        formatted_string = ''
        for _, shot in df_shots.iterrows():
            formatted_string += gpt_prompt_x_shot_template.format(shot['rewritten_sentence'], shot['original']) + "\n\n"
        formatted_string += gpt_prompt_content_addition

        for gpt_m in gpt_models:
            run_id = str(random.randint(100000, 999999))
            run_name = "run_id_" + run_id + "_user_" + username + "_promptID_" + prompt_id + '_model_' + gpt_m + '_shots_' + n_shots
            output_csv_path = output_llm_folder_path + run_name + '_output.csv'
            print(run_name)
            final_output = []

            # Iterate over every input sentence (range(len - 1) skipped the last one).
            for sentence in input_sentences:
                # The remaining "{}" placeholder is filled with the sentence wrapped in braces.
                query = formatted_string.replace('{}', f'{{{sentence}}}')
                message = [{"role": "system", "content": gpt_prompt_system_content},
                           {"role": "user", "content": query}]

                # No streaming. The client is `gpt_client`; the previous
                # spelling `gtp_client` is undefined on a fresh kernel.
                chat_response = gpt_client.chat.completions.create(
                    model=gpt_m,
                    messages=message,
                    temperature=gpt_temperature,
                    max_tokens=gpt_max_tokens,
                    frequency_penalty=gpt_frequency_penalty
                )

                reply_text = chat_response.choices[0].message.content
                parsed_reply = extract_info(reply_text)  # parse once, reuse both fields
                final_output.append({'original': sentence,
                                     'rewritten_sentence': parsed_reply['rewritten_sentence'],
                                     'explanation': parsed_reply['explanation'],
                                     'output': reply_text,
                                     "query": query,
                                     "model": chat_response.model,
                                     "prompt_tokens": chat_response.usage.prompt_tokens,
                                     "completion_tokens": chat_response.usage.completion_tokens,
                                     "object": chat_response.object,
                                     "promptID": prompt_id,
                                     "temperature": gpt_temperature})

            df_gpt_output = pd.DataFrame(final_output)
            df_gpt_output.to_csv(output_csv_path, index=False)

            wandb.init(project="lmm-evaluate", name=run_name)
            try:
                # log df as a table to W&B for interactive exploration
                wandb.log({"run_id_" + run_id + "promptID_" + prompt_id + '_model' + gpt_m: wandb.Table(dataframe=df_gpt_output)})
                # log csv file as a dataset artifact to W&B for later use
                artifact = wandb.Artifact('df_' + "run_id_" + run_id + "promptID_" + prompt_id + '_model_' + gpt_m + '_shots_' + n_shots + '_output', type="dataset")
                artifact.add_file(output_csv_path)
                wandb.log_artifact(artifact)
            finally:
                # Always close the W&B run, even if logging fails.
                wandb.finish()

run_id_421075_user_U3_promptID_1_model_gpt-4_shots_3


2024-02-08 17:45:59,601 ERROR wandb.jupyter: Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mbojan-2110[0m. Use [1m`wandb login --relogin`[0m to force relogin




run_id_523617_user_U7_promptID_1_model_gpt-4_shots_3




run_id_129149_user_U8_promptID_1_model_gpt-4_shots_3




run_id_314964_user_U0_promptID_1_model_gpt-4_shots_3




run_id_443633_user_U4_promptID_1_model_gpt-4_shots_3




run_id_487138_user_U6_promptID_1_model_gpt-4_shots_3




run_id_995565_user_U2_promptID_1_model_gpt-4_shots_3




run_id_612652_user_U5_promptID_1_model_gpt-4_shots_3




run_id_283412_user_U1_promptID_1_model_gpt-4_shots_3




run_id_376292_user_U9_promptID_1_model_gpt-4_shots_3




In [16]:
# Show the raw text of the response currently held in `chat_response`.
# That name is only assigned inside the run loops of earlier cells, so this
# cell depends on kernel state left behind by whichever run executed last.
chat_response.choices[0].message.content

'"Man, I\'m all about that food. Like, every week, I start off with some beans and vegan sausage right before school. And then, bam, I\'m off to school in no time."'

In [17]:
# Show the GPT system prompt exactly as it was loaded from the prompts table.
# NOTE(review): the repr shown in the output contains "\\n", i.e. a literal
# backslash followed by "n" rather than a line break - confirm whether the
# prompts file is meant to hold real newlines before sending this to the model.
gpt_prompt_system_content

'You are an expert in text style transfer. You will be given few examples of a conversational style of person X,\\nand the corresponding sentences written without any style.\\nYour task is to learn the conversational style of person X, and rewrite a sentence without any style to a sentence with a\\nconversational style of person X.\\n\\nThe output needs to be formated as a valid JSON object with the following fields: rewritten_sentence, explanation \\n#####\\n'