# Load Libraries

In [None]:
import os
import time
from getpass import getpass

import openai
import pandas as pd

# Mount the drive and change the directory

In [None]:
from google.colab import drive
drive.mount('/content/drive/')
%cd drive/MyDrive/poleval_emotion/

Mounted at /content/drive/
/content/drive/MyDrive/poleval_emotion


# Define constants

In [None]:
# Directory (relative to the working directory) holding the input and output TSV files
DATA_DIR = "data/test-A/"
# File inside DATA_DIR with the texts to be corrected
INPUT_TEXT_PATH = "in_baseline.tsv"
# File inside DATA_DIR that receives the corrected texts
OUTPUT_TEXT_PATH = "in_gpt_corr.tsv"

# Load the TSV file

In [None]:
# The file is tab-separated and has no header row, so columns get integer labels (0, 1, ...)
data = pd.read_csv(f"{DATA_DIR}{INPUT_TEXT_PATH}", header=None, sep="\t")

In [None]:
# Length statistics for column 0 of every row: number of whitespace-separated
# words and number of characters (cells are converted with str() first)
data['word_count'] = data[0].map(lambda cell: len(str(cell).split()))
data['char_count'] = data[0].map(lambda cell: len(str(cell)))

In [None]:
# Largest word count and largest character count found in any single row
max_word_count, max_char_count = (
    data[column].max() for column in ('word_count', 'char_count')
)

In [None]:
# Show both maxima, one per line
print(
    f"Maximum word count in a row: {max_word_count}",
    f"Maximum character count in a row: {max_char_count}",
    sep="\n",
)

Maximum word count in a row: 212
Maximum character count in a row: 1253


# Set OpenAI API key

In [None]:
# needed for +1400 requests, +20 minutes, ~0.12$
# Never store the key in the notebook itself: take it from the OPENAI_API_KEY
# environment variable, or ask for it with a hidden prompt when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')

# Function to correct a single text using GPT-3.5 Turbo

In [None]:
def correct_text_gpt(text, model="gpt-3.5-turbo"):
    """Ask an OpenAI chat model to rewrite ``text`` as proper Polish.

    One chat-completion request is sent per call. The function is
    best-effort: it does not raise on request errors and falls back to the
    unmodified input whenever no usable correction is obtained.

    Args:
        text: The text to correct. A value that is not a non-blank string
            (for example a missing cell loaded as NaN) is returned untouched
            and no request is made.
        model: Name of the chat model to query.

    Returns:
        The content of the first reply choice, or ``text`` itself when the
        request raised an exception or the reply had no content.
    """
    # Nothing to correct: skip the request instead of sending an empty or
    # non-string message.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        response = openai.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": "Correct the following text to proper Polish."
                },
                {
                    "role": "user",
                    "content": text
                }
            ]
        )
        corrected_text = response.choices[0].message.content

    except Exception as e:
        print(f"Error: {e}")
        return text  # In case of error, return the original text

    # The reply content can be None or empty; keep the original text rather
    # than handing a missing value to the caller.
    return corrected_text if corrected_text else text

# Correct the text iteratively

In [None]:
# Start with an empty collection for the corrected rows
corrected_texts = list()

In [None]:
# Build the list from scratch in this cell so that re-running it cannot append a
# second copy of every result (which would give the output more rows than the
# input). Iterating over column 0 directly yields the same values as
# `row[0]` from `iterrows()` without constructing a Series per row.
corrected_texts = [correct_text_gpt(text) for text in data[0]]

# Save the final corrected data

In [None]:
# Destination of the corrected texts
corrected_file_path = f"{DATA_DIR}{OUTPUT_TEXT_PATH}"
# One corrected text per row, written tab-separated without index or header
corrected_data = pd.DataFrame(corrected_texts)
corrected_data.to_csv(corrected_file_path, sep="\t", header=False, index=False)

In [None]:
# Report the path the corrected TSV was written to
print(f"Corrected data saved to {corrected_file_path}")

Corrected data saved to data/test-A/in_corrected.tsv
