# Grammatical error correction - LLMs

This notebook contains the steps taken to process the dataset into sentences, which were then turned into the different prompts used to query the models. Each model was queried through the OpenAI SDK (the GPT models via the OpenAI API, the open models via Anyscale's OpenAI-compatible endpoint), and all results were saved as JSON files in order to keep the sentences in order, which would have been unnecessarily arduous with simple .txt files. Every set of predictions was then evaluated with the ROUGE, BLEU and F0.5 metrics.

In [1]:
!wget https://raw.githubusercontent.com/kanekomasahiro/bert-gec/master/scripts/convert_m2_to_parallel.py
!pip install openai

import shutil
import os
from openai import OpenAI
from google.colab import userdata, drive, files

!pip install sacrebleu > /dev/null
import sacrebleu

!pip install torchmetrics > /dev/null
from torchmetrics.functional.text.rouge import rouge_score

from pprint import pprint
import json

!pip install errant > /dev/null

!python3 -m spacy download en_core_web_sm > /dev/null

--2024-04-29 05:49:47--  https://raw.githubusercontent.com/kanekomasahiro/bert-gec/master/scripts/convert_m2_to_parallel.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1599 (1.6K) [text/plain]
Saving to: ‘convert_m2_to_parallel.py’


2024-04-29 05:49:47 (23.3 MB/s) - ‘convert_m2_to_parallel.py’ saved [1599/1599]

Collecting openai
  Downloading openai-1.23.6-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.6/311.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=

In [2]:
# Mount Google Drive at /content/drive; other cells read from and write to
# MyDrive/INF8225_projet under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Copy the combined M2 annotation file from the Drive project folder into /content.
# NOTE(review): the conversion command in the next cell refers to the file by a
# relative name, so this assumes the working directory is /content — confirm.
shutil.copy("/content/drive/MyDrive/INF8225_projet/official-2014.combined.m2", "/content/")

'/content/official-2014.combined.m2'

# Data Processing


In [4]:
# Run the downloaded conversion script on the M2 file to produce two text files.
# Other cells read output_src.txt as the input sentences and output_tgt.txt as the
# reference corrections.
!python convert_m2_to_parallel.py official-2014.combined.m2 output_src.txt output_tgt.txt

In [5]:
# Load the source (uncorrected) sentences: one per line, with surrounding
# whitespace (including the trailing newline) removed.
with open('/content/output_src.txt', 'r') as src_file:
  sentences = [raw_line.strip() for raw_line in src_file]

# Three zero-shot prompt variants. Each entry is a one-message conversation whose
# single 'system' message embeds the sentence to correct; index i of every list
# corresponds to sentences[i].
promptsMinimal = []
promptsOpen1 = []
promptsOpen2 = []
for sentence in sentences:
  promptsMinimal.append([{'role': 'system', 'content' : f'Correct the grammatical errors in the following sentence: {sentence}; output:'}])
  promptsOpen1.append([{'role': 'system', 'content' : f'Revise mistakes in this text: {sentence}; output:'}])
  promptsOpen2.append([{'role': 'system', 'content' : f'Rewrite the following text with proper grammar: {sentence}; output:'}])
# Zero-shot "tool" prompt: one system message holding the minimal-edit instructions
# followed by the sentence to correct; index i corresponds to sentences[i].
# NOTE: the leading spaces on the continuation lines are inside the triple-quoted
# f-string, so they are part of the prompt text sent to the model. Re-indenting this
# block changes the prompt.
# NOTE(review): the query loop visible in this notebook only sends promptsMinimal and
# promptsToolFewShot — confirm whether this variant is still needed.
promptsTool =  [[{'role': 'system', 'content' : f"""You are a grammatical error correction tool. Your task is to correct the grammaticality and spelling in
               the input sentence. Make the smallest possible change in order to make the sentence grammatically
               correct. Change as few words as possible. Do not rephrase parts of the sentence that are already
               grammatical. Do not change the meaning of the sentence by adding or removing information. If the
               sentence is already grammatically correct, you should output the original sentence without changing
               anything. \n\nInput sentence: {sentence}\nOutput sentence:"""}] for sentence in sentences]
# Few-shot prompt: the minimal-edit instructions as a system message, four worked
# examples as alternating user/assistant turns, then the sentence to correct as the
# final user turn. Index i corresponds to sentences[i].
# NOTE: the leading spaces on the continuation lines of the system message are inside
# the triple-quoted string, so they are part of the prompt text sent to the model.
# NOTE(review): every example answer starts with 'Corrected sentence: ', so model
# replies will presumably carry the same prefix; no cell visible in this notebook
# strips it before the predictions are scored — confirm.
promptsToolFewShot = [[
          {'role': 'system',
           'content': """You are a grammatical error correction tool. Your task is to correct the grammaticality and spelling in
                      the input sentence. Make the smallest possible change in order to make the sentence grammatically
                      correct. Change as few words as possible. Do not rephrase parts of the sentence that are already
                      grammatical. Do not change the meaning of the sentence by adding or removing information. If the
                      sentence is already grammatically correct, you should output the original sentence without changing
                      anything."""},
          # Worked examples; the last pair shows an input that needs no change.
          {'role': 'user', 'content': 'Input sentence:  I love this sport. I look forward to the weakened, to go out with my bike and my group of friends.'},
          {'role': 'assistant', 'content': 'Corrected sentence: I love this sport. I look forward to the weekend to go out with my bike and my group of friends.'},
          {'role': 'user', 'content': 'Input sentence:  Lucy Keyes was the last thriller I’ve seen.'},
          {'role': 'assistant', 'content': 'Corrected sentence: Lucy Keyes was the last thriller I saw'},
          {'role': 'user', 'content': 'Input sentence:  In the biggest cities around the world the traffic nonstop and increase every day.'},
          {'role': 'assistant', 'content': 'Corrected sentence: In the biggest cities around the world, the traffic is nonstop and increasing every day.'},
          {'role': 'user', 'content': 'Input sentence:  Also, the satisfaction of the customers pushes me to work harder and be better at my job.'},
          {'role': 'assistant', 'content': 'Corrected sentence: Also, the satisfaction of the customers pushes me to work harder and be better at my job.'},
          # The actual query: the sentence the model is asked to correct.
          {'role': 'user', 'content': f'Input sentence:  {sentence}'}] for sentence in sentences]

# Querying the models

In [None]:
# Model identifiers, grouped by the endpoint that serves them.
modelsGPT = ['gpt-4-turbo', 'gpt-4', 'gpt-3.5-turbo']
modelsAnyscale = [
    'meta-llama/Meta-Llama-3-8B-Instruct',
    'mistralai/Mistral-7B-Instruct-v0.1',
    'mistralai/Mixtral-8x7B-Instruct-v0.1',
]
models = [*modelsGPT, *modelsAnyscale]

# Both clients use the OpenAI SDK; the API keys are read through google.colab's
# userdata rather than being written in the notebook.
clientGPT = OpenAI(api_key=userdata.get('gpt_api_key'))

# The Anyscale client only differs by its base URL and key.
anyscale_client = OpenAI(
    api_key=userdata.get('anyscale_key'),
    base_url="https://api.endpoints.anyscale.com/v1",
)

In [None]:
def _chat(client, model, messages):
  """Send one chat-completion request and return the text of the first choice.

  client: an OpenAI SDK client (clientGPT or anyscale_client).
  model: model identifier passed to the API.
  messages: list of {'role', 'content'} dicts forming the conversation.
  """
  response = client.chat.completions.create(model=model, messages=messages)
  return response.choices[0].message.content


def _dump_checkpoint(outputs, prefix, model_names):
  """Write outputs[model] to '<prefix>_<name>.json' in the working directory for each model.

  <name> is the last '/'-separated component of the model id, so identifiers such as
  'meta-llama/Meta-Llama-3-8B-Instruct' do not put a '/' in the file name.
  """
  for model_name in model_names:
    with open(f'{prefix}_{model_name.split("/")[-1]}.json', 'w') as checkpoint_file:
      json.dump(outputs[model_name], checkpoint_file)


# Few-shot queries are only sent to gpt-4-turbo and to the Anyscale-hosted models.
_fewShotQueried = ['gpt-4-turbo'] + modelsAnyscale

# Per model: {sentence index -> model reply}. json.dump turns the integer indices
# into string keys when the dictionaries are saved.
outputsZeroShot = {model: {} for model in models}
outputsFewShot = {model: {} for model in models}

for i in range(len(sentences)):
  # `models` lists the GPT models first, then the Anyscale ones, so requests are
  # issued in the same order as when each provider had its own loop.
  for model in models:
    client = clientGPT if model in modelsGPT else anyscale_client
    outputsZeroShot[model][i] = _chat(client, model, promptsMinimal[i])
    if model in _fewShotQueried:
      outputsFewShot[model][i] = _chat(client, model, promptsToolFewShot[i])

  # Save every 100 sentences in case the process stops to avoid losing all
  # progress as this takes a few hours
  if i % 100 == 0:
    print(f'Processed {i} sentences')
    _dump_checkpoint(outputsZeroShot, 'outputsZeroShot', models)
    # Only models that were actually queried few-shot get a few-shot checkpoint;
    # the others would only ever produce an empty '{}' file.
    _dump_checkpoint(outputsFewShot, 'outputsFewShot', _fewShotQueried)

In [None]:
def _save_and_backup(outputs, prefix, model):
  """Write one model's outputs to '<prefix>_<name>.json' and copy that file to Drive.

  outputs: dict mapping model id -> {sentence index -> reply}.
  prefix: file name prefix ('outputsZeroShot' or 'outputsFewShot').
  model: model id; only its last '/'-separated component is used in the file name.
  """
  local_path = f'{prefix}_{model.split("/")[-1]}.json'
  with open(local_path, 'w') as out_file:
    json.dump(outputs[model], out_file)
  # Copy the file that was just written. Copying from a hard-coded /content/ path
  # would pick up a stale file (or fail) whenever the working directory is not /content.
  shutil.copy(local_path, '/content/drive/MyDrive/INF8225_projet/')


# Final save of the complete results, backed up to the Drive project folder.
for model in models:
  _save_and_backup(outputsZeroShot, 'outputsZeroShot', model)
  # Few-shot results exist only for gpt-4-turbo and the Anyscale-hosted models.
  if model == 'gpt-4-turbo' or model in modelsAnyscale:
    _save_and_backup(outputsFewShot, 'outputsFewShot', model)

# Evaluation

In [22]:
def _load_predictions(prefix, model):
  """Load one model's saved replies from Drive and mirror them into a .txt file.

  Reads '<prefix>_<name>.json' from the Drive project folder, where <name> is the
  last '/'-separated component of the model id, and writes '/content/<prefix>_<name>.txt'
  with exactly one line per reply: newlines inside a reply are written as the two
  characters '\\n' so that line k of the file always belongs to sentence k.

  Returns the list of replies (unmodified) in sentence order.
  """
  name = model.split("/")[-1]
  with open(f'/content/drive/MyDrive/INF8225_projet/{prefix}_{name}.json', 'r') as json_file:
    data = json.load(json_file)

  # Keys are sentence indices stored as strings; sort them numerically so the
  # predictions line up with the references regardless of key order in the file.
  preds = [reply for _, reply in sorted(data.items(), key=lambda item: int(item[0]))]

  with open(f'/content/{prefix}_{name}.txt', 'w') as txt_file:
    for reply in preds:
      txt_file.write(reply.replace('\n', '\\n'))
      txt_file.write('\n')
  return preds


# Few-shot predictions exist only for the Anyscale-hosted models and gpt-4-turbo.
fewShotModels = modelsAnyscale + ['gpt-4-turbo']

# Model id -> list of predictions, in sentence order.
predsZeroShot = { model: _load_predictions('outputsZeroShot', model) for model in models }
predsFewShot = { model: _load_predictions('outputsFewShot', model) for model in fewShotModels }

In [17]:
# Reference corrections, one per line; each entry keeps its trailing newline.
# NOTE(review): the conversion cell writes output_tgt.txt to the working directory,
# and no visible cell copies it to Drive — confirm this Drive copy is the same file.
with open('/content/drive/MyDrive/INF8225_projet/output_tgt.txt', 'r') as tgt_file:
  truths = list(tgt_file)

In [23]:
def _print_scores(header, preds):
  """Print `header`, then the ROUGE and corpus-level BLEU of `preds` against `truths`, then a blank line."""
  print(header)
  print(f'Rouge: {rouge_score(preds, truths)}')
  # corpus_bleu takes a list of reference streams; there is a single one here.
  print(f'Bleu: {sacrebleu.corpus_bleu(preds, [truths])}')
  print()


# Zero-shot results first, then few-shot, one report per model.
for model, preds in predsZeroShot.items():
  _print_scores(f'Model: {model} (Zero-shot)', preds)

for model, preds in predsFewShot.items():
  _print_scores(f'Model: {model} (Few-shot)', preds)

Model: gpt-4-turbo (Zero-shot)
Rouge: {'rouge1_fmeasure': tensor(0.8940), 'rouge1_precision': tensor(0.8937), 'rouge1_recall': tensor(0.8976), 'rouge2_fmeasure': tensor(0.8015), 'rouge2_precision': tensor(0.8011), 'rouge2_recall': tensor(0.8051), 'rougeL_fmeasure': tensor(0.8885), 'rougeL_precision': tensor(0.8882), 'rougeL_recall': tensor(0.8921), 'rougeLsum_fmeasure': tensor(0.8885), 'rougeLsum_precision': tensor(0.8882), 'rougeLsum_recall': tensor(0.8920)}
Bleu: BLEU = 73.71 88.6/78.0/69.4/61.8 (BP = 0.999 ratio = 0.999 hyp_len = 30274 ref_len = 30304)

Model: gpt-4 (Zero-shot)
Rouge: {'rouge1_fmeasure': tensor(0.8922), 'rouge1_precision': tensor(0.8916), 'rouge1_recall': tensor(0.8961), 'rouge2_fmeasure': tensor(0.7988), 'rouge2_precision': tensor(0.7982), 'rouge2_recall': tensor(0.8026), 'rougeL_fmeasure': tensor(0.8863), 'rougeL_precision': tensor(0.8857), 'rougeL_recall': tensor(0.8902), 'rougeLsum_fmeasure': tensor(0.8863), 'rougeLsum_precision': tensor(0.8857), 'rougeLsum_reca

In [24]:
!errant_parallel -orig /content/drive/MyDrive/INF8225_projet/output_src.txt -cor /content/drive/MyDrive/INF8225_projet/output_tgt.txt -out truth.m2

for model in modelsGPT:
  print(f'Model: {model}')
  !errant_parallel -orig /content/drive/MyDrive/INF8225_projet/output_src.txt -cor /content/outputsZeroShot_{model}.txt -out preds_{model}.m2
  !errant_compare -hyp preds_{model}.m2 -ref truth.m2 -dt
  !errant_compare -hyp preds_{model}.m2 -ref truth.m2

  if model == 'gpt-4-turbo':
    !errant_parallel -orig /content/drive/MyDrive/INF8225_projet/output_src.txt -cor /content/outputsFewShot_{model}.txt -out preds_{model}.m2
    !errant_compare -hyp preds_{model}.m2 -ref truth.m2 -dt
    !errant_compare -hyp preds_{model}.m2 -ref truth.m2

for model in modelsAnyscale:
  print(f'Model: {model}')
  !errant_parallel -orig /content/drive/MyDrive/INF8225_projet/output_src.txt -cor /content/outputsZeroShot_{model.split("/")[-1]}.txt -out preds_{model.split("/")[-1]}.m2
  !errant_compare -hyp preds_{model.split("/")[-1]}.m2 -ref truth.m2 -dt
  !errant_compare -hyp preds_{model.split("/")[-1]}.m2 -ref truth.m2

  !errant_parallel -orig /content/drive/MyDrive/INF8225_projet/output_src.txt -cor /content/outputsFewShot_{model.split("/")[-1]}.txt -out preds_{model.split("/")[-1]}.m2
  !errant_compare -hyp preds_{model.split("/")[-1]}.m2 -ref truth.m2 -dt
  !errant_compare -hyp preds_{model.split("/")[-1]}.m2 -ref truth.m2

Loading resources...
Processing parallel files...
Model: gpt-4-turbo
Loading resources...
Processing parallel files...

TP	FP	FN	Prec	Rec	F0.5
2606	5462	2538	0.323	0.5066	0.3482


TP	FP	FN	Prec	Rec	F0.5
1171	4083	3124	0.2229	0.2726	0.2313

Loading resources...
Processing parallel files...

TP	FP	FN	Prec	Rec	F0.5
2560	5369	2584	0.3229	0.4977	0.3473


TP	FP	FN	Prec	Rec	F0.5
1173	3964	3122	0.2283	0.2731	0.2361

Model: gpt-4
Loading resources...
Processing parallel files...

TP	FP	FN	Prec	Rec	F0.5
2520	5324	2624	0.3213	0.4899	0.345


TP	FP	FN	Prec	Rec	F0.5
1135	3916	3160	0.2247	0.2643	0.2316

Model: gpt-3.5-turbo
Loading resources...
Processing parallel files...

TP	FP	FN	Prec	Rec	F0.5
2169	5115	2975	0.2978	0.4217	0.3164


TP	FP	FN	Prec	Rec	F0.5
947	3598	3348	0.2084	0.2205	0.2107

Model: meta-llama/Meta-Llama-3-8B-Instruct
Loading resources...
Processing parallel files...

TP	FP	FN	Prec	Rec	F0.5
2435	6624	2709	0.2688	0.4734	0.2942


TP	FP	FN	Prec	Rec	F0.5
838	5895	3457	0.1245	0.1951	0.1342