## 2.1 Select subset of Romance languages from dataset

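From the processed dataset of English sentences, we select the texts written by native speakers of the following languages: Spanish, Catalan, Basque, Galician, Italian, and French. Catalan, Basque, and Galician enter the selection only through bilingual Spanish labels (e.g. `Spanish<br/>Catalan`).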

In [1]:
import numpy as np
import pandas as pd
import torch
from happytransformer import HappyTextToText, TTSettings
from tqdm import tqdm
from transformers import AutoTokenizer, T5ForConditionalGeneration, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Silence pandas' SettingWithCopyWarning for the whole session.
# The option accepts 'warn' (the default), 'raise', or None (no check at all).
pd.set_option("mode.chained_assignment", None)

In [9]:
# Load the refined English-only corpus and keep just the columns used below.
# The explicit .copy() makes `ds` an independent frame, so the label clean-up
# is guaranteed to modify `ds` itself instead of a temporary slice of
# `csv_file` (in-place edits on a slice are silently lost under Copy-on-Write).
csv_file = pd.read_csv("./data/english_only_refined.csv", encoding='utf-8')
ds = csv_file[['id','native','language','text']].copy()

# merge same categories: bilingual labels occur in both orders, so map every
# "X<br/>Spanish" spelling onto its "Spanish<br/>X" twin.
native_aliases = {
    'Basque<br/>Spanish': 'Spanish<br/>Basque',
    'Catalan<br/>Spanish': 'Spanish<br/>Catalan',
    'Galician<br/>Spanish': 'Spanish<br/>Galician',
    'Italian<br/>Spanish': 'Spanish<br/>Italian',
}
ds['native'] = ds['native'].replace(native_aliases)


# filtering by language: keep only rows whose (normalised) native label is listed
la_list = ['Spanish<br/>Catalan','Spanish<br/>Galician',
'Spanish<br/>Basque','Spanish<br/>Italian', 'Italian', 'French','Spanish']
mask = ds['native'].isin(la_list)
dataset_selection = ds[mask]

# Persist the selection; the correction section reloads it from this file.
dataset_selection.to_csv('./data/selected_subset_clean.csv')

In [10]:
# Preview the first five selected rows (head() defaults to 5), showing only
# the native-language label and the sentence.
dataset_selection.loc[:, ['native', 'text']].head()

Unnamed: 0,native,text
1,Spanish<br/>Catalan,I'm looking for a language exchange to improve...
3,Italian,i am here for learn english and i help you wit...
15,Spanish<br/>Catalan,I am from Barcelona and I speak Spanish and Ca...
22,Spanish,"Hey, I'm interested in improving my English (I..."
30,Spanish,"Hi, I'm trying to improve my english conversat..."


## 2.2 Compute corrected sentences

In [None]:
# Load the CoEdIT text-editing model once; `model` is the text2text pipeline
# that the correction cells call.
MODEL_NAME = "grammarly/coedit-large"

# Pick the best available accelerator instead of assuming CUDA device 0:
# NVIDIA GPU first, then Apple-silicon MPS, otherwise plain CPU.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch builds
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# max_length caps the length (in tokens) of each generated correction.
model = pipeline("text2text-generation", model=MODEL_NAME, tokenizer=tokenizer, max_length=250, device=device)

11/09/2023 09:02:36 - INFO - happytransformer.happy_transformer -   Using device: mps
Processing texts:   0%|          | 0/1887 [00:00<?, ?it/s]11/09/2023 09:02:37 - INFO - happytransformer.happy_transformer -   Moving model to mps
11/09/2023 09:02:37 - INFO - happytransformer.happy_transformer -   Initializing a pipeline
Processing texts:  34%|███▍      | 639/1887 [18:59<24:43,  1.19s/it]  

In [None]:
# Reload the subset written in section 2.1. That file stores the sentences in
# a 'text' column, while the correction and export cells below read them from
# 'original', so expose the column under the name they expect. rename() leaves
# the frame untouched if there is no 'text' column.
df = pd.read_csv('./data/selected_subset_clean.csv').rename(columns={'text': 'original'})

In [None]:
def fix_error(x):
    """Return a grammar-corrected version of ``x`` produced by the ``model`` pipeline.

    Parameters
    ----------
    x : str
        Sentence (or short text) to correct.

    Returns
    -------
    str
        The ``generated_text`` of the pipeline's first candidate. Missing
        (NaN/None), non-string, or blank input yields an empty string without
        calling the model.
    """
    # Empty CSV cells are read back as float NaN; concatenating one to the
    # prompt would raise TypeError, and a blank prompt has nothing to correct.
    if not isinstance(x, str) or not x.strip():
        return ""
    # Keep a space between the instruction and the sentence so the two are not
    # fused into a single token sequence ("sentence:I am ...").
    input_text = "Fix grammatical errors in this sentence: " + x
    outputs = model(input_text)
    return outputs[0]['generated_text']

# Run every sentence in the 'original' column through the model; the result is
# aligned on df's index and stored as a new 'corrected' column.
df['corrected'] = df['original'].map(fix_error)

In [None]:
# Write the sentence pairs plus the native-language label, in this column
# order, to a CSV in the current working directory (the index is kept).
df.to_csv('data_corrected.csv', columns=['original', 'corrected', 'native'])