In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd '/content/drive/MyDrive/Data Science Project/Neural-machine-translation/Tatoeba'

/content/drive/MyDrive/Data Science Project/Neural-machine-translation/Tatoeba


In [3]:
import os
source_language = "en"
target_language = "yo" 
lc = False  
seed = 42  
tag = "baseline" 

os.environ["src"] = source_language # Sets them in bash as well, since we often use bash scripts
os.environ["tgt"] = target_language
os.environ["tag"] = tag

# This will save it to a folder in our gdrive instead!
!mkdir -p "/content/drive/MyDrive/Data Science Project/Neural-machine-translation/Tatoeba/$src-$tgt-$tag"
os.environ["gdrive_path"] = "/content/drive/MyDrive/Data Science Project/Neural-machine-translation/Tatoeba/%s-%s-%s" % (source_language, target_language, tag)

In [4]:
!echo $gdrive_path

/content/drive/MyDrive/Data Science Project/Neural-machine-translation/Tatoeba/en-yo-baseline


In [5]:
# # Install opus-tools
# ! pip install opustools-pkg

In [6]:
# # Downloading our corpus
# ! opus_read -d Tatoeba -s $src -t $tgt -wm moses -w tatoeba.$src tatoeba.$tgt -q

# # extract the corpus file
# ! gunzip Tatoeba_latest_xml_$src-$tgt.xml.gz

# # Downloading our corpus
# !opus_read -d GlobalVoices -s $src -t $tgt -wm moses -w globalvoices.$src globalvoices.$tgt -q

# # extract the corpus file
# !gunzip GlobalVoices_latest_xml_$src-$tgt.xml.gz

In [7]:
# # Download the global test set.
# ! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-any.en
  
# # And the specific test set for this language pair.
# os.environ["trg"] = target_language 
# os.environ["src"] = source_language 

# ! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-$trg.en 
# ! mv test.en-$trg.en test.en
# ! wget https://raw.githubusercontent.com/juliakreutzer/masakhane/master/jw300_utils/test/test.en-$trg.$trg 
# ! mv test.en-$trg.$trg test.$trg

In [8]:
# Read the test data to filter from train and dev splits.
# Store english portion in set for quick filtering checks.
en_test_sents = set()
filter_test_sents = "test.en-any.en"
j = 0
with open(filter_test_sents) as f:
  for line in f:
    en_test_sents.add(line.strip())
    j += 1
print('Loaded {} global test sentences to filter from the training/dev data.'.format(j))

Loaded 3571 global test sentences to filter from the training/dev data.


In [9]:
import pandas as pd

# TMX file to dataframe
source_file = 'globalvoices.' + source_language
target_file = 'globalvoices.' + target_language

source = []
target = []
skip_lines = []  # Collect the line numbers of the source portion to skip the same lines for the target portion.
with open(source_file) as f:
    for i, line in enumerate(f):
        # Skip sentences that are contained in the test set.
        if line.strip() not in en_test_sents:
            source.append(line.strip())
        else:
            skip_lines.append(i)             
with open(target_file) as f:
    for j, line in enumerate(f):
        # Only add to corpus if corresponding source was not skipped.
        if j not in skip_lines:
            target.append(line.strip())
    
print('Loaded data and skipped {}/{} lines since contained in test set.'.format(len(skip_lines), i))
    
df = pd.DataFrame(zip(source, target), columns=['source_sentence', 'target_sentence'])
df.head(3)

Loaded data and skipped 0/132 lines since contained in test set.


Unnamed: 0,source_sentence,target_sentence
0,An all-female flight crew makes history in Moz...,Òṣìṣẹ ́ inú ọkọ ̀ òfuurufú olóbìrin nìkan wọ ì...
1,Mozambique 's first all-female crew | Photo us...,Ikọ ̀ awakọ ̀ òfuurufú olóbìrin àkọ ́ kọ ́ irú...
2,It is a historic day : that is how many Mozamb...,Ọjọ ́ ìtàn ni ọjọ ́ yìí : ojú yìí ni ogunlọ ́ ...


### Pre-processing and export

In [10]:
# drop duplicate translations
df_pp = df.drop_duplicates()

# Shuffle the data to remove bias in dev set selection.
df_pp = df_pp.sample(frac=1, random_state=seed).reset_index(drop=True)

In [11]:
# !pip install fuzzywuzzy
# !pip install python-Levenshtein

In [12]:
import time
from fuzzywuzzy import process
import numpy as np
from os import cpu_count
from functools import partial
from multiprocessing import Pool

# reset the index of the training set after previous filtering
df_pp.reset_index(drop=False, inplace=True)

# Filtering function. Adjust pad to narrow down the candidate matches to within a certain length of characters of the given sample.
def fuzzfilter(sample, candidates, pad):
  candidates = [x for x in candidates if len(x) <= len(sample)+pad and len(x) >= len(sample)-pad] 
  if len(candidates) > 0:
    return process.extractOne(sample, candidates)[1]
  else:
    return np.nan

In [13]:
start_time = time.time()

with Pool(cpu_count()-1) as pool:
    scores = pool.map(partial(fuzzfilter, candidates=list(en_test_sents), pad=5), df_pp['source_sentence'])
hours, rem = divmod(time.time() - start_time, 3600)
minutes, seconds = divmod(rem, 60)
print("done in {}h:{}min:{}seconds".format(hours, minutes, seconds))

# Filter out "almost overlapping samples"
df_pp = df_pp.assign(scores=scores)
df_pp = df_pp[df_pp['scores'] < 95]

done in 0.0h:0.0min:2.326791286468506seconds


In [14]:
# We use 30 dev test and the given test set.
import csv

# Do the split between dev/train and create parallel corpora
num_dev_patterns = 30

# Optional: lower case the corpora - this will make it easier to generalize, but without proper casing.
if lc:  # Julia: making lowercasing optional
    df_pp["source_sentence"] = df_pp["source_sentence"].str.lower()
    df_pp["target_sentence"] = df_pp["target_sentence"].str.lower()

# Julia: test sets are already generated
dev = df_pp.tail(num_dev_patterns) # Herman: Error in original
stripped = df_pp.drop(df_pp.tail(num_dev_patterns).index)

with open("train."+source_language, "w") as src_file, open("train."+target_language, "w") as trg_file:
  for index, row in stripped.iterrows():
    src_file.write(row["source_sentence"]+"\n")
    trg_file.write(row["target_sentence"]+"\n")
    
with open("dev."+source_language, "w") as src_file, open("dev."+target_language, "w") as trg_file:
  for index, row in dev.iterrows():
    src_file.write(row["source_sentence"]+"\n")
    trg_file.write(row["target_sentence"]+"\n")

stripped[["source_sentence"]].to_csv("train."+source_language, header=False, index=False)  # Herman: Added `header=False` everywhere
stripped[["target_sentence"]].to_csv("train."+target_language, header=False, index=False)  # Julia: Problematic handling of quotation marks.

dev[["source_sentence"]].to_csv("dev."+source_language, header=False, index=False)
dev[["target_sentence"]].to_csv("dev."+target_language, header=False, index=False)

# Doublecheck the format below. There should be no extra quotation marks or weird characters.
! head train.*
! head dev.*

==> train.en <==
"The festivals can boost consumption , what ’ s wrong with that ?"
Child : I will not celebrate Western festivals Christmas is not a Chinese people ’ s festival .
I am Chinese and I don ’ t celebrate Western festivals .
"The commentary references the history of the Eight-Nation Alliance , a coalition formed in response to the Boxer Rebellion in China between 1899 and 1901 when Chinese peasants rose up against foreign , colonial , Christian rule and culture ."
"In their panel session , they credited Global Voices ' campaign for keeping them alive ."
The company has forbidden the celebration of Western festivals .
"Yet , the Zone9 bloggers redefined patriotism with both their words and actions ."
A Weibo user expressed frustration :
"If people of a nation are too enthusiastic in celebrating other nations ’ festivals , it indicates that the country is suffered from extremely serious cultural invasion ."
"In aviation there are few women , very few , this is not only here b

In [15]:
# !pip install datasets evaluate transformers[sentencepiece]

# # Install Pytorch with GPU support v1.7.1.
# ! pip3 install torch==1.10.0+cu113 torchvision==0.11.1+cu113 torchaudio===0.10.0+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html

## Preprocessing the Data (pytorch)

In [1]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# Same as before
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# This is new
batch["labels"] = torch.tensor([1, 1])

optimizer = torch.optim.AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [2]:
from datasets import load_dataset

raw_datasets = load_dataset("yoruba_text_c3")
raw_datasets

Reusing dataset yoruba_text_c3 (/root/.cache/huggingface/datasets/yoruba_text_c3/yoruba_text_c3/1.0.0/dbf0b0085c03d98b73f35fe80f4e75928025b483958c24f86440104921830d98)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 562238
    })
})

In [3]:
raw_train_dataset = raw_datasets["train"]
raw_train_dataset[0]

{'text': 'lílo àkàbà — ǹjẹ́ o máa ń ṣe àyẹ̀wò wọ̀nyí tó lè dáàbò bò ẹ́'}

In [4]:
raw_train_dataset.features

{'text': Value(dtype='string', id=None)}

In [5]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["text"])

Token indices sequence length is longer than the specified maximum sequence length for this model (3774 > 512). Running this sequence through the model will result in indexing errors


In [6]:
inputs = tokenizer("This is the first sentence.", "This is the second one.")
inputs

{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
tokenizer.convert_ids_to_tokens(inputs["input_ids"])

['[CLS]',
 'this',
 'is',
 'the',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

In [None]:
tokenized_dataset = tokenizer(
    raw_datasets["train"]["text"],
    padding=True,
    truncation=True,
)

In [None]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)