# **Machine learning for low-resource NLP**: Advancing AI for Linguistic Inclusion
Cross-lingual transfer learning and pseudo-labeling for multilingual named entity recognition

**General Imports:** Import fundamental libraries

In [None]:
import torch
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from IPython.display import display, Markdown

import torch
from transformers import BertTokenizerFast
from config import BaseConfig
from torch.utils.data import DataLoader

**User-defined Imports:** Import custom classes and functions

In [None]:
from model import BertBilstmCrf
from train import train_model, evaluate_epoch
from utils.dataloader import create_dataloaders
from pseudo_labeling import train_pseudo_labeling

**Configs:** Constants for model training

In [1]:
import torch

class BaseConfig:
    """Settings shared by every training stage in this notebook.

    The stage-specific config classes inherit from this one, so anything
    defined here is reachable through them as well.
    """

    # Miscellaneous
    RANDOM_STATE          = 42
    # Use the GPU when one is present, otherwise fall back to the CPU so the
    # notebook still runs (slowly) on a machine without CUDA.
    DEVICE                = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data
    # The two lists are consumed pairwise (zipped by position) by the
    # transfer-learning and pseudo-labeling cells, so their order matters.
    low_resource_langs    = ["mg", "fo", "co", "hsb", "bh", "cv"]
    high_resource_langs   = ["id", "da", "it", "pl", "hi", "tr"]

    NUM_TAGS              = 7    # tag count handed to the BertBilstmCrf constructor
    BATCH_SIZE            = 32   # batch size given to every DataLoader
    MAX_SEQ_LEN           = 80   # fixed sequence length produced by NERDataset

class TrainConfig(BaseConfig):
    """Hyper-parameters for training from scratch (baseline and pre-training).

    ``setup_optimizer`` reads its learning rates and weight decay from this
    class; ``EPOCHS`` and ``PATIENCE`` are consumed by the training helpers
    imported from ``train`` (semantics defined there).
    """

    EPOCHS                = 20
    PATIENCE              = 5
    BERT_LEARNING_RATE    = 3e-5
    LSTM_LEARNING_RATE    = 5e-3
    CRF_LEARNING_RATE     = 5e-5
    WEIGHT_DECAY          = 2e-2

class FineTuneConfig(BaseConfig):
    """Hyper-parameters for the fine-tuning stage of cross-lingual transfer.

    Shorter schedule and smaller learning rates than ``TrainConfig``.
    No ``WEIGHT_DECAY`` is defined on this class.
    """

    EPOCHS                = 15
    PATIENCE              = 3
    BERT_LEARNING_RATE    = 2e-5
    LSTM_LEARNING_RATE    = 3e-3
    CRF_LEARNING_RATE     = 3e-5

class PseudoLabelingConfig(BaseConfig):
    """Hyper-parameters handed to ``train_pseudo_labeling``.

    The three pseudo-labeling knobs at the bottom are interpreted inside the
    ``pseudo_labeling`` module, which is not part of this notebook.
    """

    EPOCHS                = 25
    PATIENCE              = 5
    BERT_LEARNING_RATE    = 2e-5
    LSTM_LEARNING_RATE    = 3e-3
    CRF_LEARNING_RATE     = 3e-5

    # NOTE(review): presumably the confidence quantile / entropy ceiling a
    # pseudo-labeled sentence must meet, and the number of warm-up epochs
    # before pseudo-labels are added -- confirm against pseudo_labeling.py.
    CONFIDENCE_QUANTILE   = 0.965
    PSEUDO_DELAY          = 8
    ENTROPY_THRESHOLD     = 0.2

**Set Random Seed:** Set all random seeds to ensure reproducibility of results

In [3]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds torch (CPU and CUDA), NumPy and the stdlib ``random`` module with
    the same value, then switches cuDNN to deterministic mode with
    auto-tuning (benchmark) turned off.

    Args:
        seed: Integer seed applied to all generators.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

# Seed all generators once, up front, with the shared RANDOM_STATE value
set_seed(TrainConfig.RANDOM_STATE)

**Data Processing:** Load WikiANN data from HuggingFace and split into train/val/test

In [4]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split

def load_wikiann_datasets(language_codes, cutoff=None):
    """Download WikiANN for each language and re-split it 80/10/10.

    The published train/validation/test splits are concatenated (in that
    order), optionally truncated to the first ``cutoff`` rows, and then
    split again with ``TrainConfig.RANDOM_STATE`` as the seed.

    Args:
        language_codes: Iterable of WikiANN configuration names (e.g. "mg").
        cutoff: Keep only the first ``cutoff`` rows of the combined data.
            Any falsy value (``None`` or ``0``) keeps everything.

    Returns:
        Dict mapping each language code to
        ``{"train": DataFrame, "val": DataFrame, "test": DataFrame}``.
    """

    def _resplit(lang):
        # Load raw data from hugging face
        raw = load_dataset("unimelb-nlp/wikiann", name=lang)

        # Merge the published splits into one frame with a fresh index
        frames = [pd.DataFrame(raw[part]) for part in ("train", "validation", "test")]
        combined = pd.concat(frames).reset_index(drop=True)
        if cutoff:
            combined = combined.head(cutoff)

        # 80% train, then the remaining 20% halved into val and test
        seed = TrainConfig.RANDOM_STATE
        train_part, held_out = train_test_split(combined, test_size=0.2, random_state=seed)
        val_part, test_part = train_test_split(held_out, test_size=0.5, random_state=seed)

        return {"train": train_part, "val": val_part, "test": test_part}

    return {lang: _resplit(lang) for lang in language_codes}

# Download the data for every low-resource language (no cutoff) and keep the
# re-split frames in a notebook-level variable used by the cells below
low_resource_datasets = load_wikiann_datasets(TrainConfig.low_resource_langs)

**NER Dataset:** Define the dataset class for the WikiANN NER data

In [None]:
class NERDataset:
    """Map-style dataset turning word-level NER samples into fixed-length BERT inputs.

    Every word is tokenised on its own and its tag is repeated for each
    word-piece it yields. The pieces are truncated to ``MAX_LEN - 2``, wrapped
    in the start/end ids and right-padded with zeros up to ``MAX_LEN``.
    """

    def __init__(self, texts, tags, include_sentence = False):
        """Store the samples and load the tokenizer.

        Args:
            texts: Indexable collection of sentences, each a list of words.
            tags: Indexable collection of per-word tag-id lists, aligned with ``texts``.
            include_sentence: When true, each item also carries the
                space-joined source words.
        """
        self.texts = texts
        self.tags = tags

        # Determines if the original sentence is returned for each batch
        self.include_sentence = include_sentence

        # NOTE(review): do_lower_case=True is combined with a *cased*
        # checkpoint -- confirm this is intended before changing it, since
        # saved models were trained with this tokenisation.
        self.tokenizer = BertTokenizerFast.from_pretrained(
            "google-bert/bert-base-multilingual-cased", do_lower_case = True
        )

        self.MAX_LEN = BaseConfig.MAX_SEQ_LEN

        # Hard-coded ids placed at the start / end of every sequence and the
        # tag id used at those two positions (presumably the [CLS], [SEP] and
        # [PAD] ids of this vocabulary -- TODO confirm against the tokenizer).
        self.CLS_TOKEN = [101]
        self.SEP_TOKEN = [102]
        self.PAD_TOKEN = [0]

    def __len__(self):
        """Return the number of sentences."""
        return len(self.texts)

    def __getitem__(self, index):
        """Encode sentence ``index`` into tensors of length ``MAX_LEN``.

        Returns:
            Dict with ``input_ids``, ``target_tags``, ``attention_mask`` and
            ``token_type_ids`` (all ``torch.long``), plus ``orginal_text``
            when ``include_sentence`` is set.
        """
        words = self.texts[index]
        word_tags = self.tags[index]

        # Tokenise word by word so every word-piece inherits its word's tag
        piece_ids, piece_tags = [], []
        for position, word in enumerate(words):
            pieces = self.tokenizer.encode(word, add_special_tokens = False)
            piece_ids += pieces
            piece_tags += [word_tags[position]] * len(pieces)

        # Leave room for the two special tokens, then wrap the sequence
        body_len = self.MAX_LEN - 2
        input_ids = self.CLS_TOKEN + piece_ids[:body_len] + self.SEP_TOKEN
        labels = self.PAD_TOKEN + piece_tags[:body_len] + self.PAD_TOKEN

        # Zero-fill up to MAX_LEN; the same filler serves ids, tags and masks
        used = len(input_ids)
        filler = [0] * (self.MAX_LEN - used)

        sample = {
            "input_ids": torch.tensor(input_ids + filler, dtype = torch.long),
            "target_tags": torch.tensor(labels + filler, dtype = torch.long),
            "attention_mask": torch.tensor([1] * used + filler, dtype = torch.long),
            "token_type_ids": torch.tensor([0] * used + filler, dtype = torch.long),
        }
        if self.include_sentence:
            sample["orginal_text"] = " ".join(words)

        return sample

**Dataloaders:** Define functions for creating dataloaders

In [None]:
def create_dataloader(lang_split_data, CONFIG, include_sentence=False):
    """Wrap one data split in an ``NERDataset`` and a ``DataLoader``.

    Args:
        lang_split_data: DataFrame with ``tokens`` and ``ner_tags`` columns.
        CONFIG: Config class; only ``BATCH_SIZE`` is read here.
        include_sentence: Forwarded to ``NERDataset``.

    Returns:
        DataLoader over the split. No shuffle argument is passed.
    """
    tokens = lang_split_data["tokens"].to_list()
    ner_tags = lang_split_data["ner_tags"].to_list()
    dataset = NERDataset(tokens, ner_tags, include_sentence = include_sentence)

    return DataLoader(dataset, CONFIG.BATCH_SIZE)

def create_dataloaders(lang_data, CONFIG):
    """Build the train, validation and test loaders for one language.

    Args:
        lang_data: Dict with ``"train"``, ``"val"`` and ``"test"`` DataFrames.
        CONFIG: Config class forwarded to ``create_dataloader``.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    split_names = ("train", "val", "test")
    return tuple(create_dataloader(lang_data[name], CONFIG) for name in split_names)

**Setup Optimizer:** Set up the optimizer with different learning rates for separate layers

In [5]:
import torch.optim as optim

def setup_optimizer(model, CONFIG=None):
    """Build an Adam optimizer with a separate learning rate per sub-module.

    Args:
        model: Model whose ``bert``, ``lstm`` and ``crf`` attributes (whichever
            are present) each get their own parameter group.
        CONFIG: Optional config class providing ``BERT_LEARNING_RATE``,
            ``LSTM_LEARNING_RATE``, ``CRF_LEARNING_RATE`` and, optionally,
            ``WEIGHT_DECAY``. Defaults to ``TrainConfig``, which keeps the
            behavior of existing ``setup_optimizer(model)`` calls unchanged.

    Returns:
        ``torch.optim.Adam`` over the parameter groups that were found.
    """
    if CONFIG is None:
        CONFIG = TrainConfig

    # Not every stage config defines a weight decay; fall back to the
    # training value, which is what was always used before.
    weight_decay = getattr(CONFIG, "WEIGHT_DECAY", None)
    if weight_decay is None:
        weight_decay = TrainConfig.WEIGHT_DECAY

    # Check model layers and add appropriate learning rates
    layer_rates = (
        ("bert", "BERT_LEARNING_RATE"),
        ("lstm", "LSTM_LEARNING_RATE"),
        ("crf", "CRF_LEARNING_RATE"),
    )
    param_groups = [
        {"params": getattr(model, layer).parameters(), "lr": getattr(CONFIG, rate)}
        for layer, rate in layer_rates
        if hasattr(model, layer)
    ]

    return optim.Adam(param_groups, weight_decay=weight_decay)

**Baseline:** Train baseline models; save weights and performance scores for analysis

In [None]:
# Train one baseline model per low-resource language using only that
# language's own data, then record train/val/test F1 for later comparison.
baseline_results = []

# Iterate through low-resource languages
for lang, lang_data in tqdm(low_resource_datasets.items(), ncols=80):

    train_loader, val_loader, test_loader = create_dataloaders(lang_data, TrainConfig)

    # ------------------------------------------ TRAINING ------------------------------------------ #

    model = BertBilstmCrf(TrainConfig.NUM_TAGS).to(TrainConfig.DEVICE)
    optimizer = setup_optimizer(model)
    # train_model hands back the state dict to keep plus the train/val F1 scores
    best_model_state, train_f1, val_f1 = train_model(model, optimizer, train_loader, val_loader, TrainConfig)

    # ------------------------------------------ EVALUATION ------------------------------------------ #
    # Evaluate a fresh model loaded with the returned state rather than the
    # model object that was just trained
    eval_model = BertBilstmCrf(TrainConfig.NUM_TAGS).to(TrainConfig.DEVICE)
    eval_model.load_state_dict(best_model_state)
    test_loss, test_f1 = evaluate_epoch(eval_model, test_loader, TrainConfig)

    # ------------------------------------------ RESULTS ------------------------------------------ #
    # These weights are reloaded by the pseudo-labeling cells further down
    torch.save(best_model_state, f"models/{lang}_baseline.pth")

    baseline_results.append({
        "language" : lang,
        "train_f1" : train_f1,
        "val_f1"   : val_f1,
        "test_f1"  : test_f1
    })

# Save and display results
baseline = pd.DataFrame(baseline_results)
baseline.to_csv("results/baseline.csv", index=False)

markdown_table = baseline.to_markdown(index=False)
display(Markdown(markdown_table))

100%|█████████████████████████████████████████████| 6/6 [04:31<00:00, 45.20s/it]


| language   |   train_f1 |   val_f1 |   test_f1 |
|:-----------|-----------:|---------:|----------:|
| mg         |   0.993728 | 0.933673 |  0.960352 |
| fo         |   0.974224 | 0.897482 |  0.901099 |
| co         |   0.956204 | 0.852308 |  0.81323  |
| hsb        |   0.951443 | 0.923387 |  0.854578 |
| bh         |   0.981549 | 0.888689 |  0.80212  |
| cv         |   0.977741 | 0.892617 |  0.830443 |

In [6]:
# Reload the saved baseline scores so the cells below can compute improvements
# without re-running the baseline training
baseline = pd.read_csv("results/baseline.csv")

**Cross-lingual transfer learning:** Train model on high-resource language data, then fine-tune on target low-resource language

In [None]:
# Cross-lingual transfer sweep: for each augmentation factor, pre-train on a
# high-resource language, fine-tune on its paired low-resource language and
# compare the test F1 against the stored baseline.
transfer_results = []

for augmentation_factor in tqdm(range(1, 24), ncols=80):

    # The factor scales how many high-resource rows are kept (240 per step)
    high_resource_datasets = load_wikiann_datasets(TrainConfig.high_resource_langs, augmentation_factor * 240)

    # Iterate through low-resource and adjacent high-resource languages
    # (the two dicts are paired purely by insertion order)
    for (low_resource_lang, low_resource_data), (high_resource_lang, high_resource_data) in tqdm(zip(
            low_resource_datasets.items(), high_resource_datasets.items()
        ), ncols=80, leave=False):

        high_train_loader, high_val_loader, _ = create_dataloaders(high_resource_data, TrainConfig)
        low_train_loader, low_val_loader, low_test_loader = create_dataloaders(low_resource_data, TrainConfig)

        # ------------------------------------------ PRE-TRAINING ------------------------------------------ #

        high_resource_model = BertBilstmCrf(TrainConfig.NUM_TAGS).to(TrainConfig.DEVICE)
        optimizer = setup_optimizer(high_resource_model)

        # train_f1 / val_f1 from this stage are overwritten by the fine-tuning stage below
        high_resource_model_state, train_f1, val_f1 = train_model(high_resource_model, optimizer, high_train_loader, high_val_loader, TrainConfig)

        # ------------------------------------------ FINE-TUNING ------------------------------------------ #

        model = BertBilstmCrf(FineTuneConfig.NUM_TAGS).to(FineTuneConfig.DEVICE)
        model.load_state_dict(high_resource_model_state)
        # NOTE(review): setup_optimizer(model) takes its learning rates from
        # TrainConfig, so the learning rates defined on FineTuneConfig are not
        # applied here -- confirm whether that is intended.
        optimizer = setup_optimizer(model)

        best_model_state, train_f1, val_f1 = train_model(model, optimizer, low_train_loader, low_val_loader, FineTuneConfig)

        # ------------------------------------------ EVALUATION ------------------------------------------ #

        eval_model = BertBilstmCrf(FineTuneConfig.NUM_TAGS).to(FineTuneConfig.DEVICE)
        eval_model.load_state_dict(best_model_state)
        test_loss, test_f1 = evaluate_epoch(eval_model, low_test_loader, FineTuneConfig)

        # ------------------------------------------ RESULTS ------------------------------------------ #
        # The file name has no augmentation factor, so each sweep step
        # overwrites the checkpoint written by the previous one
        torch.save(best_model_state, f"models/{low_resource_lang}_{high_resource_lang}_transfer.pth")

        # Relative change in test F1 versus the baseline, in percent
        baseline_performance = baseline.loc[baseline["language"] == low_resource_lang, "test_f1"].item()
        improvement = (test_f1 - baseline_performance) / baseline_performance * 100

        transfer_results.append({
            "high_resource_language" : high_resource_lang,
            "low_resource_lang"      : low_resource_lang,
            "augmentation_factor"    : augmentation_factor,
            "train_f1"               : train_f1,
            "val_f1"                 : val_f1,
            "test_f1"                : test_f1,
            "improvement"            : improvement
        })

        print(f"Aug: {augmentation_factor}  {low_resource_lang} Improvement over baseline: {improvement:.5f}")

transfer_data = pd.DataFrame(transfer_results)
transfer_data.to_csv("results/transfer_learning.csv", index=False)

In [7]:
# Load unlabeled text data
# Load unlabeled text data, one sentence per line. The encoding is given
# explicitly because the corpus is not ASCII and the platform default
# encoding is not UTF-8 everywhere.
with open("data/unlabeled/bh_texts.txt", encoding="utf-8") as file:
    unlabeled_sentences = file.readlines()

# Whitespace-tokenise every line and attach all-zero placeholder tags so the
# frame has the same columns as the labeled splits
unlabeled_data = pd.DataFrame({"tokens" : unlabeled_sentences})
unlabeled_data["tokens"] = unlabeled_data["tokens"].apply(lambda sent: sent.split())
unlabeled_data["ner_tags"] = unlabeled_data["tokens"].apply(lambda sent: [0] * len(sent))

# map_location lets a checkpoint saved on one device be loaded on another
state = torch.load("models/bh_baseline.pth", map_location=TrainConfig.DEVICE)
model = BertBilstmCrf(TrainConfig.NUM_TAGS).to(TrainConfig.DEVICE)
model.load_state_dict(state)

# Label the unlabeled sentences with the baseline model
from pseudo_labeling import generate_pseudo_labels
df = generate_pseudo_labels(model, unlabeled_data, TrainConfig)

In [8]:
# Show the frame returned by generate_pseudo_labels
df

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,tokens,ner_tags,confidence_score,entropy
0,"[आर.एच., सॉन्डर्स, (सेंट, लॉरेंस, नदी), (968, ...","[1, 1, 1, 2, 0, 2, 2]",0.737924,0.772461
1,"['Anders, Lindström']","[0, 1]",0.741672,0.800781
2,"[Karl, Ove, Knausgård, (, जनम, 1968, )]","[0, 0, 0, 1, 1, 1, 0]",0.700457,0.963379
3,"[अटलांटिक, सिटी,, न्यू, जर्सी]","[5, 5, 5, 5]",0.758022,0.772949
4,"[ओकरा, दोसरा, बियाह, से, बिटिया, रहल, Marie, d...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.973245,0.159058
...,...,...,...,...
9997,"[गोरिल्लाज़, –, ``, डेयर, '']","[5, 5, 5, 5, 5]",0.803606,0.486328
9998,"[इंग्लैंड, के, राजा, हेनरी, प्रथम]","[1, 1, 1, 1, 2]",0.773381,0.898438
9999,"[''जोर, से, बोल'']","[0, 0, 0]",0.778720,0.427246
10000,"[Hampton, Beach,, न्यू, हैम्पशायर]","[0, 1, 1, 0]",0.703552,1.029297


In [None]:
# Pseudo-labeling, warm-started from the saved baseline weights, with the
# paired high-resource training split used as the pool of extra sentences.
# NOTE: writes the same model and CSV paths as the other pseudo-labeling cells.
iterative_pseudo_labeling_results = []

# Load up to 10000 rows for each high-resource language
high_resource_datasets = load_wikiann_datasets(TrainConfig.high_resource_langs, 10000)

# Iterate through low-resource and adjacent high-resource languages
# (the two dicts are paired purely by insertion order)
for (lang, low_resource_data), (_, high_resource_data) in tqdm(zip(
        low_resource_datasets.items(), high_resource_datasets.items()
    ), ncols=80, leave=False):

    train_loader, val_loader, test_loader = create_dataloaders(low_resource_data, TrainConfig)
    # The high-resource training frame is passed as the "unlabeled" pool
    unlabeled_data = high_resource_data["train"]

    # ------------------------------------------ TRAINING ------------------------------------------ #

    model = BertBilstmCrf(TrainConfig.NUM_TAGS).to(TrainConfig.DEVICE)
    # map_location lets a checkpoint saved on one device be loaded on another
    model.load_state_dict(torch.load(f"models/{lang}_baseline.pth", map_location=TrainConfig.DEVICE))
    optimizer = setup_optimizer(model)
    best_model_state, train_f1, val_f1 = train_pseudo_labeling(model, optimizer, train_loader, val_loader, unlabeled_data, PseudoLabelingConfig)

    # ------------------------------------------ EVALUATION ------------------------------------------ #

    eval_model = BertBilstmCrf(TrainConfig.NUM_TAGS).to(TrainConfig.DEVICE)
    eval_model.load_state_dict(best_model_state)
    test_loss, test_f1 = evaluate_epoch(eval_model, test_loader, TrainConfig)

    # ------------------------------------------ RESULTS ------------------------------------------ #
    torch.save(best_model_state, f"models/{lang}_iterative_pseudo_labeling.pth")

    # Relative change in test F1 versus the baseline, in percent
    baseline_performance = baseline.loc[baseline["language"] == lang, "test_f1"].item()
    improvement = (test_f1 - baseline_performance) / baseline_performance * 100

    iterative_pseudo_labeling_results.append({
        "language"    : lang,
        "train_f1"    : train_f1,
        "val_f1"      : val_f1,
        "test_f1"     : test_f1,
        "improvement" : improvement
    })

    print(f"Language: {lang}    Improvement over baseline: {improvement:.5f}")


# Save results
iterative_pseudo_labeling = pd.DataFrame(iterative_pseudo_labeling_results)
iterative_pseudo_labeling.to_csv("results/iterative_pseudo_labeling.csv", index=False)

In [None]:
# Pseudo-labeling with a freshly initialised model (no baseline weights are
# loaded), using the paired high-resource training split as the pool of extra
# sentences. NOTE: writes the same model and CSV paths as the other
# pseudo-labeling cells.
iterative_pseudo_labeling_results = []

# Load up to 10000 rows for each high-resource language
high_resource_datasets = load_wikiann_datasets(TrainConfig.high_resource_langs, 10000)

# Iterate through low-resource and adjacent high-resource languages
# (the two dicts are paired purely by insertion order)
for (lang, low_resource_data), (_, high_resource_data) in tqdm(zip(
        low_resource_datasets.items(), high_resource_datasets.items()
    ), ncols=80, leave=False):

    train_loader, val_loader, test_loader = create_dataloaders(low_resource_data, TrainConfig)
    # The high-resource training frame is passed as the "unlabeled" pool
    unlabeled_data = high_resource_data["train"]

    # ------------------------------------------ TRAINING ------------------------------------------ #

    model = BertBilstmCrf(TrainConfig.NUM_TAGS).to(TrainConfig.DEVICE)
    optimizer = setup_optimizer(model)
    best_model_state, train_f1, val_f1 = train_pseudo_labeling(model, optimizer, train_loader, val_loader, unlabeled_data, PseudoLabelingConfig)

    # ------------------------------------------ EVALUATION ------------------------------------------ #
    
    eval_model = BertBilstmCrf(TrainConfig.NUM_TAGS).to(TrainConfig.DEVICE)
    eval_model.load_state_dict(best_model_state)
    test_loss, test_f1 = evaluate_epoch(eval_model, test_loader, TrainConfig)

    # ------------------------------------------ RESULTS ------------------------------------------ #
    torch.save(best_model_state, f"models/{lang}_iterative_pseudo_labeling.pth")

    # Relative change in test F1 versus the baseline, in percent
    baseline_performance = baseline.loc[baseline["language"] == lang, "test_f1"].item()
    improvement = (test_f1 - baseline_performance) / baseline_performance * 100

    iterative_pseudo_labeling_results.append({
        "language"    : lang,
        "train_f1"    : train_f1,
        "val_f1"      : val_f1,
        "test_f1"     : test_f1,
        "improvement" : improvement
    })

    print(f"Language: {lang}    Improvement over baseline: {improvement:.5f}")


# Save results
iterative_pseudo_labeling = pd.DataFrame(iterative_pseudo_labeling_results)
iterative_pseudo_labeling.to_csv("results/iterative_pseudo_labeling.csv", index=False)

0it [00:00, ?it/s]

Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Added 279 rows of data
Added 10 rows of data
Added 126 rows of data
Added 0 rows of data
Added 79 rows of data


1it [07:48, 468.02s/it]

Language: mg    Improvement over baseline: 0.00000
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Added 71 rows of data
Added 3 rows of data
Added 36 rows of data
Added 121 rows of data
Added 280 rows of data


2it [15:29, 464.31s/it]

Language: fo    Improvement over baseline: -1.74216
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Added 0 rows of data
Added 123 rows of data
Added 0 rows of data
Added 6 rows of data
Added 0 rows of data
Added 0 rows of data
Added 66 rows of data
Added 0 rows of data
Added 0 rows of data
Added 2 rows of data
Added 1 rows of data
Added 3 rows of data
Added 4 rows of data
Added 4 rows of data
Added 4 rows of data
Added 4 rows of data


3it [29:21, 632.12s/it]

Language: co    Improvement over baseline: 7.65550
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data


4it [41:23, 667.59s/it]

Language: hsb    Improvement over baseline: -0.63025
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Added 110 rows of data
Added 196 rows of data
Added 195 rows of data


5it [45:35, 517.62s/it]

Language: bh    Improvement over baseline: 3.96476
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Early epoch
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data


                       

Language: cv    Improvement over baseline: -3.24826




**Iterative Pseudo Labeling:** During the training process, continuously generate labels for unlabeled data and add high-confidence pseudo-labels to the original training dataset

In [None]:
# Iterative pseudo-labeling with a per-language file of unlabeled sentences
# as the pool of extra training data.
iterative_pseudo_labeling_results = []

# Iterate through low-resource languages
for lang, lang_data in low_resource_datasets.items():

    train_loader, val_loader, test_loader = create_dataloaders(lang_data, PseudoLabelingConfig)

    # Load unlabeled text data, one sentence per line. The encoding is given
    # explicitly because the corpora are not ASCII and the platform default
    # encoding is not UTF-8 everywhere.
    with open(f"data/unlabeled/{lang}_texts.txt", encoding="utf-8") as file:
        unlabeled_sentences = file.readlines()

    # Whitespace-tokenise every line and attach all-zero placeholder tags so
    # the frame has the same columns as the labeled splits
    unlabeled_data = pd.DataFrame({"tokens" : unlabeled_sentences})
    unlabeled_data["tokens"] = unlabeled_data["tokens"].apply(lambda sent: sent.split())
    unlabeled_data["ner_tags"] = unlabeled_data["tokens"].apply(lambda sent: [0] * len(sent))

    # ------------------------------------------ TRAINING ------------------------------------------ #

    model = BertBilstmCrf(TrainConfig.NUM_TAGS).to(TrainConfig.DEVICE)
    optimizer = setup_optimizer(model)
    best_model_state, train_f1, val_f1 = train_pseudo_labeling(model, optimizer, train_loader, val_loader, unlabeled_data, PseudoLabelingConfig)

    # ------------------------------------------ EVALUATION ------------------------------------------ #

    eval_model = BertBilstmCrf(TrainConfig.NUM_TAGS).to(TrainConfig.DEVICE)
    eval_model.load_state_dict(best_model_state)
    test_loss, test_f1 = evaluate_epoch(eval_model, test_loader, TrainConfig)

    # ------------------------------------------ RESULTS ------------------------------------------ #
    torch.save(best_model_state, f"models/{lang}_iterative_pseudo_labeling.pth")

    # Relative change in test F1 versus the baseline, in percent
    baseline_performance = baseline.loc[baseline["language"] == lang, "test_f1"].item()
    improvement = (test_f1 - baseline_performance) / baseline_performance * 100

    iterative_pseudo_labeling_results.append({
        "language"    : lang,
        "train_f1"    : train_f1,
        "val_f1"      : val_f1,
        "test_f1"     : test_f1,
        "improvement" : improvement
    })

    print(f"Language: {lang}    Improvement over baseline: {improvement:.5f}")


# Save results
iterative_pseudo_labeling = pd.DataFrame(iterative_pseudo_labeling_results)
iterative_pseudo_labeling.to_csv("results/iterative_pseudo_labeling.csv", index=False)

Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 324 rows of data
Added 0 rows of data
Language: mg    Improvement over baseline: -1.37615
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Added 41 rows of data
Added 72 rows of data
Added 0 rows of data
Added 0 rows of data
Added 0 rows of data
Language: fo    Improvement over baseline: -5.57491
Added 0 rows of data
Added 0 rows of data
Added 21 rows of data
Added 5 rows of data
Added 0 rows of data
Added 2 rows of data
Added 0 rows of data
Added 85 rows of data
Added 0 rows of data
Added 0 rows of data
Language: co    Improvement over baseline: -8.61244
Added 0 rows of data
Added 0 rows of data
Added 2 rows of data
Added 21 rows of data
Added 5 rows of data
Added 9 rows of data
Added 8 rows of data
Added 26 rows of data
Added 0 rows of data
Added 3 rows of data
Added 10 rows of data
Added 11 rows of data
Added 4 rows of

KeyboardInterrupt: 