# Imports and installations

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 57.3 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 5.1 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import concatenate_datasets, load_dataset, Dataset
from transformers import (AutoModelWithLMHead,
                          AutoTokenizer,
                          TFAutoModelWithLMHead,
                          TFAutoModelForCausalLM,
                          AutoModelForCausalLM,
                          AutoConfig,
                          Trainer,
                          TextDataset,
                          DataCollatorForLanguageModeling,
                          TrainingArguments,
                          TrainerCallback,
                          pipeline
)

import os
from tqdm import tqdm

# CREATING AND PROCESSING DATASET PAWS-x

## Loading datasets

In [None]:
dataset_en = load_dataset("paws-x",'en')

Downloading builder script:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading and preparing dataset pawsx/en (download: 28.88 MiB, generated: 12.59 MiB, post-processed: Unknown size, total: 41.47 MiB) to /root/.cache/huggingface/datasets/pawsx/en/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755...


Downloading data:   0%|          | 0.00/30.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset pawsx downloaded and prepared to /root/.cache/huggingface/datasets/pawsx/en/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
dataset_es = load_dataset("paws-x",'es')

Downloading and preparing dataset pawsx/es (download: 28.88 MiB, generated: 13.20 MiB, post-processed: Unknown size, total: 42.08 MiB) to /root/.cache/huggingface/datasets/pawsx/es/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755...


Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset pawsx downloaded and prepared to /root/.cache/huggingface/datasets/pawsx/es/1.1.0/37673404a6de6d0fa2574661e77940d10d3be3bf51bb4f08c8fa079fd56b9755. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
dataset_es

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

In [None]:
dataset_en

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

## Concatenate default datasets

In [None]:
# Pool the three official PAWS-X splits of each language into one Dataset.
# Every split appears exactly once, so no example is duplicated and the test
# split is not left out of the pool.
PAWS_SPLITS = ('train', 'test', 'validation')
dataset_es = concatenate_datasets([dataset_es[split] for split in PAWS_SPLITS])
dataset_en = concatenate_datasets([dataset_en[split] for split in PAWS_SPLITS])

In [None]:
dataset_en

Dataset({
    features: ['id', 'sentence1', 'sentence2', 'label'],
    num_rows: 100802
})

In [None]:
# Tag every row with its language code so the origin survives the merge below.
dataset_es = dataset_es.add_column("lan", ['es'] * len(dataset_es))
dataset_en = dataset_en.add_column("lan", ['en'] * len(dataset_en))

In [None]:
dataset_comb = concatenate_datasets([dataset_es, dataset_en])

In [None]:
dataset_comb

Dataset({
    features: ['id', 'sentence1', 'sentence2', 'label', 'lan'],
    num_rows: 201604
})

## Filtering

In [None]:
def _has_positive_label(example):
    """Return True for rows whose ``label`` field equals 1."""
    return example['label'] == 1


# Keep only the rows labelled 1 and show the resulting Dataset summary.
dataset_comb = dataset_comb.filter(_has_positive_label)
print(dataset_comb)

  0%|          | 0/202 [00:00<?, ?ba/s]

Dataset({
    features: ['id', 'sentence1', 'sentence2', 'label', 'lan'],
    num_rows: 89026
})


## Shuffling

In [None]:
PAWS_X =  dataset_comb.shuffle(seed=42)

## To dataframe

In [None]:
PAWS_X = pd.DataFrame(PAWS_X, index = PAWS_X['id'] )


In [None]:
# Name the axis explicitly: the positional form drop('id', 1) emits a
# FutureWarning on older pandas and is rejected by pandas >= 2.0.
PAWS_X = PAWS_X.drop(columns='id')

  """Entry point for launching an IPython kernel.


In [None]:
PAWS_X

Unnamed: 0,sentence1,sentence2,label,lan
19990,La esposa de Quinatzin era una princesa de Hue...,La esposa de Quinatzin era una princesa de Hue...,1,es
10801,"The academy consists of east hall , central wi...","The academy consists of eastern hall , central...",1,en
20285,Critical Millennium is a 2010 graphic novel pu...,Critical Millennium is a novel published by Ar...,1,en
39049,Stenolechia zelosaris is a moth of the family ...,Stenolechia zelosaris is a moth from the famil...,1,en
28878,Gary Lucy was paired with Vanilla Ice in serie...,Gary Lucy was paired with Vanilla Ice in serie...,1,en
...,...,...,...,...
29788,"El 30 de agosto de 1853, la hija de Stephen Ch...","El 30 de agosto de 1853, la hija de Stephen Ch...",1,es
8953,Other types of course offered by the Departmen...,Other types of courses offered by the departme...,1,en
12708,"Algunos tienen efectos negativos, algunos posi...",Algunos tienen efectos negativos y algunos pos...,1,es
605,"Con el debilitamiento del dólar canadiense, la...","En noviembre y diciembre de 2015, con el debil...",1,es


In [None]:
# Every remaining row has label == 1, so the column carries no information.
# columns= is used because the positional axis argument is rejected by pandas >= 2.0.
PAWS_X = PAWS_X.drop(columns='label')

  """Entry point for launching an IPython kernel.


In [None]:
PAWS_X = PAWS_X.reset_index(drop=True)


In [None]:
PAWS_X

Unnamed: 0,sentence1,sentence2,lan
0,La esposa de Quinatzin era una princesa de Hue...,La esposa de Quinatzin era una princesa de Hue...,es
1,"The academy consists of east hall , central wi...","The academy consists of eastern hall , central...",en
2,Critical Millennium is a 2010 graphic novel pu...,Critical Millennium is a novel published by Ar...,en
3,Stenolechia zelosaris is a moth of the family ...,Stenolechia zelosaris is a moth from the famil...,en
4,Gary Lucy was paired with Vanilla Ice in serie...,Gary Lucy was paired with Vanilla Ice in serie...,en
...,...,...,...
89021,"El 30 de agosto de 1853, la hija de Stephen Ch...","El 30 de agosto de 1853, la hija de Stephen Ch...",es
89022,Other types of course offered by the Departmen...,Other types of courses offered by the departme...,en
89023,"Algunos tienen efectos negativos, algunos posi...",Algunos tienen efectos negativos y algunos pos...,es
89024,"Con el debilitamiento del dólar canadiense, la...","En noviembre y diciembre de 2015, con el debil...",es


## Saving dataset

In [None]:
PAWS_X.to_csv(r'/content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/datasets/PAWSX/PAWS_X.csv',index=False)


# CREATING AND PROCESSING DATASET TAPACO

## Loading datasets

In [None]:
tapaco_es = load_dataset("tapaco", "es")
tapaco_en = load_dataset("tapaco", "en")

Reusing dataset tapaco (/root/.cache/huggingface/datasets/tapaco/es/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698)


  0%|          | 0/1 [00:00<?, ?it/s]

Reusing dataset tapaco (/root/.cache/huggingface/datasets/tapaco/en/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
tapaco_es['train']

Dataset({
    features: ['paraphrase_set_id', 'sentence_id', 'paraphrase', 'lists', 'tags', 'language'],
    num_rows: 85064
})

In [None]:
tapaco_en['train']

Dataset({
    features: ['paraphrase_set_id', 'sentence_id', 'paraphrase', 'lists', 'tags', 'language'],
    num_rows: 158053
})

## Processing datasets

In [None]:


#Loading dataset
dataset1 = load_dataset('tapaco', 'en')
dataset2 = load_dataset('tapaco', 'es')
def process_tapaco_dataset(dataset, out_file):
    """Flatten the ``train`` split of a TaPaCo dataset into a DataFrame and save it.

    Args:
        dataset: Mapping with a ``"train"`` entry that yields one dict-like
            example per row, keyed by the field names listed in ``columns``.
        out_file: Destination path; the table is written tab-separated and
            without the index.

    Returns:
        pandas.DataFrame with one row per example and the columns
        ``paraphrase_set_id``, ``sentence_id``, ``paraphrase``, ``lists``,
        ``tags`` and ``language``.

    Raises:
        KeyError: if an example lacks one of the expected fields.
    """
    columns = [
        "paraphrase_set_id",
        "sentence_id",
        "paraphrase",
        "lists",
        "tags",
        "language",
    ]
    # The dataset has only a train split.
    # Fields are looked up by name, so each value lands under its own header
    # regardless of the key order of the example dict.
    records = [
        [example[column] for column in columns]
        for example in tqdm(dataset["train"])
    ]
    tapaco_df = pd.DataFrame(data=records, columns=columns)
    tapaco_df.to_csv(out_file, sep="\t", index=False)
    return tapaco_df

tapaco_df_en = process_tapaco_dataset(dataset1,r"/content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/datasets/tapaco/tapaco_en.csv")
tapaco_df_es = process_tapaco_dataset(dataset2,r"/content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/datasets/tapaco/tapaco_es.csv")


100%|██████████| 158053/158053 [00:27<00:00, 5833.37it/s]
100%|██████████| 85064/85064 [00:18<00:00, 4523.44it/s]


In [None]:
def generate_tapaco_paraphrase_dataset(dataset):
    """Turn TaPaCo paraphrase sets into disjoint (Text, Paraphrase) pairs.

    Sentences sharing a ``paraphrase_set_id`` are paired consecutively in row
    order: 1st with 2nd, 3rd with 4th, and so on. The last sentence of a set
    with an odd number of members is dropped, so single-sentence sets produce
    no pair at all.

    Args:
        dataset: DataFrame with at least the columns ``paraphrase`` and
            ``paraphrase_set_id``.

    Returns:
        pandas.DataFrame with columns ``Text`` and ``Paraphrase``, one row per
        pair. Sets appear in order of first occurrence in ``dataset``.

    Raises:
        KeyError: if one of the two required columns is missing.
    """
    # One grouped pass collects the sentences of every set; sort=False keeps
    # the sets in first-appearance order and rows keep their original order
    # inside each set. This replaces a full-frame scan per set id.
    phrases_by_set = (
        dataset[["paraphrase", "paraphrase_set_id"]]
        .groupby("paraphrase_set_id", sort=False)["paraphrase"]
        .agg(list)
    )

    # range(0, len - 1, 2) yields the start index of every complete pair and
    # is empty for sets with fewer than two sentences.
    tapaco_paraphrases_dataset = [
        [phrases[start], phrases[start + 1]]
        for phrases in phrases_by_set
        for start in range(0, len(phrases) - 1, 2)
    ]

    return pd.DataFrame(tapaco_paraphrases_dataset, columns=["Text", "Paraphrase"])
    
tapaco_paraphrases_dataset_df_es = generate_tapaco_paraphrase_dataset(tapaco_df_es)
tapaco_paraphrases_dataset_df_en = generate_tapaco_paraphrase_dataset(tapaco_df_en)

100%|██████████| 32691/32691 [04:23<00:00, 124.02it/s]
100%|██████████| 62044/62044 [17:22<00:00, 59.50it/s]


In [None]:
# Label each pair table with its language; the scalar is broadcast to every row.
tapaco_paraphrases_dataset_df_es['lan'] = 'es'
tapaco_paraphrases_dataset_df_en['lan'] = 'en'

## Concatenate en and es datasets

In [None]:
tapaco_x = pd.concat([tapaco_paraphrases_dataset_df_en, tapaco_paraphrases_dataset_df_es], ignore_index=True, sort=False)

## Shuffling

In [None]:
# Shuffle all rows; the fixed seed (same value as the PAWS-X shuffle) makes
# the saved file identical across reruns.
tapaco_x = tapaco_x.sample(frac=1, random_state=42)

In [None]:
tapaco_x = tapaco_x.reset_index(drop=True)

In [None]:
tapaco_x.to_csv("/content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/datasets/tapaco/tapaco_x.csv", index=None)

# CREATING AND PROCESSING COMBINED DATASET(PAWSX + Tapaco)

## Load datasets

In [None]:
  tapaco_x = pd.read_csv("/content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/datasets/tapaco/tapaco_x.csv")
  PAWS_X = pd.read_csv('/content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/datasets/PAWSX/PAWS_X.csv')

In [None]:
PAWS_X = PAWS_X.rename(columns={'sentence1': 'Text', 'sentence2': 'Paraphrase'})

## Concatenate

In [None]:
dataset_comb = pd.concat([tapaco_x, PAWS_X], ignore_index=True, sort=False)

In [None]:
dataset_comb

Unnamed: 0,Text,Paraphrase,lan
0,¡Dese prisa que perdemos el tren!,¡Apúrate o se nos hace tarde para el tren!,es
1,I squeezed Tom's hand.,I shook Tom's hand.,en
2,Tom didn't know who to believe.,Tom didn't know who he should believe.,en
3,Quemaron un poco de carbón.,Ellos quemaron algún carbón.,es
4,I don't think about you.,I am not thinking of you.,en
...,...,...,...
201527,"El 30 de agosto de 1853, la hija de Stephen Ch...","El 30 de agosto de 1853, la hija de Stephen Ch...",es
201528,Other types of course offered by the Departmen...,Other types of courses offered by the departme...,en
201529,"Algunos tienen efectos negativos, algunos posi...",Algunos tienen efectos negativos y algunos pos...,es
201530,"Con el debilitamiento del dólar canadiense, la...","En noviembre y diciembre de 2015, con el debil...",es


## Shuffling

In [None]:
# Shuffle the merged PAWS-X + TaPaCo rows with a fixed seed so the saved
# dataset is reproducible, then renumber the index from 0.
dataset_comb = dataset_comb.sample(frac=1, random_state=42)
dataset_comb = dataset_comb.reset_index(drop=True)

In [None]:
dataset_comb

Unnamed: 0,Text,Paraphrase,lan
0,El gato se come al pequeño ratón.,El gato se comió al ratón.,es
1,"Fue escrito por Nancy Steen y Neil Thompson, d...","Dirigida por Paul Krasny, escrita por Nancy St...",es
2,Where is the coffee shop?,Where is the cafeteria?,en
3,Everybody was startled.,Everybody was stunned.,en
4,Navegó por Sydney y llegó a Río de Janeiro el ...,Navegó por Sydney y llegó el 14 de septiembre ...,es
...,...,...,...
201527,¿Recuerdas quién soy yo?,¿Me reconoces aún?,es
201528,Are you making him a sandwich?,Are you making her a sandwich?,en
201529,Ross - Bridge is a historic bridge in the city...,Ross Bridge is an historic bridge in the town ...,en
201530,"Sin embargo, Michael Jackson, Prince y Madonna...","Sin embargo, Michael Jackson, Prince y Madonna...",es


## Checking and cleaning NAN

In [None]:
dataset_comb = dataset_comb.dropna()

In [None]:
dataset_comb[dataset_comb['Paraphrase'].map(type) != str]

Unnamed: 0,Text,Paraphrase,lan


In [None]:
dataset_comb.to_csv('/content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/datasets/Combined(PAWSX + Tapaco)/dataset_comb.csv', index= False)

# FINE TUNING DISTILLED GPT2

## GENERATING DISTILGPT2 INPUTS

In [None]:
# Reload the combined dataset from Drive.
# NOTE(review): this path is spelled "working_directory/.../Distilled_GPT2_finetuning"
# (underscores), whereas the cell that writes dataset_comb.csv uses
# "working directory/.../Distilled GPT2 finetuning" (spaces) — confirm which
# directory actually exists before running the notebook top to bottom.
dataset_comb =  pd.read_csv('/content/drive/MyDrive/working_directory/tfm/Distilled_GPT2_finetuning/datasets/Combined(PAWSX + Tapaco)/dataset_comb.csv')

In [None]:
dataset_comb = dataset_comb[dataset_comb['lan'] == 'en']

In [None]:
dataset_comb

Unnamed: 0,Text,Paraphrase,lan
2,Where is the coffee shop?,Where is the cafeteria?,en
3,Everybody was startled.,Everybody was stunned.,en
5,Did Tom have to do that?,Was Tom forced to do that?,en
6,Tom wouldn't talk to me.,Tom doesn't speak to me.,en
7,Time is running out.,Time flew.,en
...,...,...,...
201519,She cottoned to him soon.,She began to like him right away.,en
201521,He tends to be late for meetings.,He's always late to meetings.,en
201525,Only those who believe in the future believe i...,Only in trusting the future is it possible to ...,en
201528,Are you making him a sandwich?,Are you making her a sandwich?,en


In [None]:
# One training string per pair, laid out as
# "<s> text </s> === <p> paraphrase </p>".
source_part = '<s> ' + dataset_comb.Text + ' </s>'
target_part = '<p> ' + dataset_comb.Paraphrase + ' </p>'
dataset_comb['combined'] = source_part + ' === ' + target_part

In [None]:
dataset_comb

Unnamed: 0,Text,Paraphrase,lan,combined
2,Where is the coffee shop?,Where is the cafeteria?,en,<s> Where is the coffee shop? </s> === <p> Whe...
3,Everybody was startled.,Everybody was stunned.,en,<s> Everybody was startled. </s> === <p> Every...
5,Did Tom have to do that?,Was Tom forced to do that?,en,<s> Did Tom have to do that? </s> === <p> Was ...
6,Tom wouldn't talk to me.,Tom doesn't speak to me.,en,<s> Tom wouldn't talk to me. </s> === <p> Tom ...
7,Time is running out.,Time flew.,en,<s> Time is running out. </s> === <p> Time fle...
...,...,...,...,...
201519,She cottoned to him soon.,She began to like him right away.,en,<s> She cottoned to him soon. </s> === <p> She...
201521,He tends to be late for meetings.,He's always late to meetings.,en,<s> He tends to be late for meetings. </s> ===...
201525,Only those who believe in the future believe i...,Only in trusting the future is it possible to ...,en,<s> Only those who believe in the future belie...
201528,Are you making him a sandwich?,Are you making her a sandwich?,en,<s> Are you making him a sandwich? </s> === <p...


In [None]:
# 80% train, then the held-out 20% is halved into test and validation.
# Both splits are seeded so the three partitions are reproducible.
dataset_comb_tr, dataset_comb_test = train_test_split(dataset_comb, test_size=0.2, random_state=42)
dataset_comb_test, dataset_comb_val = train_test_split(dataset_comb_test, test_size=0.5, random_state=42)

In [None]:
# Renumber each partition from 0. Every partition is rebuilt from itself, so
# the validation set stays distinct from the test set.
dataset_comb_tr = dataset_comb_tr.reset_index(drop=True)
dataset_comb_test = dataset_comb_test.reset_index(drop=True)
dataset_comb_val = dataset_comb_val.reset_index(drop=True)

In [None]:
print(len(dataset_comb_tr), len(dataset_comb_test), len(dataset_comb_val))

94431 11804 11804


In [None]:
dataset_comb_tr[['combined','lan']]

In [None]:
# Folder that receives the CSV exports.
csv_export_dir = '/content/drive/MyDrive/working_directory/tfm/Distilled_GPT2_finetuning/datasets/Combined(PAWSX + Tapaco)/distilgpt2_input/'

# Full table first, then the "combined" column of each partition.
dataset_comb.to_csv(csv_export_dir + 'dataset_comb_input.csv', index=False)
for split_suffix, split_frame in (
    ('tr', dataset_comb_tr),
    ('test', dataset_comb_test),
    ('val', dataset_comb_val),
):
    split_frame['combined'].to_csv(
        csv_export_dir + 'dataset_comb_' + split_suffix + '.csv', index=False
    )

In [None]:
# Reduce each partition to its model-input column. Each partition is sliced
# from itself (not from the full dataset_comb), so train, test and validation
# stay disjoint.
dataset_comb_tr = dataset_comb_tr[['combined']]
dataset_comb_test = dataset_comb_test[['combined']]
dataset_comb_val = dataset_comb_val[['combined']]

In [None]:
dataset_comb_tr

Unnamed: 0,combined
2,<s> Where is the coffee shop? </s> === <p> Whe...
3,<s> Everybody was startled. </s> === <p> Every...
5,<s> Did Tom have to do that? </s> === <p> Was ...
6,<s> Tom wouldn't talk to me. </s> === <p> Tom ...
7,<s> Time is running out. </s> === <p> Time fle...
...,...
201519,<s> She cottoned to him soon. </s> === <p> She...
201521,<s> He tends to be late for meetings. </s> ===...
201525,<s> Only those who believe in the future belie...
201528,<s> Are you making him a sandwich? </s> === <p...


In [None]:
# Folder that receives the DistilGPT2 input files.
distilgpt2_input_dir = '/content/drive/MyDrive/working_directory/tfm/Distilled_GPT2_finetuning/datasets/Combined(PAWSX + Tapaco)/distilgpt2_input/'

# The full table is still exported as a tab-separated file.
dataset_comb.to_csv(distilgpt2_input_dir + 'dataset_comb_input.txt', sep='\t', index = False)

# The per-split .txt files are plain-text corpora for the language model, so
# they are written as one raw example per line. Going through to_csv would add
# a "combined" header line and would quote/double every sentence containing a
# double quote, neither of which exists in the prompts built at inference time.
for split_name, split_frame in (
    ('tr', dataset_comb_tr),
    ('test', dataset_comb_test),
    ('val', dataset_comb_val),
):
    split_path = distilgpt2_input_dir + 'dataset_comb_' + split_name + '.txt'
    with open(split_path, 'w', encoding='utf-8') as corpus_file:
        # dropna() skips rows whose text could not be built (missing value).
        for example in split_frame['combined'].dropna():
            corpus_file.write(example + '\n')

## TRAINING DISTILLED GPT2

## With Hugging Face Trainer API

In [None]:
!pip install wandb


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.13.2-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 1.3 MB/s 
[?25hCollecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.9-py3-none-any.whl (9.4 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.9.5-py2.py3-none-any.whl (157 kB)
[K     |████████████████████████████████| 157 kB 11.5 MB/s 
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 13.9 MB/s 
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4

In [None]:
import wandb
from wandb.keras import WandbCallback
wandb.login()


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
%env WANDB_PROJECT=distillgpt2TrainerAPI

env: WANDB_PROJECT=distillgpt2TrainerAPI


In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

model = AutoModelForCausalLM.from_pretrained("distilgpt2")

Downloading config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/336M [00:00<?, ?B/s]

In [None]:
text_path = '/content/drive/MyDrive/working_directory/tfm/Distilled_GPT2_finetuning/datasets/Combined(PAWSX + Tapaco)/distilgpt2_input/'

In [None]:
data_collator = DataCollatorForLanguageModeling (tokenizer=tokenizer, mlm=False)

In [None]:
# Training corpus: dataset_comb_tr.txt tokenized with the DistilGPT2 tokenizer
# and served in blocks of 512 tokens.
train_dataset = TextDataset(
tokenizer=tokenizer,
file_path=text_path+'dataset_comb_tr.txt',
block_size=512)

Token indices sequence length is longer than the specified maximum sequence length for this model (5002966 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Test corpus: dataset_comb_test.txt, same tokenizer and 512-token block size
# as the training corpus.
# NOTE(review): test_dataset is not referenced again in the visible cells.
test_dataset = TextDataset(
tokenizer=tokenizer,
file_path=text_path+'dataset_comb_test.txt',
block_size=512)

In [None]:
# Validation corpus: dataset_comb_val.txt, same tokenizer and 512-token block
# size as the training corpus; handed to the Trainer as eval_dataset.
eval_dataset = TextDataset(
tokenizer=tokenizer,
file_path=text_path+'dataset_comb_val.txt',
block_size=512)

In [None]:
#Run 16 optimizing parameters
# Hyperparameters for this fine-tuning run.
epochs = 5
batch_size = 8

# NOTE(review): output_dir combines "working_directory" (underscore) with
# "Distilled GPT2 finetuning" (spaces), unlike the other paths in this
# section — confirm the intended folder.
# No evaluation-related argument is set here.
training_args =TrainingArguments (
      output_dir='/content/drive/MyDrive/working_directory/tfm/Distilled GPT2 finetuning/output_model',
      num_train_epochs=epochs,
      per_device_train_batch_size=batch_size,
      warmup_steps=500,
      save_steps=2000,   # a checkpoint is written every 2000 optimizer steps
      logging_steps=10,  # the training loss is logged every 10 steps
      learning_rate=1e-5,
      
)

In [None]:
import numpy as np
from datasets import load_metric

metric = load_metric('accuracy')


def compute_metrics(eval_pred):
    """Token-level next-token accuracy for a causal language model.

    Args:
        eval_pred: Pair ``(logits, labels)``. Assumes logits are shaped
            (batch, sequence, vocabulary) and labels (batch, sequence), with
            -100 marking positions to ignore — TODO confirm against the
            Trainer and data collator in use.

    Returns:
        The dict produced by ``metric.compute`` for the accuracy metric.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels)
    # The predicted token is the arg-max over the vocabulary (last) axis.
    predicted_ids = np.argmax(logits, axis=-1)
    # A causal LM predicts token t+1 at position t, so align by dropping the
    # last prediction and the first label.
    predicted_ids = predicted_ids[..., :-1]
    target_ids = labels[..., 1:]
    # Boolean indexing both flattens to 1-D and discards ignored positions.
    scored = target_ids != -100
    return metric.compute(predictions=predicted_ids[scored], references=target_ids[scored])

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:

# Assemble the Trainer from the model, the run arguments, the causal-LM data
# collator and the train/validation corpora.
# NOTE(review): compute_metrics is not passed here, so it is not used by this Trainer.
trainer = Trainer(  # AdamW / weight decay
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_dataset,
eval_dataset=eval_dataset
)

In [None]:
trainer.train()

***** Running training *****
  Num examples = 9771
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6110
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


[34m[1mwandb[0m: Currently logged in as: [33mlxrandom[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,2.5574
20,2.4571
30,2.4521
40,2.423
50,2.3685
60,2.29
70,2.275
80,2.1761
90,2.2052
100,2.1853


Saving model checkpoint to /content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/output_model/checkpoint-2000
Configuration saved in /content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/output_model/checkpoint-2000/config.json
Model weights saved in /content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/output_model/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/output_model/checkpoint-4000
Configuration saved in /content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/output_model/checkpoint-4000/config.json
Model weights saved in /content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/output_model/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to /content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/output_model/checkpoint-6000
Configuration saved in /content/drive/MyDrive/working directory/tfm

TrainOutput(global_step=6110, training_loss=1.5307064488749653, metrics={'train_runtime': 2477.6395, 'train_samples_per_second': 19.718, 'train_steps_per_second': 2.466, 'total_flos': 6382826375086080.0, 'train_loss': 1.5307064488749653, 'epoch': 5.0})

In [None]:
trainer.save_model()

Saving model checkpoint to /content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/output_model
Configuration saved in /content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/output_model/config.json
Model weights saved in /content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/output_model/pytorch_model.bin


In [None]:
# Reload the fine-tuned weights for generation. AutoModelForCausalLM replaces
# the deprecated AutoModelWithLMHead and is the class used earlier to load
# the base model.
# NOTE(review): this folder (".../DistilGPT2_output_model") differs from the
# output_dir given to TrainingArguments — confirm it holds the trained model.
model = AutoModelForCausalLM.from_pretrained('/content/drive/MyDrive/working_directory/tfm/Distilled_GPT2_finetuning/DistilGPT2_output_model')

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [None]:
def clean_paraphrase(input_sentence):
  """Generate a paraphrase for ``input_sentence`` and strip the markup.

  The sentence is wrapped in the "<s> ... </s> === <p> " prompt, passed to the
  text-generation pipeline, and the text between the prompt separator and the
  first " </p>" of the first generated candidate is returned.
  """
  separator = ' </s> === <p> '
  prompt = '<s> ' + input_sentence + separator
  generated_text = generator(prompt)[0]['generated_text']
  # Everything after the separator is model output; cut it at the closing tag.
  after_separator = generated_text.split(separator)[1]
  return after_separator.split(' </p>')[0]

## Testing training with the TensorFlow workflow

In [None]:
%env WANDB_PROJECT=distillgpt2Tensorflow

env: WANDB_PROJECT=distillgpt2Tensorflow


In [None]:
  # Load the train / test / validation tables for the TensorFlow workflow.
  # NOTE(review): these paths ("working directory/.../Distilled GPT2 finetuning",
  # directly under "Combined(PAWSX + Tapaco)/") differ from where the split CSVs
  # are written earlier in this notebook (".../distilgpt2_input/") — confirm the
  # files read here are the intended ones.
  dataset_comb_tr = pd.read_csv('/content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/datasets/Combined(PAWSX + Tapaco)/dataset_comb_tr.csv')
  dataset_comb_test = pd.read_csv('/content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/datasets/Combined(PAWSX + Tapaco)/dataset_comb_test.csv')
  dataset_comb_val = pd.read_csv('/content/drive/MyDrive/working directory/tfm/Distilled GPT2 finetuning/datasets/Combined(PAWSX + Tapaco)/dataset_comb_val.csv')

In [None]:
  dataset_comb_tr[dataset_comb_tr['combined'].map(type) != str]

Unnamed: 0,combined,lan


In [None]:
MODEL_CHECKPOINT='distilgpt2'

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = TFAutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
BATCH_SIZE = 8
NUM_EPOCHS = 5

tokenizer.pad_token = tokenizer.eos_token

In [None]:
dataset_comb_tr = Dataset.from_pandas(dataset_comb_tr)
dataset_comb_test = Dataset.from_pandas(dataset_comb_test)
dataset_comb_val = Dataset.from_pandas(dataset_comb_val)

In [None]:
dataset_comb_test

Dataset({
    features: ['combined', 'lan'],
    num_rows: 20153
})

In [None]:
def tokenize_function(example):
    """Tokenize the ``combined`` field, truncating to at most 512 tokens."""
    texts = example['combined']
    # 512 is the same value as the block size used for TextDataset.
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

In [None]:
tokenized_train_dataset =   dataset_comb_tr.map(tokenize_function, batched=True)
tokenized_test_dataset =   dataset_comb_test.map(tokenize_function, batched=True)
tokenized_val_dataset =  dataset_comb_val.map(tokenize_function, batched=True)

  0%|          | 0/162 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?ba/s]

  0%|          | 0/21 [00:00<?, ?ba/s]

In [None]:
tokenized_train_dataset

Dataset({
    features: ['combined', 'lan', 'input_ids', 'attention_mask'],
    num_rows: 161225
})

In [None]:
data_collator =  DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False,  return_tensors="tf")

In [None]:
def make_tf_dataset(tokenized_dataset, shuffle):
    """Wrap a tokenized Dataset as a batched tf.data.Dataset.

    'labels' is not a column of the tokenized dataset; it is listed because
    the language-modeling collator creates it for every batch.

    Args:
        tokenized_dataset: Dataset that has 'input_ids' and 'attention_mask'.
        shuffle: Whether the examples are shuffled.

    Returns:
        Batches of BATCH_SIZE examples collated by ``data_collator``; a final
        incomplete batch is dropped.
    """
    return tokenized_dataset.to_tf_dataset(
        columns=['input_ids', 'attention_mask', 'labels'],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=BATCH_SIZE,
        drop_remainder=True,
    )


# Each tf dataset is built from its own partition, so validation and test
# batches never come from the training data. Only training needs shuffling.
train_ds = make_tf_dataset(tokenized_train_dataset, shuffle=True)
eval_ds = make_tf_dataset(tokenized_val_dataset, shuffle=False)
test_ds = make_tf_dataset(tokenized_test_dataset, shuffle=False)

In [None]:
# Pull a single batch for inspection; take(1) avoids materialising every
# training batch in memory just to look at the first one.
trainlist = list(train_ds.take(1).as_numpy_iterator())

In [None]:
input_ids,attencion_mask = trainlist[0]

In [None]:
model(*trainlist[0][0])

In [None]:
import os
import psutil
import tensorflow as tf
from itertools import chain
from datasets import load_dataset
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from transformers import AutoTokenizer, TFAutoModelForCausalLM, DataCollatorWithPadding

# train_ds is already batched, so its length is the number of optimizer steps
# in one epoch.
num_train_steps = len(train_ds) * NUM_EPOCHS
# Learning rate goes from 5e-5 to 0 over the whole run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,
    end_learning_rate=0.0,
    decay_steps=num_train_steps)

In [None]:
opt = Adam(learning_rate=lr_scheduler)

In [None]:
# Compile with the Adam optimizer built above and an unreduced
# sparse categorical cross-entropy computed from logits.
model.compile(  # Adam with polynomial decay of lr
	optimizer=opt,
	loss=SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE),
)

#tf.keras.mixed_precision.set_global_policy("mixed_float16")

In [None]:
# train_ds is already batched and finite, so Keras runs one full pass per
# epoch without an explicit steps_per_epoch; this matches num_train_steps,
# which counts len(train_ds) steps per epoch.
model.fit(train_ds, validation_data=eval_ds, epochs=NUM_EPOCHS)

In [None]:
# create_optimizer is not part of the notebook's import cell, so bring it
# into scope here.
from transformers import create_optimizer

# Optimizer with 1,000 warm-up steps, a peak learning rate of 5e-5 and a
# weight-decay rate of 0.01, scheduled over num_train_steps.
optimizer, schedule = create_optimizer(
    init_lr=5e-5,
    num_warmup_steps=1_000,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss is passed to compile here.
model.compile(optimizer=optimizer)


In [None]:
model.fit(train_ds, validation_data=eval_ds)