In [None]:
# Mount Google Drive; later cells load and save models under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Set before torch is imported further down. CUDA_LAUNCH_BLOCKING=1 is the CUDA
# debugging switch for synchronous kernel launches.
# NOTE(review): expected to slow training; confirm it is still needed.
os.environ['CUDA_LAUNCH_BLOCKING']="1"

In [None]:
# Install the Hugging Face stack (versions are not pinned). The recorded run
# below resolved transformers 4.30.1.
# NOTE(review): consider pinning versions so a fresh run matches the recorded one.
! pip install transformers
! pip install --upgrade accelerate
! pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.1-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m

In [None]:
import pandas as pd
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Preparing dataset

## Setting up raw SQuAD

In [None]:
from datasets import load_dataset, list_datasets

# Load the SQuAD dataset; the recorded output shows a train and a validation split.
# NOTE(review): list_datasets is not used in the visible cells, and newer
# `datasets` releases may no longer export it — confirm and drop it if so.
dataset = load_dataset('squad')

Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Extract paragraphs and associated question-answer pairs from the dataset

def split_para_qa(dataset, type):
  """Collect contexts and (question, first answer) pairs from one split.

  Args:
    dataset: Mapping from split name to an iterable of examples; each example
      is indexed with 'context', 'question' and ['answers']['text'].
    type: Key of the split to read (the notebook passes 'train' and
      'validation').

  Returns:
    A pair (para, qa): the list of context strings and the list of
    (question, first answer text) tuples, both in example order.
  """
  contexts = []
  question_answer_pairs = []
  for record in dataset[type]:
    contexts.append(record['context'])
    # Only the first listed answer text is kept for each question.
    question_answer_pairs.append((record['question'], record['answers']['text'][0]))

  return contexts, question_answer_pairs

In [None]:
# Flatten both SQuAD splits into parallel paragraph and (question, answer) lists.
train_paragraphs, train_qa_pairs = split_para_qa(dataset, 'train')
valid_paragraphs, valid_qa_pairs = split_para_qa(dataset, 'validation')

In [None]:
def split_qa(qa):
  """Separate (question, answer) pairs into two parallel lists.

  Args:
    qa: Iterable of 2-item (question, answer) pairs.

  Returns:
    A pair (questions, answers) of lists in the same order as `qa`.
  """
  # Materialize once so a one-shot iterable is only consumed a single time.
  pairs = [(question, answer) for question, answer in qa]
  questions = [pair[0] for pair in pairs]
  answers = [pair[1] for pair in pairs]
  return questions, answers

In [None]:
# Unzip the (question, answer) tuples of each split into separate lists.
train_questions, train_answers = split_qa(train_qa_pairs)
valid_questions, valid_answers = split_qa(valid_qa_pairs)

In [None]:
# One row per training question: its paragraph, the question, and the first answer.
train_df = pd.DataFrame({
   "paragraph" : train_paragraphs,
   "question" : train_questions,
   "answer" : train_answers
})

In [None]:
# Spot-check one paragraph from the training frame.
train_df.paragraph[1]

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [None]:
# Same layout as train_df, built from the SQuAD validation split.
valid_df = pd.DataFrame({
   "paragraph" : valid_paragraphs,
   "question" : valid_questions,
   "answer" : valid_answers
})

In [None]:
# Limiting the length of answers (max = 2 words)

def limit_answer_length(dataset, max_length=3):
  """Keep only rows whose answer has fewer than `max_length` words.

  Args:
    dataset: DataFrame with a string column named 'answer'.
    max_length: Exclusive upper bound on the whitespace-separated word count
      (the default of 3 keeps answers of at most two words).

  Returns:
    A new DataFrame with the matching rows and a fresh 0..n-1 index.
  """
  word_counts = dataset['answer'].str.split().str.len()
  is_short_enough = word_counts < max_length
  return dataset[is_short_enough].reset_index(drop=True)

In [None]:
# Keep only rows whose answer has at most two words (word count < 3).
# NOTE(review): altrered_valid_df is not referenced again in the visible cells.
altrered_train_df = limit_answer_length(train_df, max_length = 3)
altrered_valid_df = limit_answer_length(valid_df, max_length = 3)

## Setting up custom dataset

In [None]:
# altrered_train_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/squad_summarized_qa.csv")

In [None]:
# altrered_train_df = altrered_train_df.drop('Unnamed: 0', axis=1)
# altrered_train_df = altrered_train_df.reset_index(drop=True)

In [None]:
# from sklearn.utils import shuffle

# altrered_train_df = shuffle(altrered_train_df)
# altrered_train_df = altrered_train_df.reset_index(drop=True)

In [None]:
# Preview the frame that feeds the train/validation/test split below.
# NOTE(review): the recorded output does not look like raw SQuAD rows; it may
# have come from the commented-out custom CSV load above — confirm which source
# is intended so a fresh Run All reproduces it.
altrered_train_df.head()

Unnamed: 0,paragraph,question,answer
0,"In 1848, chopin's popularity as a virtuoso beg...",When was Chopin's last concert?,1849
1,He crashed the stage and grabbed the microphon...,Who cancelled West's tour?,Gaga
2,"Over 200,000 were living in new york by 1860, ...",What caused an influx of immigrants?,Famine
3,"On 12 march 1999, the czech republic, hungary,...",When did Albania join U.N.?,2009
4,The 2010 human development report by the unite...,When was the report released?,November 4th


In [None]:
def split_train_test(dataset, training_set_size = 0.8):
  """Cut a dataset into a leading and a trailing part, without shuffling.

  Args:
    dataset: Sliceable object with `reset_index` (the notebook passes a
      DataFrame).
    training_set_size: Fraction of rows assigned to the first part; the cut
      position is int(training_set_size * len(dataset)).

  Returns:
    A pair (train_set, test_set), each with a fresh 0..n-1 index.
  """
  cut = int(training_set_size * len(dataset))
  leading_rows, trailing_rows = dataset[:cut], dataset[cut:]
  return leading_rows.reset_index(drop=True), trailing_rows.reset_index(drop=True)

In [None]:
# Sequential (unshuffled) splits: the last 10% of rows becomes the validation
# set, then the last 10% of the remaining rows becomes the test set.
training_set, validating_set = split_train_test(altrered_train_df, training_set_size = 0.9)
training_set, testing_set = split_train_test(training_set, training_set_size = 0.9)

In [None]:
# Print the sizes of the training, validation, and testing splits
print("Training set size:", len(training_set))
print("Validating set size:", len(validating_set))
print("Testing set size:", len(testing_set))

Training set size: 6101
Validating set size: 754
Testing set size: 678


In [None]:
# Check column dtypes and non-null counts of the training split.
training_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6101 entries, 0 to 6100
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   paragraph  6101 non-null   object
 1   question   6101 non-null   object
 2   answer     6101 non-null   object
dtypes: object(3)
memory usage: 143.1+ KB


In [None]:
# Preview the first rows of the held-out test split.
testing_set.head(15)

Unnamed: 0,paragraph,question,answer
0,"According to the Boston Herald, dated July 23,...",How expensive was Kerry's yacht?,$7 million
1,"According to the Boston Herald, dated July 23,...",Where was Kerry's yacht built?,New Zealand
2,"According to the Boston Herald, dated July 23,...",How much sales tax did Kerry owe on the yacht?,"$437,500"
3,"According to the Boston Herald, dated July 23,...",Which state received Kerry's yacht's sales tax?,Massachusetts
4,Rajasthan (/ˈrɑːdʒəstæn/ Hindustani pronunciat...,How many square kilometers is Rajasthan?,342239
5,Rajasthan (/ˈrɑːdʒəstæn/ Hindustani pronunciat...,What percentage of India's total area is Rajas...,10.4%
6,Rajasthan (/ˈrɑːdʒəstæn/ Hindustani pronunciat...,What province in Pakistan does Rajasthan borde...,Punjab
7,Rajasthan (/ˈrɑːdʒəstæn/ Hindustani pronunciat...,The Dilwara Temples are a pilgrimage site for ...,Jain
8,"The first mention of the name ""Rajasthan"" appe...",When was the first usage of the word Rajasthan?,1829
9,"The first mention of the name ""Rajasthan"" appe...",What is another name for the Rajasthan region?,Rajputana


In [None]:
def para_qa_visualization(df, index):
  """Print the paragraph, question and answer stored under one index label.

  Args:
    df: Object exposing `paragraph`, `question` and `answer` attributes that
      can each be indexed with `index` (the notebook passes a DataFrame).
    index: Label of the row to show.
  """
  labelled_columns = (
    ("Paragraph : ", "paragraph"),
    ("Question : ", "question"),
    ("Answer : ", "answer"),
  )
  for label, column in labelled_columns:
    print(label, getattr(df, column)[index])

In [None]:
# Print row 25 of the training split as a sanity check.
para_qa_visualization(training_set, 25)

Paragraph :  In 2015-2016, Notre Dame ranked 18th overall among "national universities" in the United States in U.S. News & World Report's Best Colleges 2016. In 2014, USA Today ranked Notre Dame 10th overall for American universities based on data from College Factual. Forbes.com's America's Best Colleges ranks Notre Dame 13th among colleges in the United States in 2015, 8th among Research Universities, and 1st in the Midwest. U.S. News & World Report also lists Notre Dame Law School as 22nd overall. BusinessWeek ranks Mendoza College of Business undergraduate school as 1st overall. It ranks the MBA program as 20th overall. The Philosophical Gourmet Report ranks Notre Dame's graduate philosophy program as 15th nationally, while ARCHITECT Magazine ranked the undergraduate architecture program as 12th nationally. Additionally, the study abroad program ranks sixth in highest participation percentage in the nation, with 57.6% of students choosing to study abroad in 17 countries. According

In [None]:
def training_prompt(dataset, order = 'pqa'):
  """Wrap each row in the tagged prompt format used for fine-tuning.

  Every row becomes one string of the form
  "<START> <CONTEXT_START> ... <CONTEXT_END> ... <END>", where the question
  and answer segments follow the context in the requested order.

  Args:
    dataset: Object with `paragraph`, `question` and `answer` columns of
      strings (the notebook passes a DataFrame).
    order: 'pqa' for paragraph-question-answer or 'paq' for
      paragraph-answer-question.

  Returns:
    A DataFrame with a single "syntax" column holding one prompt per row.

  Raises:
    ValueError: If `order` is neither 'pqa' nor 'paq'. Previously an unknown
      order silently produced an empty frame.
  """
  if order not in ('pqa', 'paq'):
    raise ValueError("order must be 'pqa' or 'paq', got " + repr(order))

  prompts = []
  for paragraph, question, answer in zip(dataset.paragraph, dataset.question, dataset.answer):
    context_part = "<CONTEXT_START> " + paragraph + " <CONTEXT_END>"
    question_part = "<QUESTION_START> " + question + " <QUESTION_END>"
    answer_part = "<ANSWER_START> " + answer + " <ANSWER_END>"

    if order == 'pqa':
      ordered_tail = question_part + " " + answer_part
    else:
      ordered_tail = answer_part + " " + question_part

    prompts.append("<START> " + context_part + " " + ordered_tail + " <END>")

  return pd.DataFrame({
   "syntax" : prompts
  })

In [None]:
# pqa : paragraph-question-answer | paq : paragraph-answer-question
# Build the prompt frames for the training and validation splits in 'paq' order.
training = training_prompt(training_set, order = 'paq')
validating = training_prompt(validating_set, order = 'paq')

In [None]:
# Inspect one formatted prompt.
training.iloc[15, 0]

"<START> <CONTEXT_START> The season six premiere drew a massive audience of 37.3 million viewers. The episode topped out in the last half hour with more than 41 million people watching. Cnn.com's john sutter reports that the premiere was the most watched episode of the show in seven years. Click here to watch the full season 6 premiere. <CONTEXT_END> <ANSWER_START>  37 <ANSWER_END> <QUESTION_START>  How many viewers watched the season 6 premiere? <QUESTION_END> <END>"

In [None]:
# # Reducing dataset

# percentage = 0.25  # 25% of the rows

# num_rows = int(len(qa_df_train) * percentage)
# qa_df_train = qa_df_train.sample(n=num_rows)

# num_rows = int(len(qa_df_test) * percentage)
# qa_df_test = qa_df_test.sample(n=num_rows)

In [None]:
# The training/validation files are later read by TextDataset as raw text, so
# write the prompts one per line instead of as CSV. DataFrame.to_csv would add
# an index column, a "syntax" header row and CSV quote escaping to the text the
# model is trained on. The ".csv" file names are kept because later cells open
# files with these names.
for prompt_frame, prompt_path in ((training, "training.csv"), (validating, "validating.csv")):
  with open(prompt_path, "w", encoding="utf-8") as prompt_file:
    prompt_file.write("\n".join(prompt_frame["syntax"]) + "\n")

# The test split stays a regular CSV with its tabular columns.
testing_set.to_csv("testing_set.csv")

# Fine-tuning GPT Neo

In [None]:
model_name = "EleutherAI/gpt-neo-125M"

# NOTE(review): the weights are loaded from this Drive directory, not from
# model_name — presumably an earlier fine-tuning run; a fresh run needs that
# directory to exist. output_dir is reassigned to a different path before training.
output_dir = "/content/drive/MyDrive/tf finetune v1"

model = GPTNeoForCausalLM.from_pretrained(output_dir)
# The tokenizer is loaded from the base checkpoint named in model_name.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

In [None]:
# Build the training dataset from the exported prompt file, using block_size=128.
dataset_train = TextDataset(
    tokenizer=tokenizer,
    file_path="/content/training.csv",
    block_size=128,  # Set an appropriate block size
)



In [None]:
# Build the validation dataset the same way from the exported validation prompts.
dataset_valid = TextDataset(
    tokenizer=tokenizer,
    file_path="/content/validating.csv",
    block_size=128,  # Set an appropriate block size
)

In [None]:
# torch is already imported in the import cell near the top; this re-import is redundant.
import torch
# Release cached GPU memory before training starts.
torch.cuda.empty_cache()

In [None]:
# Drive is already mounted by the first cell; the recorded output confirms this
# call found it mounted.
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Root folder for this run's checkpoints and logs. This rebinds output_dir,
# which earlier held the path the model weights were loaded from.
output_dir = "/content/drive/MyDrive/SQuAD_summary_old_paq_GPT_Neo_finetuned/"

In [None]:
# Training configuration; checkpoints and logs go under output_dir.
# NOTE(review): no evaluation strategy is set here, and the recorded training
# log shows only a training-loss column — confirm whether the eval_dataset
# passed to the Trainer is meant to be evaluated during training.
training_args = TrainingArguments(
    output_dir = output_dir + "results/",
    num_train_epochs=10,  # Set an appropriate number of epochs
    per_device_train_batch_size=4,  # Set an appropriate batch size
    save_steps=10000,  # the recorded run finished at global_step=16320
    save_total_limit=2,
    overwrite_output_dir=True,
    learning_rate=1e-4,
    logging_dir= output_dir + "logs/",
    logging_steps=100,  # matches the 100-step cadence of the recorded loss table
    save_strategy="steps",
)

In [None]:
# mlm=False: batches are collated for causal language modeling, not masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [None]:
# Wire the model, arguments, datasets and collator into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset = dataset_valid,
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning; the recorded run took about 3636 s (train_runtime) for 10 epochs.
trainer.train()

Step,Training Loss
100,2.3051
200,2.321
300,2.2649
400,2.1673
500,2.1002
600,2.0243
700,1.969
800,1.9224
900,1.8607
1000,1.8287


Step,Training Loss
100,2.3051
200,2.321
300,2.2649
400,2.1673
500,2.1002
600,2.0243
700,1.969
800,1.9224
900,1.8607
1000,1.8287


TrainOutput(global_step=16320, training_loss=0.4671348483685185, metrics={'train_runtime': 3635.8068, 'train_samples_per_second': 17.955, 'train_steps_per_second': 4.489, 'total_flos': 4262899828654080.0, 'train_loss': 0.4671348483685185, 'epoch': 10.0})

In [None]:
# Saving model weights
# The destination is a separate Drive folder, not the output_dir used for
# checkpoints and logs. Only the model is saved here, not the tokenizer.
model.save_pretrained("/content/drive/MyDrive/SQuAD_summary_paq_GPT_Neo_finetuned_done")