In [None]:
# Upper bound handed to `model.generate(max_length=...)` inside `get_answer`.
output_max_length = 1024

In [None]:
import os

from google.colab import drive

# Mount Google Drive inside the Colab VM, then make the project folder the
# working directory so relative paths (e.g. "QA_pairs.csv") resolve against it.
drive.mount("/content/drive/")
os.chdir("/content/drive/MyDrive/TEST")

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import pandas as pd
import re

In [None]:
# Load the question/answer table from the current working directory.
# NOTE(review): the frame displayed at the end of the notebook carries an
# "Unnamed: 0" column, which usually means an earlier save wrote the index
# into the file -- confirm and drop it if it is not wanted.
df_QA = pd.read_csv(filepath_or_buffer="QA_pairs.csv")

In [None]:
def clean_text(text):
    """Normalise a raw cell value into lowercase, single-spaced text.

    Values that ``pd.isna`` reports as missing become an empty string. Any
    other non-string value is converted with ``str`` first. The text is then
    lower-cased, stripped at both ends, and every run of whitespace is
    collapsed into one space.
    """
    if pd.isna(text):
        return ""
    raw = text if isinstance(text, str) else str(text)
    lowered = raw.lower().strip()
    return re.sub(r'\s+', " ", lowered)
def df_prepare_dataset(df):
    """Clean the question/answer columns and drop rows left empty.

    Both "Question" and "Dataset_Answer" are passed through ``clean_text``
    and written back onto ``df`` itself, so the caller's frame is modified.
    The return value is the subset of rows where neither cleaned column is
    an empty string.
    """
    # Compute both cleaned columns before assigning either one, so a missing
    # column raises before the frame has been touched.
    cleaned_questions = df["Question"].apply(clean_text)
    cleaned_answers = df["Dataset_Answer"].apply(clean_text)
    df["Question"] = cleaned_questions
    df["Dataset_Answer"] = cleaned_answers

    has_question = df["Question"] != ""
    has_answer = df["Dataset_Answer"] != ""
    return df[has_question & has_answer]
def clean_generated(generated):
    """Tidy raw model output into sentence-cased, single-spaced text.

    Steps, in order:
      1. Delete backslashes and characters in the ranges \\x0f-\\x1f and
         \\x7f-\\xff.
      2. Collapse every whitespace run (including newlines) into one space
         and strip both ends.
      3. ``str.capitalize`` the result, which also lower-cases everything
         after the first character.
      4. Shorten runs of four or more dots to "...".
      5. Insert a space after a dot that is directly followed by a letter.
      6. Upper-case the first word character of the text and of every
         sentence (a word character preceded by ".", "?" or "!" and one
         whitespace character).

    Args:
        generated: The decoded model output (a ``str``).

    Returns:
        The cleaned string.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so it
    # must not appear here or every "|" in the text would be deleted.
    # "\n" is deliberately NOT in the class: deleting it would glue the last
    # word of one line to the first word of the next. The whitespace collapse
    # below turns it into a single space instead.
    temp = re.sub(r'[\\\x0f-\x1f\x7f-\xff]', "", generated)
    # Strip before capitalising so leading whitespace cannot stop the first
    # letter from being upper-cased.
    temp = re.sub(r'\s+', " ", temp).strip().capitalize()
    temp = re.sub(r'\.{4,}', '...', temp)
    temp = re.sub(r'\.([a-zA-Z])', r'. \1', temp)
    temp = re.sub(r'(?<=[.?!]\s)(\w)|^(\w)', lambda m: m.group().upper(), temp)
    return temp

In [None]:
def get_answer(question, model):
    """Sample one answer for ``question`` from ``model`` and clean it up.

    The prompt is ``"question: <cleaned question> answer: "``. The function
    relies on the module-level ``tokenizer``, ``output_max_length``,
    ``clean_text`` and ``clean_generated``.

    Args:
        question: The raw question; it is normalised with ``clean_text``.
        model: A causal language model exposing ``parameters()`` and
            ``generate()``. Inputs are moved to the device of its first
            parameter.

    Returns:
        The generated continuation after the prompt, passed through
        ``clean_generated``.
    """
    device = next(model.parameters()).device
    # One prompt per call, so no padding is requested from the tokenizer.
    prompt_inputs = tokenizer(
        f"question: {clean_text(question)} answer: ",
        return_tensors="pt",
    ).to(device)
    output = model.generate(
        **prompt_inputs,
        do_sample = True, temperature = 0.7, top_p = 0.9,
        # Let the model stop naturally or when max_length is reached.
        max_length = output_max_length, num_return_sequences = 1, no_repeat_ngram_size = 2,
        # Passing this explicitly avoids the "Setting `pad_token_id` to
        # `eos_token_id`" notice that is otherwise emitted on every call.
        pad_token_id = tokenizer.eos_token_id,
    )
    # For a causal LM the returned sequence is the prompt tokens followed by
    # the continuation. Slice the prompt off by length instead of splitting
    # the decoded text on "answer:", which would drop part of the answer
    # whenever the model itself emits "answer:".
    prompt_length = prompt_inputs["input_ids"].shape[1]
    generated = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True).strip()
    return clean_generated(generated)

In [None]:
from transformers import AutoModelForCausalLM, GPT2TokenizerFast

# Baseline: the stock "gpt2" checkpoint. `tokenizer` is a module-level name
# that `get_answer` reads, so it must be bound before generating.
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Iterate over the actual "Question" column rather than a hard-coded
# range(100): the answer list then always has exactly len(df_QA) entries, so
# the column assignment below cannot fail on a length mismatch and a shorter
# frame cannot raise IndexError.
Baseline_GPT2 = [get_answer(question, model) for question in df_QA["Question"]]
df_QA["GPT2_Answer"] = Baseline_GPT2

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [None]:
# Fine-tuned model: load the checkpoint saved on Drive. `tokenizer` is
# rebound as well because `get_answer` reads the module-level name.
model = AutoModelForCausalLM.from_pretrained("/content/drive/MyDrive/TEST/MODEL/train_gpt2")
tokenizer = GPT2TokenizerFast.from_pretrained("/content/drive/MyDrive/TEST/MODEL/train_gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Iterate over the actual "Question" column rather than a hard-coded
# range(100): the answer list then always has exactly len(df_QA) entries, so
# the column assignment below cannot fail on a length mismatch and a shorter
# frame cannot raise IndexError.
Finetuned_GPT2 = [get_answer(question, model) for question in df_QA["Question"]]
df_QA["GPT2_Finetuned"] = Finetuned_GPT2

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [None]:
# Write the frame, now including the generated-answer columns, back over the
# file it was loaded from (row index omitted), then display it.
df_QA.to_csv(path_or_buf="QA_pairs.csv", index=False)
df_QA

Unnamed: 0,Question,Dataset_Answer,LLAMA_Answer,LLAMA_Finetuned,GPT2_Answer,GPT2_Finetuned
0,"Why is the ""Black Death"" mentioned so often in...",I have an unpublished paper on this subject th...,"The term ""Black Death"" is often mentioned in W...",The Black Death is a very famous event in Euro...,There is no black death in chinese history at ...,~~chinese people didn't kill the black people....
1,How historically accurate is the show The Last...,Specific follow up - Does it make any sense at...,"The Last Kingdom, a historical drama series ba...",I've watched the first season of the show and ...,I think the first season is much more accurate...,"Ianah, i thought you were just getting downvot..."
2,If there was a lack of food supply in the worl...,> vitamins and a high concentration of calorie...,While it's theoretically possible to survive o...,There's a lot of food out there. You just need...,"Yes, but the food is also a problem, and it is...","Ianas, the problem with food shortages is that..."
3,"If our DNA varies from person to person, how d...",It's true that if one were to be calculating t...,Humans and orangutans share a significant amou...,"Humans have 23 pairs of chromosomes, and each ...",ㅇ㇠ㄅㆅ▶ leader 03/06/15 (tue) 06:02:31 id: 8d0d3...,"~~this is not an answer to your question, but ..."
4,Why would a company spend millions on a long S...,"It's a quick way to build buzz. I mean, here w...",There are several reasons why a company might ...,"If you want to get a lot of attention, you wan...","Well, not really. It's the nature of the beast...",Ive worked for a large company and have notice...
...,...,...,...,...,...,...
95,The difference between a learning disability a...,Intellectual disability - basically I'm not sm...,The main differences between a learning disabi...,"A learning disability is a learning disorder, ...",This question is a bit of a mystery. I can't r...,~~learning disability~~ intellectual disabilit...
96,Why do I get headaches from 3D movies?,"Order some [2-D Glasses](_URL_1_). Yeah, they'...",There are several reasons why you might experi...,The reason why you get headaches from 3D movie...,I just want to know what's wrong with my eyes ...,Ive heard that the motion sickness is caused b...
97,Why do people have different skin colours?,Different skin colours help in different clima...,People have different skin colors due to a com...,Humans are a lot like plants. We have a lot of...,"I'm not sure what it's about, but it seems to ...",~~some people are born with darker skin color ...
98,"How did girlscout cookies get to be so good, a...",Keebler makes them. They are plenty successful...,Girl Scout cookies have been a beloved treat f...,The Girl Scouts have a strict recipe that they...,"They don, but not all companies make cookies. ...",Ian kershaw has written a lot on this subject ...
