In [None]:
# import the packages.
import pandas as pd
from pathlib import Path
import string
import re
import nltk
import spacy
from textblob import TextBlob

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Build the path to the raw cleantech media CSV on the mounted drive.
data_folder = Path("/content/drive/MyDrive/HSLU/CTA/")
cleantech = data_folder / "cleantech_media_dataset_v1_20231109.csv"
# Load the CSV into a data frame and show its first rows.
df3 = pd.read_csv(cleantech)
df3.head()

In [None]:
# Strip HTML tags from the article text.
# The vectorised .str accessor leaves missing values (NaN) untouched, whereas
# calling re.sub on a NaN float inside .apply raises TypeError.
df3['content'] = df3['content'].str.replace(r'<[^<]+?>', '', regex=True)

In [None]:
# Keep only the first occurrence of every distinct article body.
df3 = df3.drop_duplicates(subset='content', keep='first')

In [None]:
# Collapse every run of whitespace (newlines, tabs, repeated spaces) into one space.
# The pattern is a raw string: '\s' in a normal string literal is an invalid
# escape sequence. The .str accessor also leaves missing values untouched.
df3['content'] = df3['content'].str.replace(r'\s+', ' ', regex=True)

In [None]:
# Drop the columns that are not used in the rest of the notebook.
df3 = df3.drop(columns=['Unnamed: 0', 'author', 'domain', 'url'])


In [None]:
# Summarise the cleaned frame (columns, non-null counts, dtypes).
df3.info()

In [None]:
# Preview the first rows after cleaning.
df3.head()

#### Use the BERT Extractive Summarizer to extract key sentences

In [None]:
!pip install bert-extractive-summarizer

In [None]:
# Extract the key sentences of one sample article with the BERT extractive summarizer.
from summarizer import Summarizer
model = Summarizer()
# Positional access (.iloc): after drop_duplicates the index labels are no longer
# guaranteed to be contiguous, so the label lookup [1] can raise KeyError.
result = model(df3['content'].iloc[1], min_length=60)
result

In [None]:
# Show the full text of the second remaining article. Positional access is used
# because index labels may have gaps after drop_duplicates.
df3['content'].iloc[1]


In [None]:
# Number of articles (taken from the end of the frame) to summarise.
n = 100

# .copy() gives an independent frame; adding a column to a plain slice of df3
# triggers pandas' SettingWithCopyWarning and may not behave as intended.
df_last100 = df3.tail(n).copy()

# Extract the key sentences of every selected article.
df_last100['key_sentences'] = df_last100['content'].apply(lambda x: model(x, min_length=30))

print(f'the last {n} key sentense finished!')

# Persist this stage so the later sections can start from the CSV.
df_last100.to_csv(data_folder / "last100.csv")

In [None]:
# Show the article text stored under index label 9602.
# NOTE(review): this is a label lookup; it assumes label 9602 is among the selected rows.
df_last100['content'][9602]

In [None]:
# Show the extracted key sentences for the same index label (9602).
df_last100['key_sentences'][9602]

## Generate Questions and Answers

In [None]:
import pandas as pd

# Reload the summarisation stage from the CSV written earlier.
df_last100 = pd.read_csv('/content/drive/MyDrive/HSLU/CTA/last100.csv')

# Keep only the columns that are still needed and display the result.
df_last100 = df_last100.drop(columns=['Unnamed: 0', 'title', 'date'])
df_last100

### Use a Transformer model to generate questions


In [None]:
!pip install --no-cache-dir transformers sentencepiece

In [None]:
# Download NLTK's 'punkt' tokenizer data via the nltk.downloader command-line module.
!python -m nltk.downloader punkt

In [None]:
# !wget https://github.com/xuexi21/CTA/blob/main/qa_generator.zip

In [None]:
# !ls


In [None]:
# Unpack the question-generation code from Drive, extracting relative to the filesystem root.
# NOTE(review): the next cell changes into /content/question_generation, so the archive
# presumably contains that path — confirm.
!unzip /content/drive/MyDrive/HSLU/CTA/qa_generator.zip -d /

In [None]:
# Change the working directory so that `from pipelines import pipeline` in the next cell resolves.
%cd /content/question_generation

In [None]:
from pipelines import pipeline

In [None]:

# # Ensure that the GPU is available
# device = "cuda" if torch.cuda.is_available() else "cpu"

# print(f"Using device: {device}")

In [None]:
# Build the question-generation pipeline from the valhalla/t5-base-qg-hl checkpoint.
# Model card: https://huggingface.co/valhalla/t5-base-qg-hl
qa_generator = pipeline("question-generation", model="valhalla/t5-base-qg-hl")

In [None]:
# Collect the generated question/answer output of every summary, in row order.
gen_text_list = []

# The running position is printed as a simple progress indicator.
for position, summary_text in enumerate(df_last100['key_sentences']):
  print(position)
  generated_pairs = qa_generator(summary_text)
  gen_text_list.append(generated_pairs)




In [None]:

# Store the generated question/answer output next to its summary as a new 'QA' column.
df_last100['QA'] = gen_text_list

In [None]:
# Index label of the row to inspect.
n = 16

# Look up the summary and its generated QA output for that row.
selected_summary = df_last100['key_sentences'][n]
selected_qa = df_last100['QA'][n]

# Print the key sentences first, then the generated question/answer output.
print(selected_summary)
print(selected_qa)

In [None]:
# Persist the frame (content, key sentences, generated QA) so the clean-up
# section can start from this CSV.
data_folder = Path("/content/drive/MyDrive/HSLU/CTA/")
df_last100.to_csv(data_folder / "last100_QA.csv")

## Manually clean up the generated question-answer pairs to create a high-quality QA dataset.

In [None]:
# load the packages
import pandas as pd
from pathlib import Path
import re
import ast # convert the string to a list using the ast module


In [None]:
# Mount Google Drive again so this section can be run on its own.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the stage output written by the question-generation section.
data_folder = Path("/content/drive/MyDrive/HSLU/CTA/")
df_qa = pd.read_csv(data_folder / "last100_QA.csv")
# The unnamed first column is the index written by to_csv; give it and the
# summary column clearer names.
df_qa = df_qa.rename(columns={"Unnamed: 0": "idx", "key_sentences": "summary"})
# Select the needed columns by name rather than by position, so the selection
# keeps working if the column order of the CSV ever changes.
df_qa = df_qa[["idx", "summary", "QA"]]

In [None]:
# List the column labels that remain after the selection above.
df_qa.keys()


In [None]:
# Flatten the generated QA lists into one row per (summary, question, answer).
qa_records = []
for summary, qa_repr in zip(df_qa['summary'], df_qa['QA']):
    # The CSV round-trip stored each list of {'question': ..., 'answer': ...}
    # dicts as its string representation; literal_eval turns it back into objects.
    for qa_pair in ast.literal_eval(qa_repr):
        # Pairs whose answer contains a single quote are discarded.
        if "'" in qa_pair['answer']:
            continue
        qa_records.append(
            {"summary": summary, "Q": qa_pair['question'], "A": qa_pair['answer']}
        )

# Build the frame once from the collected records: DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call.
df_QA_clean = pd.DataFrame(qa_records, columns=['summary', 'Q', 'A'])



In [None]:
# Number of question/answer pairs that survived the clean-up.
len(df_QA_clean)

In [None]:
# Persist the cleaned QA pairs; the fine-tuning section loads this file.
data_folder = Path("/content/drive/MyDrive/HSLU/CTA/")
df_QA_clean.to_csv(data_folder / "last100_QA_clean.csv")



In [None]:
## fine-tune GPT-2 or T5 and evaluate model performance on new input data in the cleantech field.


We use the GPT-2 model in the following study.

In [None]:
# Imports for the fine-tuning section.
import seaborn as sns
import numpy as np
import time
import datetime
import random
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

# Seed torch's global RNG for this section.
torch.manual_seed(42)

# NOTE(review): GPT2LMHeadModel is listed twice in the import below (harmless).
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
# NOTE(review): transformers' own AdamW is deprecated in favour of torch.optim.AdamW —
# verify that the installed transformers version still exports it.
from transformers import AdamW, get_linear_schedule_with_warmup

# 'punkt' is needed by nltk.word_tokenize, used below for the rough token counts.
import nltk
nltk.download('punkt')

In [None]:
# Show the GPU attached to this runtime.
!nvidia-smi

####  1 Load the dataset into a data frame


In [None]:
import pandas as pd
from pathlib import Path

# Mount Google Drive so the cleaned QA CSV can be read.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Load the cleaned question/answer pairs into a data frame.
data_folder = Path("/content/drive/MyDrive/HSLU/CTA/")
filename = "last100_QA_clean.csv"
df = pd.read_csv(data_folder /filename)


# Plain-text preview of the first rows.
print(df.head())

In [None]:
# Copies of the question and answer columns, used below to estimate the
# maximum token length.


test_Q = df.Q.copy()
test_A = df.A.copy()
print(test_Q)
print("\n\n")
print(test_A)

In [None]:
def token_lengths(texts):
    """Return the NLTK word-token count of every text in ``texts`` as a NumPy array.

    The counts are only a rough estimate of sequence length; they are not the
    counts the GPT-2 BPE tokenizer would produce.
    """
    return np.array([len(nltk.word_tokenize(text)) for text in texts])


# Rough token-count distributions of the answers and of the questions.
doc_lengths_A = token_lengths(test_A)
doc_lengths_Q = token_lengths(test_Q)

# Plot both densities on one set of axes. histplot(kde=True, stat="density")
# replaces sns.distplot, which seaborn has deprecated.
ax = sns.histplot(doc_lengths_A, kde=True, stat="density", label="answers")
sns.histplot(doc_lengths_Q, kde=True, stat="density", color="C1", label="questions", ax=ax)
ax.set(xlabel="tokens per text (NLTK word_tokenize)", title="Token-length distribution")
ax.legend();

In [None]:
# Upper bound on the length of one question/answer pair: longest answer plus
# longest question (NLTK word counts).
doc_lengths_A.max() + doc_lengths_Q.max()

In [None]:
# Average token count of the questions only (answers are not included).
np.average(doc_lengths_Q)

Even though these token counts won't match the BPE tokenizer's, I'm confident that most question–answer pairs will fit within the 60-token maximum sequence length used for the GPT-2 dataset below.


#### 2 GPT2 Tokenizer



In [None]:
# Load the GPT-2 tokenizer and register the begin-of-sequence, end-of-sequence
# and padding tokens used when formatting the training examples.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>') #gpt2-medium

In [None]:
# Report the tokenizer's maximum model length and the ids of the special tokens.
# NOTE(review): the first message calls 60 the "embedding size"; 60 is the max_length
# this notebook passes to GPT2Dataset — confirm the wording.
print("The max model length is {} for this model, although the actual embedding size for GPT small is 60".format(tokenizer.model_max_length))
print("The beginning of sequence token {} token has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.bos_token_id), tokenizer.bos_token_id))
print("The end of sequence token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.eos_token_id), tokenizer.eos_token_id))
print("The padding token {} has the id {}".format(tokenizer.convert_ids_to_tokens(tokenizer.pad_token_id), tokenizer.pad_token_id))

#### 3 PyTorch Datasets & Dataloaders


I'm using the standard PyTorch approach of loading data in using a dataset class.

I'm passing in the tokenizer as an argument but normally I would instantiate it within the class.

In [None]:
class GPT2Dataset(Dataset):
  """Tokenised question/answer pairs for fine-tuning GPT-2 as a language model.

  Every row of ``df_qa`` becomes one training text of the form
  ``<|startoftext|>{Question:} <Q> {Answer:} <A><|endoftext|>``, which is
  truncated or padded to ``max_length`` tokens.

  Args:
    df_qa: data frame with a question column ``'Q'`` and an answer column ``'A'``.
    tokenizer: callable invoked as
      ``tokenizer(text, truncation=True, max_length=..., padding="max_length")``
      that returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
    gpt2_type: unused; kept so existing callers that pass it keep working.
    max_length: number of tokens every example is truncated/padded to.

  Indexing returns the pair ``(input_ids, attention_mask)`` as tensors.
  """

  def __init__(self, df_qa, tokenizer, gpt2_type="gpt2", max_length=60):

    self.tokenizer = tokenizer
    self.question_tokens = []
    self.question_masks = []
    # Never filled; kept so code that reads the attribute does not break.
    self.answer_tokens = []

    for _, row in df_qa.iterrows():
      question, answer = row['Q'], row['A']

      # Skip incomplete pairs: a missing value (NaN/None) cannot be concatenated
      # into the training text and would otherwise raise TypeError.
      if pd.isna(question) or pd.isna(answer):
        continue

      # str() protects against answers that pandas parsed as numbers.
      text = ('<|startoftext|>' + '{Question:} ' + str(question)
              + ' {Answer:} ' + str(answer) + '<|endoftext|>')
      encoded = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
      self.question_tokens.append(torch.tensor(encoded['input_ids']))
      self.question_masks.append(torch.tensor(encoded['attention_mask']))

  def __len__(self):
    """Number of encoded question/answer pairs."""
    return len(self.question_tokens)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask)`` for the example at ``idx``."""
    return self.question_tokens[idx], self.question_masks[idx]

##### SPLIT THE DATASET for training and evaluation.

In [None]:
# Tokenise every QA pair up front; sequences are padded/truncated to 60 tokens.
dataset = GPT2Dataset(df, tokenizer, max_length=60)

# 99.9 % of the samples go to training; the remainder is held out for validation.
n_samples = len(dataset)
train_size = int(0.999 * n_samples)
val_size = n_samples - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print(f'{train_size:>5,} training samples')
print(f'{val_size:>5,} validation samples')

In [None]:
# Number of examples per batch, used by both data loaders below.
batch_size = 2

##### Create the dataloaders



In [None]:
# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation order does not matter, so the samples are read sequentially.
val_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size)

#### 4 Finetune GPT2 Language Model


##### SET THE CONFIGURATION

In [None]:
# Load the pretrained GPT-2 configuration (hidden states are not returned).
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Instantiate the pretrained language model with that configuration.
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# The tokenizer received extra special tokens (bos/eos/pad), so the embedding
# matrix must be resized to match the new vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Seed every random number generator involved to make runs reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

##### set the parameters

In [None]:
# Training hyper-parameters (values chosen empirically by the author).

epochs = 10
learning_rate = 1e-3 # previously tried: 5e-4
warmup_steps = 0 # previously tried: 1e2
epsilon = 1e-8

# Interval, in training steps, of the periodic sampling hook in the training loop.
sample_every = 31

##### set the optimizer

In [None]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. weight_decay=0.0 is passed explicitly because
# torch's default (0.01) differs from the 0.0 default of the transformers class.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = learning_rate,
                              eps = epsilon,
                              weight_decay = 0.0
                             )

In [None]:
##### SET THE TRAINING STEPS AND SCHEDULER

In [None]:
# Total number of optimisation steps: [number of batches] x [number of epochs]
# (not the number of training samples).
total_steps = len(train_dataloader) * epochs

# Linear learning-rate schedule with warm-up, stepped once per batch in the
# training loop.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = warmup_steps,
                                            num_training_steps = total_steps)

In [None]:
def format_time(elapsed):
    """Format a duration given in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Fine-tune GPT-2: one training pass and one validation pass per epoch.
total_t0 = time.time()

# One dict per epoch (losses and timings); turned into a data frame afterwards.
training_stats = []

model = model.to(device)

for epoch_i in range(epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0

    model.train()

    for batch in train_dataloader:

        # Each batch is the (input_ids, attention_mask) pair from GPT2Dataset.
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Language-model targets are the inputs themselves. Padding positions are
        # set to -100 so the loss ignores them; otherwise the loss is dominated
        # by the padding that fills every short sequence.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        # With labels supplied, the first output is the language-modelling loss.
        loss = outputs[0]
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()

    # Average loss over all training batches of this epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0

    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Same padding-aware targets as in training.
        b_labels = b_input_ids.masked_fill(b_masks == 0, -100)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)
            loss = outputs[0]

        total_eval_loss += loss.item()

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

##### SHOW THE TRAINING RESULTS

In [None]:
# Show floats with two decimal places in the table below.
pd.set_option('display.precision', 2)

# One row per epoch, indexed by the epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Display the table.
df_stats

In [None]:
import matplotlib.pyplot as plt

# Seaborn styling with a larger font, and a wider figure.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve: one point per epoch for each loss.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
# One tick per recorded epoch. The previous hard-coded [1, 2, 3, 4] labelled
# only the first four epochs regardless of how many were trained.
plt.xticks(list(df_stats.index))

plt.show()

In [None]:
#####  Display Model Info

In [None]:
# All named parameters of the model as a list of (name, tensor) tuples.
params = list(model.named_parameters())

print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

# Each entry pairs a section heading with the slice of parameters listed under it.
sections = [
    ('==== Embedding Layer ====\n', params[0:2]),
    ('\n==== First Transformer ====\n', params[2:14]),
    ('\n==== Output Layer ====\n', params[-2:]),
]

for heading, section_params in sections:
    print(heading)
    for param_name, param_tensor in section_params:
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

##### Generate Text

In the cell below, you can change the question number `N` (0–374). The output shows the corresponding question and answer from the training data, followed by the answers generated by the fine-tuned model. You can also replace `question` with your own sentence to check how the fine-tuned model responds.

In [None]:

# Position of the training question to try.
N = 333

model.eval()

# Take the question from the training data. Assign a custom string to `question`
# instead to ask something else, e.g.
# question = 'which month did SDG & E begin commercial operation? '
question = df['Q'].iloc[N]

# NOTE(review): GPT2Dataset formats training texts as '{Question:} ... {Answer:} ...',
# while this prompt only appends 'answer:' — confirm that the difference is intended.
prompt = '<|startoftext|>' + question + 'answer:'

# Show the reference question/answer pair, then the question being asked.
print('\n\n')
print(df['Q'][N], df['A'][N])
print('\n')
print(f'prompt:  {question}')
print('\n')

# Encode the prompt as a batch of one sequence on the model's device.
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

# Sample three candidate continuations (top-k / nucleus sampling).
sample_outputs = model.generate(
    generated,
    do_sample=True,
    top_k=50,
    max_length=60,
    top_p=0.95,
    num_return_sequences=3,
)

import re

# Matches the text that follows an "answer:" marker (optionally closed by "}",
# in any letter case) up to the next "<", i.e. up to the next special token.
# The old pattern contained unescaped "|" characters, so "<|endoftext|>" was
# parsed as regex alternation instead of as the literal end-of-text token.
_ANSWER_PATTERN = re.compile(r'answer:\}?\s*([^<}][^<]*)', re.IGNORECASE)


def extract_answer(text):
    """Return the answer part of a decoded generation, or ``None``.

    Args:
        text: decoded model output, expected to contain an ``answer:`` marker
            (with or without a closing ``}``) followed by the answer text.

    Returns:
        The answer with surrounding whitespace removed, or ``None`` when the
        marker is absent or nothing but whitespace follows it.
    """
    match = _ANSWER_PATTERN.search(text)
    if match is None:
        return None
    answer = match.group(1).strip()
    return answer or None


# Decode every sampled sequence (special tokens kept, extract_answer relies on
# them) and print the extracted answer next to its position.
for i, sample_output in enumerate(sample_outputs):
  answer_text = tokenizer.decode(sample_output, skip_special_tokens=False)
  result = extract_answer(answer_text)
  print(f"{i}: {result}\n\n")

Adding the result column to df

In [None]:
# Preview the QA frame before the generated answers are added as a new column.
df.head()

In [None]:
# Generate one answer per training question with the fine-tuned model.
fine_tune_answer = []

model.eval()

for question in df['Q']:

  # Same prompt layout as in the single-question cell.
  prompt = '<|startoftext|>' + question + 'answer:'

  # Encode the prompt as a batch of one sequence on the model's device.
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

  # Only one sequence is requested because exactly one answer per question is
  # stored in the result column.
  sample_outputs = model.generate(
      generated,
      do_sample=True,
      top_k=50,
      max_length=50,
      top_p=0.95,
      num_return_sequences=1,
  )

  # Decode with special tokens kept (extract_answer relies on them) and store
  # the extracted answer.
  for sample_output in sample_outputs:
    answer_text = tokenizer.decode(sample_output, skip_special_tokens=False)
    fine_tune_answer.append(extract_answer(answer_text))




In [None]:
# Optional sanity check: the answer list must have one entry per row.
# len(df['A']) == len(fine_tune_answer)

# Add the generated answers to df as a new column and display the frame.
df['fine_tuned_answer'] = fine_tune_answer

df


## RAG

#### SET UP PINECONE

In [None]:
#set the sentence embedding database with pine-client
!pip install pinecone-client

In [None]:
# load the packages
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Prerequisite: register a Pinecone account and create an index named
# 'clean-energy' with 384 dimensions. The dimension must be 384 because the
# Hugging Face model 'sentence-transformers/paraphrase-MiniLM-L6-v2' is used.
#
# Model structure:
# SentenceTransformer(
#   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel
#   (1): Pooling({
#     'word_embedding_dimension': 384,
#     'pooling_mode_cls_token': False,
#     'pooling_mode_mean_tokens': True,
#     'pooling_mode_max_tokens': False,
#     'pooling_mode_mean_sqrt_len_tokens': False})
# )
import os
from getpass import getpass

# Never hard-code credentials in a notebook. The key is read from the
# PINECONE_API_KEY environment variable, or prompted for when it is not set.
# A key that was previously committed in this cell should be revoked.
pinecone_api_key = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")
pc = Pinecone(api_key=pinecone_api_key)
index_name = 'clean-energy'

In [None]:
# Open the index and show its statistics; before the first upsert it should be empty.
index = pc.Index(index_name)
index.describe_index_stats()

#### Sentence embedding from stage 3

In [None]:
# https://huggingface.co/sentence-transformers

!pip install -U sentence-transformers

In [None]:
# Reload the stage with the key sentences; the first CSV column is used as the index.
df1 = pd.read_csv(data_folder/'last100_QA.csv',index_col=0)

def clean_text_list(txt):
  """Turn one stored ``key_sentences`` value into a list of clean text passages.

  The value may be plain text or the string representation of a list of
  strings (``"['a', 'b']"``). Previously the first two characters were always
  cut off, which truncated plain text and left a trailing ``']`` on list
  representations.

  Args:
    txt: the cell value read from the CSV.

  Returns:
    A list of non-empty, whitespace-stripped strings. It is empty when ``txt``
    is not a string (e.g. a missing value) or contains only whitespace.
  """
  import ast

  if not isinstance(txt, str):
    return []

  text = txt.strip()
  if text.startswith('[') and text.endswith(']'):
    try:
      parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
      # Not a valid literal: treat it as ordinary text that happens to be bracketed.
      parsed = None
    if isinstance(parsed, (list, tuple)):
      cleaned = [str(item).strip() for item in parsed]
      return [item for item in cleaned if item]

  return [text] if text else []

In [None]:
#### prepare data for Pinecone

 - creating a list of embedding vectors

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence encoder. `se` was used here without ever being defined in the
# notebook, so a fresh kernel failed with NameError. The model is the one named
# in the Pinecone set-up notes (384-dimensional embeddings).
se = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

# Vectors to upsert, and a lookup from vector id back to the original passage.
embedding_list = []
original_dict = {}
for i, txt in enumerate(df1['key_sentences']):
  for j, s in enumerate(clean_text_list(txt)):
    # Id format: <row position>_<passage position within the row>.
    s_id = f'{i}_{j}'
    s_value = se.encode(s).tolist()
    original_dict[s_id] = s
    embedding_list.append(
        {"id": s_id, "values": s_value}
    )
print(len(embedding_list))

#### UPSERT TO PINECONE

Use the upsert operation to write the 100 384-dimensional vectors into Pinecone.

In [None]:
# Namespace inside the index that holds these vectors; get_context queries the same one.
ns = "last100"
# Write all prepared {"id", "values"} records to the index in one request.
index.upsert(
  vectors=embedding_list,
  namespace=ns
)

In [None]:
# Check that all vectors are now present in the index.
index.describe_index_stats()

In [None]:
def get_context(index, question, top_k):
    """Return the ``top_k`` index matches that are closest to ``question``.

    The question is embedded with the notebook-level sentence encoder ``se``
    and looked up in the namespace ``ns``.

    Args:
        index: Pinecone index object to query.
        question: question text.
        top_k: number of matches to return.

    Returns:
        The ``"matches"`` entry of the query response. Vector values are not
        included (``include_values=False``).
    """
    # Embed the question once and reuse the vector; it used to be encoded twice.
    xq = se.encode(question).tolist()
    xc = index.query(
        namespace=ns,
        vector=xq,
        top_k=top_k,
        include_values=False,
    )
    return xc["matches"]

In [None]:
# Pick one question/answer pair (row position 118) to use as an example query.
df[['Q','A']].iloc[118]

In [None]:
# Set the question and retrieve its five closest passages from the embedding index.

question = "Where is the Allen S. King plant located?"
context_list = get_context(index, question, top_k=5)

context_list

#### Question-answering Pipeline

In [None]:
# Extractive question answering:
# https://huggingface.co/docs/transformers/v4.17.0/en/task_summary#extractive-question-answering
# Note: this import rebinds the name `pipeline`, which earlier referred to the
# question-generation helper imported from the local `pipelines` module.
from transformers import pipeline
# The checkpoint is named explicitly (it is the library's default for this task)
# so that results do not depend on the default of the installed version.
question_answerer = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

In [None]:
# Run the extractive QA model on every retrieved passage and report both the
# retrieval score and the answer span that was found.
for match in context_list:
  passage = original_dict[match['id']]
  result = question_answerer(question=question, context=passage)
  answer = result['answer']
  answer_score = round(result['score'], 4)
  print()
  print(f"Embedding score: {match['score']}")
  print(f"Answer: '{answer}', score: {answer_score}, start: {result['start']}, end: {result['end']}")