In [None]:
!pip install PyPDF2
!pip install -qq -U keras>=3
!pip install -qq -U keras-nlp
!pip install -U sentence-transformers
!pip install -qq -U /kaggle/working/sentence-transformers
!pip install -qq -U /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl
!pip install -qq -U pip ipywidgets jupyter Pyarrow tensorflow-cpu tensorflow-hub tensorflow-text faiss-gpu
!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers/kaggle/working/sentence-transformers

In [None]:
#import libraries

# Backend configuration comes first: Keras picks its backend from KERAS_BACKEND when
# it is imported, so these variables have to be set before `import keras` and
# `import keras_nlp` below.
import os
os.environ["KERAS_BACKEND"] = "jax"
os.environ["XLA_PYTHON_CLIENT_MEM_FRACTION"] = "1.00"

import json
import warnings
from collections.abc import Iterable

import faiss
import keras
import PyPDF2
import keras_nlp
import numpy as np
import pandas as pd
import blingfire as bf

from tqdm.notebook import tqdm
from nltk.tokenize import sent_tokenize
from IPython.display import display, Markdown
from sentence_transformers import SentenceTransformer

# Silence library warnings for the rest of the notebook.
warnings.filterwarnings(action= 'ignore')

# Question Generation

In [None]:
# Read every page of the source PDF and flatten it into one single-line string.
page_texts = []
with open('/kaggle/input/pdf-input/P1_7pg_Python_DA_Fabio.pdf', 'rb') as file:
    reader_pdf = PyPDF2.PdfReader(file)
    for page in reader_pdf.pages:
        # Guard against a page that yields no text (empty string or None).
        page_texts.append(page.extract_text() or '')

# Line breaks and page boundaries become a single space instead of being deleted:
# deleting them fuses the last word of one line with the first word of the next
# ("data\nanalysis" -> "dataanalysis") and hides sentence boundaries from the
# tokenizer. split()/join also collapses any resulting runs of whitespace.
text = ' '.join(' '.join(page_texts).split())
sentences = sent_tokenize(text)

In [None]:
# Load the Gemma causal LM from the "gemma_instruct_2b_en" preset and print its summary.
gemma_preset = "gemma_instruct_2b_en"
gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(gemma_preset)
gemma_lm.summary()

In [None]:
# Ask the model for questions about the PDF text, then keep the numbered lines.
prompt = f'Can u generate 25 questions from this {text}?'
answer = gemma_lm.generate(prompt, max_length=6000)

# A line counts as a question when its first character is a non-zero decimal digit
# (e.g. "1. What is ..."). This is the same filter the previous int()-based check
# applied, but written as an explicit test instead of a bare `except: pass`, which
# would also have hidden unrelated errors and keyboard interrupts.
questions = [
    line
    for line in answer.split('\n')
    if line and line[0].isdecimal() and int(line[0]) != 0
]

In [None]:
# Ask the model how many relevant questions it could produce from the PDF text.
# The reply is stored in `answer`; nothing in this cell displays it.
prompt = 'How many relevant questions can you generate from this ' + text + '?'
answer = gemma_lm.generate(prompt, max_length=8500)
 

# Question Answering before RAG, without the PDF File.

In [None]:
# Answer each generated question with the model alone (no extra context).
answers_before_rag = []
for question in tqdm(questions):
    generated = gemma_lm.generate(question, max_length=128)
    pieces = generated.split('Answer')
    if len(pieces) >= 2:
        # Keep the segment right after the first "Answer" marker and drop '*' characters.
        generated = pieces[1].replace('*', '')
    # Newlines and colons are removed in both cases.
    answers_before_rag.append(generated.replace('\n', '').replace(':', ''))

In [None]:
# Collect the questions and the model-only answers in one frame.
answer_column = 'Answers_before_RAG_without_based_on_PDF_File'
df = pd.DataFrame(zip(questions, answers_before_rag), columns= ['Questions', answer_column])


def _drop_echoed_question(answer):
    """Return the segment after the first '?' when the answer still contains a question.

    An answer mentioning 'What' or 'How' is treated as echoing the question; the text
    between its first and second '?' is kept. Answers without a '?' are returned
    unchanged instead of raising IndexError.
    """
    if ('What' in answer or 'How' in answer) and '?' in answer:
        return answer.split('?')[1]
    return answer


df[answer_column] = df[answer_column].map(_drop_echoed_question)

# Remove only the leading "N." numbering of each question. Replacing the prefix text
# (and the characters '1' and '2') everywhere in every row also deleted digits inside
# the question text and left stray leading spaces behind.
df['Questions'] = df['Questions'].str.replace(r'^\s*\d+\.\s*', '', regex=True)

# Question Answering before RAG, based on the PDF File.

In [None]:
# Prompt layout used when the PDF text is embedded in the system line.
# Placeholders: {context}, {pdf_file}, {instruction}, {response}.
# NOTE(review): "asnwer" is a typo in the prompt; it is kept so the prompt text stays
# byte-identical.
template = (
    "\n"
    "Context: {context}\n"
    "\n"
    "System: As a Data analyst, You are gonna asnwer questions based on {pdf_file}.\n"
    "Question: {instruction}\n"
    "\n"
    "Answer: {response}\n"
)

In [None]:
# Answer every question again, this time with the full PDF text placed in the prompt.
answers_based_on_pdf_file = []
for question in tqdm(df['Questions']):
    prompt = template.format(context="", pdf_file=text, instruction=question, response="")
    generated = gemma_lm.generate(prompt, max_length=2056)
    # Keep what follows the last "Answer" marker, then drop colons and " \n" sequences.
    cleaned = generated.split('Answer')[-1].replace(':', '').replace(' \n', '')
    answers_based_on_pdf_file.append(cleaned)

df['Answers_before_RAG_based_on_PDF_File'] = answers_based_on_pdf_file

In [None]:
# Display the frame of questions and the answers collected so far.
df

# Question Answering with Wikipedia RAG

In [None]:
# Retrieval settings.
batch_size = 64
num_sentences_include = 5  # number of nearest retrieved texts searched for per question
wiki_files_path = "/kaggle/input/wikipedia-20230701"
wiki_index_path = wiki_files_path + "/wiki_2023_index.parquet"

# Sentence encoder used for the questions and for the retrieved Wikipedia text.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
model.max_seq_length = 512

# FAISS index over Wikipedia, read from disk.
sentence_index = faiss.read_index("/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index")

In [None]:
# Retrieval-augmented answering: for each question, look up Wikipedia rows with the
# FAISS index, re-rank the retrieved texts against the question, put the closest ones
# into the prompt as context and let the model answer.

# The id -> file lookup table does not depend on the question, so it is read once
# instead of once per question.
wiki_df = pd.read_parquet(wiki_index_path, columns= ['id', 'file'])

# Prompt layout; written with explicit "\n" so the text (including the 4-space
# indentation of each line) matches the triple-quoted literal it replaces.
rag_template = (
    "\n"
    "    Context: {context}\n"
    "\n"
    "    System: You are the Data Analyst, please answer the questions.\n"
    "    Question: {instruction}\n"
    "\n"
    "    Answer: {response}\n"
    "    "
)
# Same max_length as the cell that answers from the PDF text.
rag_max_length = 2056

answers_after_RAG = list()
for question in df['Questions']:
    # search relevant context
    query_embeddings = model.encode(question, batch_size= batch_size, show_progress_bar= True, convert_to_tensor= True, normalize_embeddings= True)
    query_embeddings = query_embeddings.detach().cpu().numpy().reshape(1, -1)
    search_score, search_index = sentence_index.search(query_embeddings, 10)
    search_index = search_index.flatten()

    # get wiki files
    wiki_files = wiki_df.iloc[search_index].drop_duplicates().sort_values(['file', 'id']).reset_index(drop=True)

    # get wiki text
    wiki_text = []
    for file in tqdm(wiki_files.file.unique(), total=wiki_files.file.unique().size):
        idx = [str(i) for i in wiki_files[wiki_files['file'] == file]['id'].tolist()]
        temp_wiki = pd.read_parquet(f"{wiki_files_path}/{file}", columns=['id', 'text'])
        temp_df = temp_wiki[temp_wiki['id'].isin(idx)].copy()
        wiki_text.append(temp_df)
    wiki_text = pd.concat(wiki_text).drop_duplicates().reset_index(drop=True)

    # extract context: embed the retrieved texts and keep those closest to the question
    wiki_embeddings = model.encode(wiki_text['text'].tolist(),
                                   batch_size= batch_size,
                                   show_progress_bar= True,
                                   convert_to_tensor= False,
                                   normalize_embeddings= True)
    wiki_embeddings = np.array(wiki_embeddings)
    dimension = wiki_embeddings.shape[1]
    prompt_index = faiss.IndexFlatL2(dimension)
    prompt_index.add(wiki_embeddings)
    # Never ask for more neighbours than there are texts: FAISS pads missing results
    # with -1, which `.iloc` would silently resolve to the last row.
    num_contexts = min(num_sentences_include, len(wiki_text))
    D, I = prompt_index.search(query_embeddings, num_contexts)
    contexts = ' '.join(wiki_text['text'].iloc[i] for i in I[0])

    # process query
    # NOTE(review): the retrieved 'text' rows are inserted whole; if they are long the
    # prompt may exceed rag_max_length — confirm row lengths and shorten the context
    # if needed.
    promt_rag = rag_template.format(context=contexts, instruction=question, response="")

    # Actually run the model on the prompt. Previously the "answer" was cut out of the
    # prompt string itself, so every stored answer was blank.
    generated = gemma_lm.generate(promt_rag, max_length=rag_max_length)
    answer_after_rag = generated.split('Answer')[-1]
    answer_after_rag = answer_after_rag.replace(': ', '').replace('\n', '')
    answers_after_RAG.append(answer_after_rag)

In [None]:
# Attach the answers collected in `answers_after_RAG` as a new column of `df`.
df['Answers_after_RAG'] = answers_after_RAG

# Fine Tuning with LoRA

In [None]:
# Fixed list of 25 numbered questions used for the fine-tuning comparison.
# The wording is stored without numbering; the "N. " prefix is added below.
# Items 9/16 and 10/17 are intentionally left as duplicates, exactly as listed.
_question_texts = (
    'What is the purpose of data analysis?',
    'What is the difference between information and data?',
    'What is data analysis?',
    'What are the different types of data?',
    'What is the data analysis process?',
    'What is the difference between data analysis and model building?',
    'What is the role of data visualization in data analysis?',
    'What are the different types of data visualization?',
    'What is the purpose of data exploration and visualization?',
    'What is the purpose of predictive modeling?',
    'How does the predictive power of a model depend on the quality of modeling techniques?',
    'What is the importance of choosing a good dataset for data analysis?',
    'What are the preliminary activities of data analysis?',
    'What is the purpose of data cleaning?',
    'What is the purpose of data transformation?',
    'What is the purpose of data exploration and visualization?',
    'What is the purpose of predictive modeling?',
    'How does data analysis contribute to professional activities?',
    'What are the tools and methodologies required for data analysis?',
    'What is the role of interdisciplinary team members in data analysis?',
    'What are the different types of categorical data?',
    'What are the different types of numerical data?',
    'What is the purpose of data analysis in different fields of applications?',
    'What is the purpose of data analysis in a world increasingly centralized around information technology?',
    'What are the challenges and opportunities associated with data analysis?',
)
questions = [f'{number}. {wording}' for number, wording in enumerate(_question_texts, start=1)]


In [None]:
# Build the fine-tuning corpus from the Dolly 15k JSONL file: every record with an
# empty "context" field is rendered through `template`, up to `max_examples` records.

# Defined once, outside the loop: it does not change per record, and the generation
# cells further down rely on `template` existing even if no record qualifies.
# Explicit "\n" keeps the text (including the 8-space indentation of each line)
# identical to the triple-quoted literal it replaces.
template = (
    "\n"
    "        Question: {instruction}\n"
    "\n"
    "        Response: {response}\n"
    "        "
)
max_examples = 1000

data = []
with open('/kaggle/input/databricks-dolly-15k/databricks-dolly-15k.jsonl', encoding='utf-8') as file:
    for line in file:
        features = json.loads(line)
        if features["context"]:
            # Skip records that come with a context passage.
            continue
        data.append(template.format(**features))
        if len(data) >= max_examples:
            # Enough examples collected; no need to parse the rest of the file.
            break

In [None]:
# Generate a response to the first question and print it.
prompt = template.format(instruction=questions[0], response="")
generated_text = gemma_lm.generate(prompt, max_length=256)
print(generated_text)

In [None]:
# Turn on LoRA for the backbone with rank 4, then print the model summary again.
lora_rank = 4
gemma_lm.backbone.enable_lora(rank=lora_rank)
gemma_lm.summary()

In [None]:
# Set the preprocessor's sequence length to 512 for training.
gemma_lm.preprocessor.sequence_length = 512

# AdamW optimizer; variables named "bias" and "scale" are excluded from weight decay.
adamw = keras.optimizers.AdamW(learning_rate=0.0001, weight_decay=0.01)
adamw.exclude_from_weight_decay(var_names=["bias", "scale"])

# The loss is computed on logits; accuracy is tracked as a weighted metric.
token_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
token_accuracy = keras.metrics.SparseCategoricalAccuracy()
gemma_lm.compile(loss=token_loss, optimizer=adamw, weighted_metrics=[token_accuracy])

# One pass over the prepared examples, one example per batch.
gemma_lm.fit(data, epochs=1, batch_size=1)

In [None]:
# Generate a response to the first question again, after the fit call above, and print it.
prompt = template.format(instruction=questions[0], response="")
generated_text = gemma_lm.generate(prompt, max_length=256)
print(generated_text)
