### Import Library

In [None]:
import json
import os
import random
import re
from datetime import datetime
from getpass import getpass

import nltk
import numpy as np
import pandas as pd
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
# Fetch the NLTK resources used for tokenising, tagging and lemmatising.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoModelForMaskedLM
from huggingface_hub import login

# Seed every random number generator the notebook relies on. Text generation
# samples with torch (do_sample=True), so seeding only `random` and NumPy
# leaves the generated answers non-reproducible.
seed = 42
with_torch =True  # NOTE(review): not read anywhere in the visible notebook
with_cuda = True  # NOTE(review): not read anywhere in the visible notebook
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Displayed as the cell output: True when a CUDA device is usable.
torch.cuda.is_available()

Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

### Loading Data

In [None]:
# Read the competition files. The encoding is given explicitly so the result
# does not depend on the platform's default locale encoding.
train_path = '/kaggle/input/sogang-nlp-rag/qa_train.json'
with open(train_path, "r", encoding="utf-8") as json_file:
    train = json.load(json_file)

test_path = '/kaggle/input/sogang-nlp-rag/qa_test.json'
with open(test_path, "r", encoding="utf-8") as json_file:
    test = json.load(json_file)

# One corpus entry per line; readlines() keeps each line's trailing newline.
wiki_path = '/kaggle/input/sogang-nlp-rag/processed_wikipedia.txt/processed_wikipedia.txt'
with open(wiki_path, 'r', encoding="utf-8") as f:
    wiki_data = f.readlines()

### Loading Model

In [None]:
# Never store an access token in the notebook: read it from the HF_TOKEN
# environment variable and fall back to an interactive prompt.
# NOTE(review): the token that used to be hardcoded here has been exposed and
# should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

MODEL_ID = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)
# Last expression: moves the model to the GPU and displays its architecture.
model.to('cuda')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaR

### Preprocessing Wiki Data

In [None]:
def remove_special_characters(sentence):
    """Return `sentence` with every character removed that is not an ASCII
    letter, a digit or whitespace."""
    disallowed = r'[^A-Za-z0-9\s]'
    return re.sub(disallowed, '', sentence)

In [None]:
# Strip special characters from every corpus line (order is preserved).
clean_wiki = list(map(remove_special_characters, wiki_data))

### RAG Architecture

In [None]:
class VectorDatabase:
    """TF-IDF index over a collection of documents.

    The vectorizer is fitted on `document_data` at construction time and the
    resulting document matrix is kept for later similarity look-ups.
    """

    def __init__(self, document_data):
        tfidf = TfidfVectorizer()
        self.documents = document_data
        self.vectorizer = tfidf
        self.document_embeddings = tfidf.fit_transform(document_data)

    def search_similar_doc(self, query):
        """Return the stored document with the highest cosine similarity to `query`."""
        query_vector = self.vectorizer.transform([query])
        scores = cosine_similarity(query_vector, self.document_embeddings)
        # `scores` has a single row because exactly one query was transformed.
        top_hit = scores[0].argmax()
        return self.documents[top_hit]

def generate_with_prompt_tuning(query, max_new_tokens=512):
    """Ask the language model for a short answer to `query`.

    Despite the name, no parameters are tuned: the question is wrapped in a
    fixed instruction prompt and the module-level `model` samples a completion.

    Args:
        query: The question text.
        max_new_tokens: Upper bound on generated tokens. The default matches
            the budget that was previously available (512 new tokens).

    Returns:
        The first line of the generated completion, stripped of surrounding
        whitespace.
    """
    prompt = f"Answer the following question with a one word only.: {query}\nAnswer:"
    # A single sequence needs no padding; padding to 512 tokens only made the
    # model process hundreds of pad tokens per question. Calling the tokenizer
    # (rather than `encode`) also yields the attention mask for `generate`.
    inputs = tokenizer(
        prompt, return_tensors='pt', max_length=512, truncation=True
    ).to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        temperature=0.5,
        do_sample=True,
    )
    # Decode only the newly generated tokens. Splitting the full text on
    # "Answer:" returned the wrong span when the question contained that marker.
    prompt_length = inputs['input_ids'].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return completion.strip().split('\n')[0]

def rag(document_data, queries):
    """Retrieve a passage and generate an answer for every query.

    Args:
        document_data: Documents to index with `VectorDatabase`.
        queries: Sequence of mappings, each with a 'question' entry.

    Returns:
        A dict with two sub-dicts keyed by the query position as a string:
        'answer' holds the generated answer and 'sp' the retrieved document.
        The retrieved document is only recorded; it is not passed to the
        generator.
    """
    index = VectorDatabase(document_data)
    answers_by_id = {}
    passages_by_id = {}

    for position, item in tqdm(enumerate(queries), total=len(queries)):
        key = str(position)
        question = item['question']
        passages_by_id[key] = index.search_similar_doc(question)
        answers_by_id[key] = generate_with_prompt_tuning(question)

    return {"answer": answers_by_id, "sp": passages_by_id}

In [None]:
# Quick sanity check: generate answers for the first few test questions.
# `test` is the list of question records loaded from qa_test.json; the
# previously referenced `test_df` is never defined in this notebook.
preview_questions = [item['question'] for item in test[:5]]

queries = {f'query_{i}': question for i, question in enumerate(preview_questions)}
answers = {
    f'answer_{i}': generate_with_prompt_tuning(question)
    for i, question in enumerate(preview_questions)
}

df = pd.DataFrame({'query': list(queries.values()),
                   'answer': list(answers.values())})
df

Unnamed: 0,query,answer
0,Mike Huckabee took part in the most recent Rep...,2016
1,Pabst Brewing Company is currently a holding c...,Anheuser-Busch
2,David Levien also wrote what film about two fr...,The Gambler.
3,Who was the main designer of the horror video ...,Wes Craven.
4,What kind of film making do René Clément and E...,Short films.


### Running RAG

In [None]:
# Smoke test: run the pipeline on the first ten test questions only.
smoke_test_queries = test[:10]
predictions = rag(document_data=clean_wiki, queries=smoke_test_queries)
predictions

In [None]:
# Full run: answer every test question, retrieving from the first 100000
# cleaned corpus lines.
wiki_subset = clean_wiki[:100000]
predictions = rag(document_data=wiki_subset, queries=test)

index_key = [*predictions['answer']]
answers = [*predictions['answer'].values()]
# Despite the name, `queries` holds the retrieved passages ('sp'), not the questions.
queries = [*predictions['sp'].values()]

submission = pd.DataFrame(
    {'id': index_key, 'sentences': answers, 'queries': queries}
).astype(str)
submission.head()

### Post Processing

In [None]:
# Run `python -m spacy download en_core_web_md` once before first use; the model only needs to be downloaded one time.

In [None]:
import spacy
from symspellpy import SymSpell

# Initialize SymSpell
sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)

# Load the standard English frequency dictionary. The file ships with the
# symspellpy package; make sure a copy exists at the path below before running.
dictionary_path = "/kaggle/input/sogang-nlp-rag/frequency_dictionary_en_82_765.txt"
# load_dictionary reports a missing file through its return value, so check it
# instead of silently continuing with an empty dictionary.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell frequency dictionary not found: {dictionary_path}")
nlp = spacy.load("en_core_web_md")

def correct_spelling(text):
    """Return SymSpell's top compound-lookup suggestion for `text`
    (edit distance up to 3), or `text` unchanged when there is no suggestion."""
    candidates = sym_spell.lookup_compound(text, max_edit_distance=3)
    if not candidates:
        return text
    return candidates[0].term

def is_word_in_dictionary(word):
    """Tell whether `word` is an entry of the SymSpell dictionary."""
    known_words = sym_spell.words
    return word in known_words

def correct_proper_noun(proper_nouns):
    """Spell-correct an entity string, keeping it as written when the
    correction is not a dictionary word.

    When the corrected form is in the SymSpell dictionary it is returned.
    Otherwise the original string is added to the dictionary with count 1
    and returned unchanged.
    """
    candidate = correct_spelling(proper_nouns)
    if not is_word_in_dictionary(candidate):
        # Register the original spelling so it counts as a known word from now on.
        sym_spell.create_dictionary_entry(proper_nouns, 1)
        return proper_nouns
    return candidate

# Spell-correct a text while treating the named entities found by spaCy separately.
def convert_to_ner_format(text):
    """Spell-correct `text` piece by piece around its named entities.

    Text between entities goes through `correct_spelling`; each entity goes
    through `correct_proper_noun`. No entity labels are added to the output.
    Every piece inside the loop is followed by a single space.

    Returns:
        The reassembled string. When the result is empty, `text` is returned
        unchanged. When it ends with a space, that space is replaced by
        '.\n'.
    """
    doc = nlp(text)
    pieces = []
    last_end = 0

    for ent in doc.ents:
        # Plain text that precedes the entity.
        pieces.append(correct_spelling(text[last_end:ent.start_char]) + ' ')
        # The entity itself. Order matters: correct_proper_noun may add the
        # entity to the dictionary used by later correct_spelling calls.
        pieces.append(correct_proper_noun(ent.text) + ' ')
        last_end = ent.end_char

    # Plain text after the last entity (the whole text when there are none).
    pieces.append(correct_spelling(text[last_end:]))
    ner_formatted_text = ''.join(pieces)

    if not ner_formatted_text:
        return text
    if ner_formatted_text.endswith(' '):
        return ner_formatted_text[:-1] + '.\n'
    return ner_formatted_text

In [None]:
def post_processing(sentence):
    """Remove newlines, then every character that is neither a word
    character nor whitespace, from `sentence`."""
    for pattern in (r'\n', r'[^\w\s]'):
        sentence = re.sub(pattern, '', sentence)
    return sentence

In [None]:
# NOTE: this cell rewrites `submission` in place. Re-running it applies the
# corrections again to already-processed text, so rebuild `submission` first.
submission['sentences'] = submission['sentences'].apply(post_processing)
submission['queries'] = (
    submission['queries'].apply(post_processing).apply(convert_to_ner_format)
)

# Remove null values. The frame was built with .astype(str), so missing values
# show up as empty strings rather than NaN and a bare fillna() never matched.
# Blank cells are turned into NaN first so the placeholder is applied.
# NOTE(review): this placeholder ('none answer') differs from the one used
# below ('none_answer'); confirm which spelling the submission expects.
submission = submission.replace(r'^\s*$', np.nan, regex=True).fillna('none answer')
submission['id'] = submission['id'].astype('int64') # change id type to int
submission['sentences'] = submission['sentences'].replace(['None', 'NA'], 'none_answer') # replace None and NA to none_answer
submission.head()

### Save Submission File

In [None]:
# Timestamp the file name so repeated runs never overwrite an earlier submission.
# `datetime` comes from the notebook's import cell (`from datetime import datetime`).
current_time = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
file_name = f'submission_{current_time}.csv'
submission.to_csv(file_name, index=False, encoding='utf-8-sig')
# Show where the file was written.
file_name