In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
import os
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import math
import json
import numpy as np
import pickle
import nltk
from nltk.corpus import stopwords
from deep_translator import GoogleTranslator

In [None]:
# Resolve the OpenAI API key without tying the notebook to one machine:
#   1. use the OPENAI_API_KEY environment variable when it is set;
#   2. otherwise read the key file (its location can be overridden with OPENAI_KEY_FILE;
#      the default is the original hard-coded path).
OPENAI_KEY_FILE = os.environ.get('OPENAI_KEY_FILE', '/home/martin/openaiKey.txt')

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', '').strip()
if not OPENAI_API_KEY:
    with open(OPENAI_KEY_FILE, 'r') as file:
        key_content = file.read()

    OPENAI_API_KEY = str(key_content.strip())

# Fail here rather than with an opaque authentication error on the first model call.
if not OPENAI_API_KEY:
    raise ValueError(f"No OpenAI API key found in OPENAI_API_KEY or in {OPENAI_KEY_FILE!r}")

In [None]:
# Restore the precomputed artefacts: one embedding per context name, and the context texts.
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself.
with open('contexts_embeddings.pkl', mode='rb') as f:
    embeddings = pickle.loads(f.read())

with open('contexts.pkl', mode='rb') as f:
    contexts = pickle.loads(f.read())

In [None]:
# Checkpoint shared by the tokenizer and the encoder; keep both on the same name.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

# No `device` argument is passed, so the pipeline runs on its default device;
# add `device=0` to the call to run on the first GPU instead.
feature_extraction_pipeline = pipeline(
    'feature-extraction',
    tokenizer=tokenizer,
    model=model,
)

In [None]:
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
# def preprocess(sentence):
    
#     #SOME OTHER PREPROCESSING
    
#     #REMOVE STOPWORDS
#     filtered_tokens = [word for word in sentence.split() if word.lower() not in stop_words]
#     return ' '.join(filtered_tokens)


#PREPROCESSING FOR CONTEXT AND QUESTION EMBEDDINGS SHOULD PROBABLY BE THE SAME  

In [None]:
#Given a prompt in czech, translate into english and embed using a previously defined pipeline
def get_embedding(prompt, chunk_size=200, source_lang='cs', target_lang='en'):
    """Translate a prompt and embed it with the module-level ``feature_extraction_pipeline``.

    The translated text is cut into non-overlapping chunks of ``chunk_size``
    characters. Each chunk is embedded by averaging the first element of the
    pipeline output over axis 0 (the token axis for a Hugging Face
    feature-extraction pipeline), and the chunk vectors are then averaged with
    equal weight.

    Args:
        prompt: Text to embed, written in ``source_lang``.
        chunk_size: Number of characters per chunk; must be at least 1.
        source_lang: Language code of ``prompt`` (default ``'cs'``, Czech).
        target_lang: Language code to translate into (default ``'en'``).

    Returns:
        A numpy array of shape ``(1, dim)`` holding the prompt embedding.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1, or if the translator
            returns no text (there would be nothing to embed).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # Translate
    translated = GoogleTranslator(source=source_lang, target=target_lang).translate(prompt)
    if not translated:
        # Without this guard an empty translation produces a NaN "embedding" of shape (1, 1)
        # and a None translation fails later with an unrelated TypeError.
        raise ValueError("Translation returned no text; cannot embed an empty prompt.")

    # Split the text into non-overlapping, fixed-width character chunks
    chunks = [translated[i:i + chunk_size] for i in range(0, len(translated), chunk_size)]

    # One vector per chunk: mean over axis 0 of the pipeline output for that chunk
    chunk_embeddings = [
        np.mean(feature_extraction_pipeline(chunk)[0], axis=0)
        for chunk in chunks
    ]

    # Unweighted mean over chunks, returned as a single-row 2-D array
    prompt_embedding = np.mean(chunk_embeddings, axis=0)
    return np.asarray(prompt_embedding).reshape(1, -1)

In [None]:
#Given contexts and prompt embeddings, find most appropriate context for the prompt.
def get_class(embeddings, prompt_embedding):
    """Return the name of the stored embedding most similar to the prompt.

    Args:
        embeddings: Mapping of context name -> embedding. An embedding may be a
            1-D vector, a single-row 2-D array, or a 2-D array with several
            rows; in the last case the best-matching row counts.
        prompt_embedding: Prompt embedding, 1-D or of shape ``(1, dim)``.

    Returns:
        The key of ``embeddings`` with the highest cosine similarity to the
        prompt (the first such key on ties), or ``None`` when ``embeddings``
        is empty.
    """
    query = np.atleast_2d(prompt_embedding)
    best_name = None
    # Start below every possible cosine value; starting at -1 would skip an exact -1 match.
    best_sim = float('-inf')
    for emb_name, emb_t in embeddings.items():
        # cosine_similarity returns a 2-D matrix; reduce it to one scalar score per context
        sim = float(np.max(cosine_similarity(query, np.atleast_2d(emb_t))))
        if sim > best_sim:
            best_name, best_sim = emb_name, sim
    return best_name

In [None]:
# Chat model behind the conversation: an OpenAI fine-tune id (`ft:` prefix) sampled at temperature 0.75.
llm = ChatOpenAI(
    model_name='ft:gpt-3.5-turbo-1106:personal::8RjGTQTI',
    temperature=0.75,
    openai_api_key=OPENAI_API_KEY,
)

In [None]:
from langchain.prompts.prompt import PromptTemplate

# Prompt template: a Czech persona instruction, then the running history and the new human turn.
# It is built from explicit literals so the trailing spaces after the first and fifth lines stay visible.
template = (
    "Napodobuješ Českého premiéra Petra Fialu. Odpovídej na prompty, jako by si byl on. Odpovídej česky a v několika větách. \n"
    "\n"
    "Current conversation:\n"
    "{history}\n"
    "Human: {input} \n"
    "Petr Fiala:"
)
print(template)

PROMPT = PromptTemplate(template=template, input_variables=["history", "input"])

# The memory's AI prefix matches the "Petr Fiala:" speaker label used in the template.
conversation = ConversationChain(
    llm=llm,
    prompt=PROMPT,
    memory=ConversationBufferMemory(ai_prefix="Petr Fiala"),
    verbose=True,
)


In [None]:
# Look up the stored context closest to the question and append it to the prompt text.
prompt = "Co si myslíš o Robertu Ficovi?"
prompt_emb = get_embedding(prompt)
prompt_class = get_class(embeddings, prompt_emb)
formatted_prompt = "{}. Context: {}".format(prompt, contexts[prompt_class])

In [None]:
# Send the context-augmented prompt through the conversation chain.
# The call is the last expression in the cell, so the reply is displayed as the cell output.
conversation.predict(input=formatted_prompt)

In [None]:
prompt2 = "Co si myslíš o Robertu Ficovi?"

# TODO: What to do with the context now? What should be passed as input? The first context is already
# stored in the history, so maybe select only from the remaining contexts + apply a threshold?

# prompt_emb = get_embedding(prompt)
# prompt_class = get_class(embeddings, prompt_emb)
# formatted_prompt = f"{prompt}. Context: {contexts[prompt_class]}"