In [1]:
#@title Initial configuration
# Python packages used by the notebook (install output is discarded).
!pip install langchain openai elevenlabs > /dev/null 2>&1
!pip install pytesseract > /dev/null 2>&1
!pip install pdf2image > /dev/null 2>&1
!pip install fpdf > /dev/null 2>&1

!pip install unstructured tiktoken chromadb chroma pypdf > /dev/null 2>&1
# System packages: Tesseract OCR and Poppler utilities (apt output is discarded).
!apt-get install tesseract-ocr > /dev/null 2>&1
!apt-get install libtesseract-dev > /dev/null 2>&1
!apt-get install poppler-utils > /dev/null 2>&1

# Download Polish language data file
!wget https://github.com/tesseract-ocr/tessdata/raw/main/pol.traineddata > /dev/null 2>&1

# Specify the Tesseract data directory
# NOTE(review): the path is hard-coded for a Tesseract 4.00 layout — confirm it matches the installed version.
tessdata_dir = '/usr/share/tesseract-ocr/4.00/tessdata/'

# Move the downloaded language data file to the Tesseract data directory
!mv pol.traineddata $tessdata_dir

# Modules needed to read the API keys and configure the environment
import os
import elevenlabs
from getpass import getpass

# Both keys are typed in interactively so they never end up in the notebook source.
OPENAI_API_KEY = getpass("OPENAI_API_KEY: ")
elevenlabs.set_api_key(getpass("11_LABS_API_KEY: ")) 

# Set the TESSDATA_PREFIX and OPENAI_API_KEY environment variables
os.environ['TESSDATA_PREFIX'] = tessdata_dir
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


OPENAI_API_KEY: ··········
11_LABS_API_KEY: ··········


## **Utils**

In [2]:
# @title Convert .md -> .pdf { display-mode: "form" }
!sudo apt-get install pandoc texlive-latex-base texlive-fonts-recommended texlive-extra-utils texlive-latex-extra > /dev/null 2>&1

def md_to_pdf():
  """Converts md file into pdf."""
  !pandoc /content/sample_data/conversation.md -o /content/sample_data/conversation.pdf


In [3]:
# @title Downloader { display-mode: "form" }
from google.colab import files

def download_conversation(text, file_path="/content/sample_data/conversation"):
  """Save ``text`` as Markdown, convert it to PDF and offer the PDF for download.

  Args:
    text: Markdown content of the conversation.
    file_path: Target path WITHOUT extension; ``<file_path>.md`` and
      ``<file_path>.pdf`` are (over)written.

  Raises:
    subprocess.CalledProcessError: if pandoc exits with a non-zero status.
  """
  import subprocess

  md_path = f"{file_path}.md"
  pdf_path = f"{file_path}.pdf"

  # Explicit UTF-8 so the non-ASCII (Polish) text is written the same way everywhere.
  with open(md_path, "w", encoding="utf-8") as f:
    f.write(text)

  # Convert exactly the file written above. Previously the conversion always
  # targeted the default path, so a custom file_path downloaded a missing or
  # stale PDF. check=True stops here instead of downloading an outdated file.
  subprocess.run(["pandoc", md_path, "-o", pdf_path], check=True)
  files.download(pdf_path)


In [4]:
#@title Print Markdown { display-mode: "form" }
from IPython.display import Markdown

def print_md(text=r'#**Hello** *World:* $\frac{1}{2} \times \pi \times r^2$'):
    """Render ``text`` as Markdown in the notebook's output area."""
    rendered = Markdown(text)
    display(rendered)

# Smoke test: renders the default sample text.
print_md()

#**Hello** *World:* $\frac{1}{2} \times \pi \times r^2$

In [5]:
#@title 11Labs { display-mode: "form" }
from elevenlabs import generate, play

def generate_and_play(text="Some very long text to be read by the voice"):
  """Turn ``text`` into speech with ElevenLabs and play it inside the notebook."""
  # Synthesis and playback are both delegated to the elevenlabs helpers.
  speech = generate(text=text, model='eleven_multilingual_v1')
  play(speech, notebook=True)


## **Agent Tools**

In [26]:
#@title Menu View
# English menu shown to the user (Markdown). Bold marks the default setting.
MENU_VIEW_ENG = \
"""# Welcome!
## Settings:
 - audio (**on**/off)
 - text formatting (**.md**/.txt)
 - language

## Available programs:
 - compendium - university courses content scraper and personal tutor
 - img2pdf - converter for images to pdf
"""

# Polish counterpart of the menu above; program names must match the English ones.
MENU_VIEW_PL = \
"""# Witaj
## Ustawienia:
 - audio (**wł** / wył)
 - formatowanie tekstu (**.md** / .txt)
 - język

## Dostępne narzędzia:
 - Compendium -  Wyszukiwarka wiedzy uniwersyteckiej i osobisty tutor
 - img2pdf - Konwerter zdjęć do pdf
"""

In [7]:
#@title ChatConfiguration { display-mode: "form" }
from pydantic import BaseModel, Field
from typing import Literal

from langchain.callbacks import get_openai_callback
from langchain.tools import tool

# Allowed values of the chat settings; also used as argument types of the agent tools.
# "img2pdf" is included because the conversation cell handles that use case.
USE_CASE_LITERAL = Literal["menu", "compendium", "img2pdf"]
STATE_AUDIO_LITERAL = Literal["on", "off"]
STATE_TEXT_LITERAL = Literal["md", "txt"]

class ChatConfigurationSingleton:
  """Shared chat settings plus running OpenAI usage totals.

  Every construction returns the same object, so settings changed through one
  reference are visible through all others.
  """
  __instance = None

  # Current settings with their defaults.
  use_case: USE_CASE_LITERAL = "menu"
  audio_state: STATE_AUDIO_LITERAL = "on"
  text_state: STATE_TEXT_LITERAL = "md"

  # Usage accumulated through extract_inf().
  used_tokens = 0
  total_cost = 0

  def __new__(cls, *args, **kwargs):
    """Return the single shared instance, creating it on first use."""
    if cls.__instance is None:
      # object.__new__ raises TypeError when given extra arguments, so the
      # accepted *args/**kwargs are deliberately not forwarded.
      cls.__instance = super().__new__(cls)
    return cls.__instance

  def extract_inf(self, openai_cb):
    """Add the tokens and cost recorded by ``openai_cb`` to the running totals.

    Args:
      openai_cb: Object exposing ``total_tokens`` and ``total_cost``.
    """
    # NOTE(review): the original author tagged this method and its call in
    # run() with an unexplained "BUG" marker — confirm what was meant.
    self.used_tokens += openai_cb.total_tokens
    self.total_cost += openai_cb.total_cost

  def show(self, text):
    """Display ``text`` as Markdown or plain text and, if audio is on, read it aloud."""
    if self.text_state == "md":
      print_md(text)
    else:
      print(text)

    if self.audio_state == "on":
      generate_and_play(text)

  def run(self, run_func, text):
    """Call ``run_func(text)`` while tracking OpenAI usage, then show the response.

    Args:
      run_func: Callable taking the user text and returning the response.
      text: User input passed to ``run_func``.
    """
    with get_openai_callback() as cb:
      try:
        response = run_func(text)
      finally:
        # Record usage even when the call fails part-way through.
        self.extract_inf(cb)

    self.show(response)

chat_config = ChatConfigurationSingleton()


@tool(return_direct=True)
def set_use_case_toool(state: USE_CASE_LITERAL) -> str:
  """Useful at the beginning, when user knows what he wants, or to go back to the menu.
    @param state: The use case to switch to.
      "menu" - back to menu
      "compendium" - university courses content scraper and personal tutor
      "img2pdf" - converter for images to pdf
  """
  # The docstring above is what the agent reads as the tool description, so it
  # names the real parameter (`state`) and lists every option in the same form.
  chat_config.use_case = state
  return f"Switched to {state}"

@tool(return_direct=True)
def set_audio_tool(state: STATE_AUDIO_LITERAL) -> str:
  """Useful when user want or don't want speaking responses"""
  # Store the new audio setting on the shared configuration, then confirm it.
  chat_config.audio_state = state
  confirmation = f"From now audio responses are turned {state}"
  return confirmation

@tool(return_direct=True)
def set_markdown_tool(state: STATE_TEXT_LITERAL) -> str:
  """Turn off or on markdown formatted responses"""
  # Store the new text format on the shared configuration, then confirm it.
  chat_config.text_state = state
  confirmation = f"From now messages are displayed using {state}"
  return confirmation


## **Compendium**

In [8]:
# Known literature per course: course name -> list of (language code, PDF path) pairs.
# NOTE(review): the course dropdown in the conversation cell offers
# "Analiza matematyczna" (capital A) and "-", neither of which is a key here.
courses_recources = {
    "Fizyka I": [
        ("pl", "/content/GFG-15.pdf"),
    ],
    # Nothing registered yet; kept as a list so every course has the same shape.
    "analiza matematyczna": [],
}

In [9]:
#@title Resource fetcher
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain import PromptTemplate, LLMChain
import json

def getLanguage(text):
  """Ask an OpenAI completion model which language ``text`` is written in.

  Args:
    text: Sample of the text to classify; it is inserted into the prompt.

  Returns:
    The model's answer with newlines, colons and surrounding whitespace removed.
  """
  llm = OpenAI(temperature=0)

  prompt = PromptTemplate(
      input_variables=["text"],
      template="What is a language of that text: {text}? Answer in one word",
  )

  # Use the temperature-0 model configured above; the original built it and
  # then handed a fresh default-configured OpenAI() to the chain instead.
  llm_chain = LLMChain(prompt=prompt, llm=llm)
  answer = llm_chain.run(text)
  return answer.replace("\n", "").replace(":", "").strip()


def addDataSource(name_of_course: str, pdf_path: str):
  """Register a PDF as literature for a course in ``/content/embeddings.json``.

  The PDF is loaded and split only to detect its language from the first
  chunk. The registry maps each course name to a list of
  ``[language, pdf_path]`` entries.

  Args:
    name_of_course: Course the PDF belongs to.
    pdf_path: Path of the PDF to register.
  """
  documents = UnstructuredPDFLoader(pdf_path).load()

  text_splitter = RecursiveCharacterTextSplitter(chunk_size=4000, chunk_overlap=0)
  chunks = text_splitter.split_documents(documents)

  # Pass the chunk's text, not the Document object: formatting the object
  # into the prompt would include its repr (metadata and all).
  data_source = (getLanguage(chunks[0].page_content), pdf_path)

  registry_path = "/content/embeddings.json"
  try:
    with open(registry_path, "r") as file:
      registry = json.load(file)
  except FileNotFoundError:
    # First registration: start with an empty registry.
    registry = {}

  registry.setdefault(name_of_course, []).append(data_source)

  with open(registry_path, "w") as outfile:
    json.dump(registry, outfile)
    


In [20]:
#@title helpers { display-mode: "form" }
from langchain import PromptTemplate, LLMChain
from langchain.document_loaders import PyPDFLoader
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter

def get_questions(zakres, lang):
  match lang:
    case "pl" : template_questions = "Zadaj pytania dotyczace badz zwiazane z tym zagadnieniem: {zakres}, wylistuj je i zakoncz dzialanie. Pytania:"
    case "eng" : template_questions = "Ask question about or related to topic {zakres}. List them and end task. Questions:" # BUG

  llm_questions = ChatOpenAI(temperature=0.8, max_tokens=1000)
  prompt_questions = PromptTemplate(input_variables=["zakres"], template=template_questions)
  chain_questions = LLMChain(llm=llm_questions,prompt=prompt_questions)
  return chain_questions({"zakres":zakres}, return_only_outputs=True)['text'].split("\n")[:5] # -2


def get_resources(nazwa_kursu):
  """Look up the first registered resource of a course and announce it.

  Args:
    nazwa_kursu: Course name used as a key into ``courses_recources``.

  Returns:
    ``(lang, url)`` of the course's first resource.
  """
  first_resource = courses_recources[nazwa_kursu][0]
  lang, url = first_resource

  # Announce the chosen file in the resource's language; other codes stay silent.
  if lang == "pl":
    print_md(f"Użyję {url}")
  elif lang == "eng":
    print_md(f"I will use {url}")

  return lang, url


def get_documents(url):
  """Load the PDF at ``url`` and return it as small chunks.

  The loader's own splitting runs first; its result is then re-split into
  pieces of at most 1000 characters with an overlap of 10.
  """
  pdf_pages = PyPDFLoader(url).load_and_split()
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
  chunks = splitter.split_documents(pdf_pages)
  return chunks

def get_QA(summary, questions_list):
  """Generate a multiple-choice test from a summary and a list of questions.

  Args:
    summary: Text the test should be based on.
    questions_list: Questions fed into the prompt's ``QA`` slot.

  Returns:
    The test text produced by the chat model.
  """
  quiz_prompt = PromptTemplate(
      input_variables = ["summary", "QA"],
      template = """Create a test to help memorize and practice the material from {summary} and {QA}. There should be multiple questions
      The format should be:
      [QUESTION]?
      a) answer
      b) answer
      c) answer
      d) answer

      Odpowiedź:
      """,
  )
  chat_model = ChatOpenAI(temperature=0.8,max_tokens=1000, model_name="gpt-3.5-turbo")

  # The chain publishes its output under the "test" key.
  quiz_chain = LLMChain(llm=chat_model, prompt=quiz_prompt, output_key="test")
  chain_inputs = {"summary":summary,"QA":questions_list}
  result = quiz_chain(chain_inputs)
  return result["test"]


def get_notes(summary, QA):
  """Merge a summary and a knowledge test into one study note.

  Args:
    summary: Summary text for the prompt's ``sumText`` slot.
    QA: Test text for the prompt's ``QA`` slot.

  Returns:
    The note produced by the chat model.
  """
  # gpt-3.5-turbo is a chat model, so use the chat wrapper (as get_QA does)
  # rather than the completion-style OpenAI class.
  llm = ChatOpenAI(temperature=0.8, max_tokens=1000, model_name="gpt-3.5-turbo")
  promptNote = PromptTemplate(
      input_variables = ["sumText","QA" ],
      template = """Connect the information from {sumText} and {QA} into a big note from which students can learn and remove breaklines.""",
  )

  # The chain publishes its output under the "summary" key.
  noteChain = LLMChain(llm=llm, prompt=promptNote, output_key="summary")
  return noteChain({"sumText": summary, "QA": QA})["summary"]

In [11]:
#@title Run Compendium
from langchain.llms import OpenAI
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain import VectorDBQA
from langchain.chains import RetrievalQA

def ask(msg, content):
  """Prompt the user with ``msg`` and report whether the answer is affirmative.

  Args:
    msg: Prompt shown to the user.
    content: Unused; kept so existing calls keep working.

  Returns:
    True when the answer is "yes", "tak", "pewnie" or "sure", ignoring case
    and surrounding whitespace; False otherwise.
  """
  answer = input(msg).strip().lower()
  return answer in ("yes", "tak", "pewnie", "sure")
    
def run_compendium(nazwa_kursu, zakres, url, lang):
  """Run one Compendium session: questions, answers, optional test, notes, download.

  Args:
    nazwa_kursu: Course name; only used to look up literature when ``url`` or
      ``lang`` is empty.
    zakres: Topic to study.
    url: Location of the PDF to read; looked up from the course when empty.
    lang: Language code of the literature ("pl" or "eng"); looked up from the
      course when empty.
  """
  # Stays empty when the user skips the knowledge test, so requesting notes
  # afterwards no longer fails on an unbound name.
  QA = ""

  with get_openai_callback() as cb:
    try:
      if not (url and lang):
        lang, url = get_resources(nazwa_kursu)
      questions_list = get_questions(zakres, lang)

      for question in questions_list:
        chat_config.show(question)

      document = get_documents(url)

      embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
      docsearch = Chroma.from_documents(document, embeddings)
      chain = RetrievalQA.from_chain_type(
          llm=ChatOpenAI(temperature=0.7, max_tokens=1000),
          chain_type='map_reduce',
          retriever=docsearch.as_retriever(),
          return_source_documents=True,
      )

      answers = [
          chain({'query': f'{question} Rozwin swoja wypowiedz o jak najwiecej szczegolow'}, return_only_outputs=True)['result']
          for question in questions_list
      ]
      # Separate the answers; plain concatenation glued the last sentence of
      # one answer onto the first sentence of the next.
      summary = "\n\n".join(answers)

      content = f"## Podsumowanie:\n{summary}\n"
      chat_config.show(summary)

      # QA Maker
      if ask("Chcesz sprawdzić swoją wiedzę?:", content):
        QA = get_QA(summary, questions_list)
        content += f"\n## Test wiedzy:\n{QA}\n"
        chat_config.show(QA)

      # Notes Maker
      if ask("Chcesz notatki?:", content):
        notes = get_notes(summary, QA)
        content += f"\n## Notatki:\n{notes}\n"
        chat_config.show(notes)
    finally:
      # Record this run's usage before it is reported below, and also when
      # the session aborts with an exception.
      chat_config.extract_inf(cb)

  content += f"\n\n*Used {chat_config.used_tokens} tokens, what cost: ${chat_config.total_cost}*"
  # Downloader
  if ask("Chcesz pobrać:", content):
    download_conversation(content)


## **Img2PDF**

In [12]:
import pytesseract
from fpdf import FPDF
from pdf2image import convert_from_path

def delete_special_characters(word):
  """Return True when every character of ``word`` has a code point of at most 127.

  Despite its name this is a predicate: nothing is removed from ``word``.
  An empty ``word`` yields True.
  """
  return all(ord(character) <= 127 for character in word)
    
    
def convert_images_to_searchable_pdf(pdf_path: str, return_path: str) -> None:
    """OCR every page of a PDF (Polish model) and write the text to a new PDF.

    Args:
        pdf_path: PDF whose pages are rendered to images and OCR-ed.
        return_path: Where the resulting text PDF is written.

    The output holds ASCII text only: Polish letters are replaced with their
    base letters and any other non-ASCII character is removed.
    """
    # Polish letters mapped to their closest ASCII letters.
    ascii_fallback = str.maketrans("ąćęłńóśźżĄĆĘŁŃÓŚŹŻ", "acelnoszzACELNOSZZ")

    images = convert_from_path(pdf_path)
    pdf = FPDF()
    for image in images:
        # Extract text from the page image
        text = pytesseract.image_to_string(image, lang="pol")

        # One output page per input page
        pdf.add_page()
        pdf.set_font("Arial", size=15)

        for line in text.split('\n'):
            # The original skipped every line containing a non-ASCII
            # character, which discards most Polish text. Keep the line and
            # reduce it to ASCII instead.
            # NOTE(review): presumably ASCII-only because the built-in font
            # cannot encode Polish letters — confirm.
            cleaned = line.translate(ascii_fallback)
            cleaned = cleaned.encode("ascii", "ignore").decode("ascii")
            if line.strip() and not cleaned.strip():
                # Nothing printable survived; skip it as before.
                continue
            pdf.cell(200, 10, txt=cleaned, ln=1, align='C')

    # Save once, after all pages have been added
    pdf.output(return_path)
       

# Colab form fields holding the default input and output paths for the converter.
pdf_path = '/content/530_Krotki_wyk_ad_z_fizyki_ogolnej.pdf' #@param {type:"string"}
return_path = '/content/GFG.pdf' #@param {type:"string"}
# Example call, left disabled; the function takes both paths:
# convert_images_to_searchable_pdf(pdf_path, return_path)

# **Demo**

In [33]:
#@title Initialize Agent
from langchain.agents import AgentType, initialize_agent
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory

# Conversation memory handed to the agent, stored under the "chat_history" key.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

# Chat model at temperature 0 plus the three settings tools defined earlier.
chat_llm = ChatOpenAI(temperature=0)
tools = [
    set_use_case_toool,
    set_audio_tool,
    set_markdown_tool,
]
agent = initialize_agent(
    tools,
    chat_llm,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    memory=memory,
)

# Show the English menu and record it in memory as the assistant's first message.
print_md(MENU_VIEW_ENG)
memory.chat_memory.add_ai_message(MENU_VIEW_ENG)


# Welcome!
## Settings:
 - audio (**on**/off)
 - text formatting (**.md**/.txt)
 - language

## Avaliable programs:
 - compendium - university courses content scraper and personal tutor
 - img2pdf - converter for images to pdfd


In [34]:
#@title Conversation

# Dispatch on the use case currently stored in chat_config.
match chat_config.use_case:
  case "menu":
    #@markdown **Menu**
    User_input = "switch to compendium" #@param {type:"string"}
    # The agent interprets the request; usage is tracked and the reply shown by chat_config.run.
    chat_config.run(agent.run, User_input)

  case "compendium":
    #@markdown **Compendium:**
    nazwa_kursu = "-" #@param ["Fizyka I", "Analiza matematyczna", "-"]
    zakres = "Boundless Distributed Alignment Search method and its effectiveness" #@param {type:"string"}
    literatura = "https://arxiv.org/pdf/2305.08809.pdf" #@param {type:"string"}
    jezyk_literatury = "eng" #@param {type:"string"}

    # The course name is only consulted when literature or its language is left empty.
    run_compendium(nazwa_kursu, zakres, literatura, jezyk_literatury)
    chat_config.use_case = "menu" # back to the menu

  case "img2pdf":
    #@markdown **ImgPDF:**
    pdf_path = '' #@param {type:"string"}
    return_path = '' #@param {type:"string"}

    convert_images_to_searchable_pdf(pdf_path, return_path)
    chat_config.use_case = "menu" # back to the menu



1. What is the Boundless Distributed Alignment Search method and how does it work?

2. How does the Boundless Distributed Alignment Search method compare to other existing alignment search methods in terms of effectiveness and efficiency?

3. What are the advantages and disadvantages of using the Boundless Distributed Alignment Search method?

4. Can the Boundless Distributed Alignment Search method be used for large-scale genome sequencing projects? 

5. How does the Boundless Distributed Alignment Search method handle missing or incomplete data?



The Boundless Distributed Alignment Search (DAS) is a method used to align interpretable causal variables of a model C and fixed dimensionality linear subspaces of neural representations in a network N using gradient descent. The method works by rotating the neural representations in the network N with an orthogonal matrix such that they align with the interpretable causal variables of the model C. This alignment is learned using gradient descent. By aligning the neural representations with the model variables, DAS allows for a better understanding of the relationship between the model and the neural network. 

DAS is a powerful method for aligning neural network representations with interpretable causal variables, allowing for better understanding and interpretation of the model. It is a technique used to scale the alignment search of causal structure in large language models (LLMs) with billions of parameters. It is a novel and effective method that can identify the ideal set of hypotheses and search optimally for causal structure. The method works by utilizing a distributed architecture that allows for parallel processing across multiple machines, enabling efficient and scalable alignment search. It can find causal structure even when the model's task performance is low, due to factors such as suboptimal generation methods or rigid assessment metrics.

In the field of natural language processing (NLP), DAS is used to align word embeddings with high-level concepts. Unlike previous methods that assumed a one-to-one mapping between a group of neurons and a high-level concept, Boundless DAS focuses on distributed alignment search, allowing multiple concepts to be aligned with the same group of neurons or vice versa. What sets Boundless DAS apart is its ability to be adapted into a head-wise alignment search, where a shared rotation matrix is added on top of the head representations of all the tokens. This method is highly effective in aligning word embeddings with high-level concepts and has been implemented in various NLP models.

In summary, DAS is a method for aligning neural network representations with interpretable causal variables, allowing for better understanding and interpretation of the model. It can be used to scale the alignment search of causal structure in large language models with billions of parameters and align word embeddings with high-level concepts. The Boundless DAS method represents an important step forward in the development of interpretable AI methods, which are critical for ensuring AI safety and advancing our understanding of complex models.Unfortunately, the given portion of the document does not provide any explicit information about how the Boundless Distributed Alignment Search method compares to other existing alignment search methods in terms of effectiveness and efficiency. The text only mentions that Boundless DAS is a novel method for scaling alignment search of causal structure in large-scale learning models. It is capable of finding structure even when the model's task performance is low, and it can adapt to a head-wise alignment search. 

However, the text does not provide any specific information on the efficiency of Boundless DAS in comparison to other methods. It only highlights its unique features and the fact that it can find interpretable causal structure in the Alpaca model. It is possible that further information about the comparison of Boundless DAS with other methods is provided in other sections of the document or in external sources. 

In summary, based on the given portion of the document, it is not possible to provide a detailed answer about how the Boundless Distributed Alignment Search method compares to other existing alignment search methods in terms of effectiveness and efficiency.According to the extracted parts of the document, the Boundless Distributed Alignment Search (DAS) method has several advantages. One of the most significant advantages is its effectiveness in scaling alignment search of causal structure in Large Language Models (LLMs) to billions of parameters. This is a useful feature, as it allows the method to find causal structure even in cases where the model's task performance is low, making it possible to identify suboptimal generation methods or rigid assessment metrics. 

Another advantage of the Boundless DAS method is that it can be used to identify ideal sets of hypotheses and search optimally. This is because it breaks the assumption of localist representation, which is often too ideal in practice. Additionally, it specifically focuses on distributed alignment search, making it a powerful tool for analyzing LLMs and identifying causal structures.

However, there are also some potential disadvantages to using the Boundless DAS method. For example, it may only find partial support for causal structure, as seen in the results of Geiger et al. Furthermore, more research needs to be done to model errors of the language model in more detail to tighten the connection between task performance and the identification of causal structure.

In summary, the Boundless DAS method has several advantages, including its scalability and ability to identify ideal sets of hypotheses and search optimally. However, it also has some potential disadvantages, such as only finding partial support for causal structure and requiring more research to model errors of the language model in more detail.Unfortunately, the given portion of the document does not provide any information about whether the Boundless Distributed Alignment Search method can be used for large-scale genome sequencing projects or not. The text only focuses on the effectiveness of this method in scaling alignment search of causal structure in language models to billions of parameters and its potential in improving interpretability tools developed for language models. It also outlines the ability of Boundless DAS to find structure even when the model's task performance is low.

Therefore, it is not possible to provide a definitive answer to this question based on the given text. There may be other parts of the document or other sources of information that could shed light on the applicability of this method to genome sequencing projects, but they are not included in the given portion of the text.Unfortunately, based on the provided portions of the document, it is not possible to determine how the Boundless Distributed Alignment Search method handles missing or incomplete data. The given sections only explain the general approach and effectiveness of the method in scaling alignment search and finding structure in language models. However, it is mentioned that future work needs to model errors of the language model in more detail, which could potentially include addressing missing or incomplete data. More information or context would be needed to provide a comprehensive answer to this question.

Chcesz sprawdzić swoją wiedzę?:sure


1. What is the Boundless Distributed Alignment Search method and how does it work?
a) The Boundless Distributed Alignment Search method is a technique used to align interpretable causal variables of a model C and fixed dimensionality linear subspaces of neural representations in a network N using gradient descent. It works by rotating neural representations in the network N with an orthogonal matrix to align with the interpretable causal variables of the model C.

2. How does the Boundless Distributed Alignment Search method compare to other existing alignment search methods in terms of effectiveness and efficiency?
a) Based on the given portion of the document, it is not possible to provide a detailed answer about how the Boundless Distributed Alignment Search method compares to other existing alignment search methods in terms of effectiveness and efficiency.

3. What are the advantages and disadvantages of using the Boundless Distributed Alignment Search method?
a) The advantages of using the Boundless Distributed Alignment Search method include its scalability, effectiveness in scaling alignment search of causal structure in Large Language Models (LLMs) to billions of parameters, and ability to identify ideal sets of hypotheses and search optimally. The disadvantages include only finding partial support for causal structure and requiring further research to model errors of the language model in more detail.
 
4. Can the Boundless Distributed Alignment Search method be used for large-scale genome sequencing projects?
a) Unfortunately, the given portion of the document does not provide any information about whether the Boundless Distributed Alignment Search method can be used for large-scale genome sequencing projects or not.

5. How does the Boundless Distributed Alignment Search method handle missing or incomplete data?
a) Based on the provided portions of the document, it is not possible to determine how the Boundless Distributed Alignment Search method handles missing or incomplete data. More information or context would be needed to provide a comprehensive answer to this question.

Chcesz notatki?:yes




The Boundless Distributed Alignment Search (DAS) method is a powerful tool for aligning neural network representations with interpretable causal variables, allowing for better understanding and interpretation of the model. This technique works by rotating the neural representations in a network N with an orthogonal matrix to align with the interpretable causal variables of the model C using gradient descent.

While it is not possible to determine how the Boundless DAS method compares to other existing alignment search methods in terms of effectiveness and efficiency based on the given portion of the document, it has several advantages and disadvantages. The Boundless DAS method can scale alignment search of causal structure in Large Language Models (LLMs) to billions of parameters, identify ideal sets of hypotheses, and search optimally. However, it may only find partial support for causal structure and requires further research to model errors of the language model in more detail.

Furthermore, it is not possible to determine whether the Boundless Distributed Alignment Search method can be used for large-scale genome sequencing projects or how it handles missing or incomplete data based on the given portion of the document. However, the Boundless DAS method has been successfully used to align word embeddings with high-level concepts in Natural Language Processing (NLP) models.

In summary, the Boundless Distributed Alignment Search method is a useful tool for aligning neural network representations with interpretable causal variables, allowing for better understanding and interpretation of the model. However, more information or context may be needed to provide a comprehensive answer to specific questions about its comparison to other methods, applicability to genome sequencing projects, and handling of missing or incomplete data.

Chcesz pobrać:yes


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
# @title OpenAI api usage
# Totals accumulated on chat_config through extract_inf(); shown as the cell's result.
f"Used {chat_config.used_tokens} tokens, what cost: ${chat_config.total_cost}"

'Used 14997 tokens, what cost: $0.029994'