# **LLMs in 4 acts with Luis Roque**

# **Act 0:** Dependencies and environment setup (boring)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install spotipy
!pip install langchain
!pip install openai
!pip install transformers
!pip install sentence-transformers
!pip install faiss-cpu
!pip install unstructured
!pip install chromadb
!pip install datasets

Collecting spotipy
  Downloading spotipy-2.23.0-py3-none-any.whl (29 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.0.1-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.3/250.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: redis, spotipy
Successfully installed redis-5.0.1 spotipy-2.23.0
Collecting langchain
  Downloading langchain-0.0.340-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.2-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.0.66-py3-none-any.whl (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.8/46.8 kB[0m [31

In [3]:
import os
import json
import time
from typing import Dict, List, Any, Tuple
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains.question_answering import load_qa_chain
from transformers import GPT2Tokenizer
import openai
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pandas as pd
import random
import pickle
import os
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
from datasets import load_dataset

random.seed(42)

In [4]:
work_path = "/content/drive/MyDrive/Speaker/DSPT23_LLMsUnfolded"
env_file_path = os.path.join(work_path, '.env/env_variables.pkl')

def load_secrets(file_path):
    """
    Load secrets from a pickle file.

    Args:
        file_path (str): Path to the pickle file from which secrets will be loaded.

    Returns:
        dict: A dictionary containing the loaded secrets.
    """
    if os.path.exists(file_path):
        with open(file_path, 'rb') as file:
            return pickle.load(file)
    return {}

# Example secrets
secrets = {
    'client_id': 'a088f7',
    'client_secret': '193b350fd',
    'OPENAI_API_KEY': 's8oY4QHhhW',
    'HF_TOKEN': 'hCGdYekDcZLJi'
}

secrets = load_secrets(env_file_path)

# **Act 1:** Summarization, AI features and ChatGPT Wrappers

![ChatGPT Wrapper Image](https://drive.google.com/uc?export=view&id=1Gf9mlEOfcjwR2dwBgMTWHt7AmsMDCYH0)

In [5]:
credentials_manager = SpotifyClientCredentials(client_id=secrets['client_id'], client_secret=secrets['client_secret'])
sp = spotipy.Spotify(client_credentials_manager=credentials_manager)


def get_new_releases(limit: int = 50, offset: int = 0) -> List[Dict[str, Any]]:
    """
    Fetch new releases from Spotify.

    Args:
        limit (int, optional): Maximum number of album results to return. Defaults to 50.
        offset (int, optional): The index of the first result to return. Defaults to 0.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing album information.
    """
    new_releases = sp.new_releases(limit=limit, offset=offset)
    albums = new_releases["albums"]["items"]
    return albums


def get_album_tracks(album_id: str) -> List[Dict[str, Any]]:
    """
    Fetch tracks from a specific album.

    Args:
        album_id (str): The Spotify ID of the album.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing track information.
    """
    tracks = sp.album_tracks(album_id)["items"]
    return tracks


def save_data_to_file(data: List[Dict[str, Any]], file_path: str) -> None:
    """
    Save data to a JSON file.

    Args:
        data (List[Dict[str, Any]]): List of dictionaries containing album and track information.
        file_path (str): Path to the JSON file where the data will be saved.
    """
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=4)


def load_data_from_file(file_path: str) -> List[Dict[str, Any]]:
    """
    Load data from a JSON file.

    Args:
        file_path (str): Path to the JSON file where the data is stored.

    Returns:
        List[Dict[str, Any]]: List of dictionaries containing album and track information.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)


def download_latest_albums_data() -> None:
    """
    Download the latest albums and tracks data from Spotify and save it to a JSON file.
    """
    limit = 50
    offset = 0
    total_albums = 20
    album_count = 0

    all_albums = []

    while total_albums is None or album_count < total_albums:
        new_releases = get_new_releases(limit, offset)
        if total_albums is None:
            total_albums = sp.new_releases()["albums"]["total"]

        for album in new_releases:
            album_info = {
                "album_name": album["name"],
                "artist_name": album["artists"][0]["name"],
                "album_type": album["album_type"],
                "release_date": album["release_date"],
                "tracks": [],
            }

            tracks = get_album_tracks(album["id"])

            for track in tracks:
                track_info = {
                    "track_name": track["name"],
                    "duration_ms": track["duration_ms"],
                }
                album_info["tracks"].append(track_info)

            all_albums.append(album_info)
            album_count += 1

        offset += limit
        time.sleep(1)  # Add a delay to avoid hitting the rate limit
        print(f"Downloaded {album_count}/{total_albums}")

    save_data_to_file(all_albums, "albums_and_tracks.json")


In [6]:
data = load_data_from_file(os.path.join(work_path, 'albums_and_tracks.json'))

In [7]:
data[:2]

[{'album_name': 'Welcome 2 Collegrove',
  'artist_name': '2 Chainz',
  'album_type': 'album',
  'release_date': '2023-11-17',
  'tracks': [{'track_name': 'Scene 1: Welcome 2 Collegrove',
    'duration_ms': 49154},
   {'track_name': 'G6', 'duration_ms': 184572},
   {'track_name': 'Big Diamonds (feat. 21 Savage)', 'duration_ms': 199597},
   {'track_name': 'Presha', 'duration_ms': 185058},
   {'track_name': 'Long Story Short', 'duration_ms': 210338},
   {'track_name': 'Scene 2: Duffle Bag Boys', 'duration_ms': 29250},
   {'track_name': 'Millions From Now', 'duration_ms': 139797},
   {'track_name': 'Crazy Thick', 'duration_ms': 176938},
   {'track_name': 'Transparency (feat. USHER)', 'duration_ms': 195094},
   {'track_name': 'Significant Other', 'duration_ms': 263668},
   {'track_name': 'Scene 3: Ladies Man', 'duration_ms': 31541},
   {'track_name': 'P.P.A. (feat. Fabolous)', 'duration_ms': 220586},
   {'track_name': 'Oprah & Gayle (feat. Benny The Butcher)',
    'duration_ms': 313397},
  

In [8]:
def preprocess_docs(data: List[Dict[str, Any]]) -> str:
    """
    Convert the JSON data to a list of Document objects.

    Args:
        data (List[Dict[str, Any]]): List of dictionaries containing album and track information.

    Returns:
        List[Document]: A list of Document objects containing the JSON data as strings, split into 3000-character segments.
    """
    json_string = json.dumps(data, ensure_ascii=False, indent=0)
    return json_string

In [9]:
doc = preprocess_docs(data)
doc

'[\n{\n"album_name": "Welcome 2 Collegrove",\n"artist_name": "2 Chainz",\n"album_type": "album",\n"release_date": "2023-11-17",\n"tracks": [\n{\n"track_name": "Scene 1: Welcome 2 Collegrove",\n"duration_ms": 49154\n},\n{\n"track_name": "G6",\n"duration_ms": 184572\n},\n{\n"track_name": "Big Diamonds (feat. 21 Savage)",\n"duration_ms": 199597\n},\n{\n"track_name": "Presha",\n"duration_ms": 185058\n},\n{\n"track_name": "Long Story Short",\n"duration_ms": 210338\n},\n{\n"track_name": "Scene 2: Duffle Bag Boys",\n"duration_ms": 29250\n},\n{\n"track_name": "Millions From Now",\n"duration_ms": 139797\n},\n{\n"track_name": "Crazy Thick",\n"duration_ms": 176938\n},\n{\n"track_name": "Transparency (feat. USHER)",\n"duration_ms": 195094\n},\n{\n"track_name": "Significant Other",\n"duration_ms": 263668\n},\n{\n"track_name": "Scene 3: Ladies Man",\n"duration_ms": 31541\n},\n{\n"track_name": "P.P.A. (feat. Fabolous)",\n"duration_ms": 220586\n},\n{\n"track_name": "Oprah & Gayle (feat. Benny The Butc

In [10]:
# get the number of tokens

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokens = tokenizer.tokenize(doc)
num_tokens = len(tokens)

print(f"Number of chars: {len(doc)}")
print(f"Number of tokens: {num_tokens}")
print(f"Number of chars/token: {len(doc)/num_tokens}")
print(f"Tokens: {tokens}")

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Number of chars: 34279
Number of tokens: 13967
Number of chars/token: 2.454285100594258
Tokens: ['[', 'Ċ', '{', 'Ċ', '"', 'album', '_', 'name', '":', 'Ġ"', 'Welcome', 'Ġ2', 'ĠColleg', 'ro', 've', '",', 'Ċ', '"', 'artist', '_', 'name', '":', 'Ġ"', '2', 'ĠChain', 'z', '",', 'Ċ', '"', 'album', '_', 'type', '":', 'Ġ"', 'album', '",', 'Ċ', '"', 'release', '_', 'date', '":', 'Ġ"', '20', '23', '-', '11', '-', '17', '",', 'Ċ', '"', 'tracks', '":', 'Ġ[', 'Ċ', '{', 'Ċ', '"', 'track', '_', 'name', '":', 'Ġ"', 'Scene', 'Ġ1', ':', 'ĠWelcome', 'Ġ2', 'ĠColleg', 'ro', 've', '",', 'Ċ', '"', 'duration', '_', 'ms', '":', 'Ġ49', '154', 'Ċ', '},', 'Ċ', '{', 'Ċ', '"', 'track', '_', 'name', '":', 'Ġ"', 'G', '6', '",', 'Ċ', '"', 'duration', '_', 'ms', '":', 'Ġ18', '45', '72', 'Ċ', '},', 'Ċ', '{', 'Ċ', '"', 'track', '_', 'name', '":', 'Ġ"', 'Big', 'ĠDiamond', 's', 'Ġ(', 'feat', '.', 'Ġ21', 'ĠSavage', ')",', 'Ċ', '"', 'duration', '_', 'ms', '":', 'Ġ199', '597', 'Ċ', '},', 'Ċ', '{', 'Ċ', '"', 'track', '_', 'name

The tokens you're seeing may seem "weird" or unexpected if you're not familiar with how subword tokenization works, especially in models like GPT-2, GPT-3, and GPT-4. Here's an explanation of what's happening:
Subword Tokenization

    Subword Units: Words are broken into smaller parts, allowing the model to efficiently handle a wide vocabulary, including rare or new words.
    Special Tokens: The GPT tokenizer uses special characters (like Ġ) to denote spaces or other elements. For instance, Ġ indicates a space before a word.

Examples from Your Output

In our output, words like 'album', 'name', 'Love' are whole words, while characters like '[', '{', '_'` are tokenized separately as punctuation or special characters.

Why Use Subword Tokenization?

    Efficiency: It allows the model to efficiently process a wide range of words, including those not seen during training.
    Flexibility: It can handle different languages and novel word combinations.
    Reduced Vocabulary Size: Instead of needing a separate token for every possible word, the tokenizer can combine tokens to form new words, which is more memory-efficient.

In [11]:
file_name = "summary_spotify.txt"

def get_summary(json_data, file_path):
    """
    Generate a summary using the JSON data provided, or retrieve it from a file if already generated.

    Args:
        json_data (str): A string containing the JSON data about the latest releases on Spotify.
        file_path (str): Path to the file where the summary is stored.

    Returns:
        str: The generated summary.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r') as file:
            return file.read()

    client = openai.OpenAI(api_key=secrets['OPENAI_API_KEY'])
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": f"Write a summary about the latest releases in Spotify based on the JSON data below: \n\n{json_data}"
            }
        ],
        model="gpt-3.5-turbo-1106",
    )

    summary = chat_completion.choices[0].message.content

    with open(file_path, 'w') as file:
        file.write(summary)

    return summary

In [12]:
summary = get_summary(doc, os.path.join(work_path, file_name))

In [13]:
summary

'The latest releases in Spotify include a wide variety of albums and singles from artists such as Drake, 2 Chainz, Tate McRae, Ozuna, André 3000, Olivia Rodrigo, Ari Lennox, Troye Sivan, Lil Durk, Dolly Parton, Wyatt Flores, Arcángel, Daddy Yankee, John Vincent III, Asake, ENHYPEN, Eliza Rose, Yahritza Y Su Esencia, Queen Naija, Shygirl, Dua Lipa, Daft Punk, Timbaland, Laufey, Natalie Jane, Steve Aoki, NCT DREAM, Teni, Brittany Howard, Danny Brown, Frost Children, Lauren Watkins, Tom Odell, The Smile, Leslie Odom Jr., BabyDrill, Julia Michaels, and others. The albums and singles cover various genres and offer a wide range of musical experiences for listeners to enjoy.\n'

# **Act 2:** Semantic Search is the only way to search

![Semantic Search](https://drive.google.com/uc?export=view&id=1p_Xyv7Dp_zAsMX8cyHm1for48VmQedcw)

![Index Partitioning](https://drive.google.com/uc?export=view&id=1zIlxncONNe5mdTMnkDYmw7-tc8RugyWF)

![Index Partitioning](https://drive.google.com/uc?export=view&id=1SHOnKS-paQVz9Bfavc1sUIQM1XqiSvsT)

In [14]:
class ScalableSemanticSearch:
    """A class for vector similarity using product quantization with sentence transformer embeddings and cosine similarity."""

    def __init__(self, device="cpu"):
        """
        Initializes the search model with the specified device.

        Args:
            device (str): The device (e.g., 'cpu' or 'gpu') for running the model.
        """
        self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)
        self.dimension = self.model.get_sentence_embedding_dimension()
        self.index = None

    def encode(self, data: List[str]) -> np.ndarray:
        """
        Encodes a list of sentences into embeddings using the sentence transformer model.

        Args:
            data (List[str]): A list of sentences to be encoded.

        Returns:
            np.ndarray: An array of sentence embeddings.
        """
        embeddings = self.model.encode(data).astype("float32")
        return embeddings

    def store_embeddings(self, embeddings: np.ndarray, file_path: str) -> None:
        """
        Stores embeddings in a file.

        Args:
            embeddings (np.ndarray): The embeddings to store.
            file_path (str): Path to the file where embeddings will be stored.
        """
        with open(file_path, 'wb') as file:
            pickle.dump(embeddings, file)

    def load_embeddings(self, file_path: str) -> np.ndarray:
        """
        Loads embeddings from a file.

        Args:
            file_path (str): Path to the file from which embeddings will be loaded.

        Returns:
            np.ndarray: Loaded embeddings.
        """
        with open(file_path, 'rb') as file:
            return pickle.load(file)

    def get_or_compute_embeddings(self, data: List[str], file_path: str) -> np.ndarray:
        """
        Gets embeddings from a file if available, otherwise computes and stores them.

        Args:
            data (List[str]): Data to encode if embeddings are not already stored.
            file_path (str): File path to store or load the embeddings.

        Returns:
            np.ndarray: The embeddings.
        """
        if os.path.exists(file_path):
            return self.load_embeddings(file_path)

        embeddings = self.encode(data)
        self.store_embeddings(embeddings, file_path)
        return embeddings

    def build_index(self, embeddings: np.ndarray) -> None:
        """
        Builds a FAISS index for efficient similarity search with the given embeddings.

        Args:
            embeddings (np.ndarray): An array of sentence embeddings.
        """
        n_data_points = len(embeddings)
        if n_data_points < 1500:
            self.index = faiss.IndexFlatL2(self.dimension)
            print("Using Flat L2 Index for similarity search.")
        else:
            self.index = self._create_ivfpq_index(embeddings, n_data_points)
            print("Using IVF PQ Index for similarity search.")

        if isinstance(self.index, faiss.IndexIVFPQ):
            self.index.train(embeddings)
        self.index.add(embeddings)

    def _create_ivfpq_index(self, embeddings: np.ndarray, n_data_points: int) -> faiss.IndexIVFPQ:
        """
        Creates an IVF PQ index for efficient similarity search in large datasets.

        Args:
            embeddings (np.ndarray): An array of sentence embeddings.
            n_data_points (int): The number of data points (embeddings).

        Returns:
            faiss.IndexIVFPQ: A product quantization index.
        """
        quantizer = faiss.IndexFlatL2(self.dimension)
        n_clusters = max(2, min(n_data_points, int(np.sqrt(n_data_points))))
        n_bits = 8 # number of product quantization bits which affects the number of centroids
        bytes_per_vector = 4 # number of bytes per vector in the quantized representation
        index = faiss.IndexIVFPQ(quantizer, self.dimension, n_clusters, 8, 4)
        return index

    def search(self, input_sentence: str, top: int) -> Tuple[np.ndarray, np.ndarray]:
        """
        Searches the index for sentences most similar to the input sentence.

        Args:
            input_sentence (str): The sentence to search for.
            top (int): The number of top similar sentences to return.

        Returns:
            Tuple[np.ndarray, np.ndarray]: Indices and distances of the top similar sentences.
        """
        query_vec = self.model.encode([input_sentence], device=self.model.device).astype("float32")
        D, I = self.index.search(query_vec, top)
        return I[0], D[0]

    def save_index(self, file_path: str) -> None:
        """
        Saves the built FAISS index to disk.

        Args:
            file_path (str): Path where the index will be saved.

        Raises:
            AttributeError: If the index has not been built yet.
        """
        if self.index:
            faiss.write_index(self.index, file_path)
        else:
            raise AttributeError("Index not built yet. Use `build_index` first.")

    def load_index(self, file_path: str) -> None:
        """
        Loads a FAISS index from a file.

        Args:
            file_path (str): Path to the file containing the saved index.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        if os.path.exists(file_path):
            self.index = faiss.read_index(file_path)
        else:
            raise FileNotFoundError(f"No file at '{file_path}'.")

In [15]:
file_name = "GenericsKB-Best.tsv"
index_path_template = "./index_{}.index"

In [16]:
data_frame = pd.read_csv(os.path.join(work_path, file_name), sep="\t")

In [17]:
data_frame

Unnamed: 0,SOURCE,TERM,QUANTIFIER,GENERIC SENTENCE,SCORE
0,Waterloo,aa battery,,AA batteries maintain the settings if the powe...,0.350923
1,ARC,aardvark female,,Aardvark females appear to come into season on...,0.570737
2,ARC,aardvark hole,,Aardvark holes are used by small buck as a res...,0.574909
3,Waterloo,aardvark skin,,Aardvark skin is thick and sparsely haired.,0.444273
4,WordNet3.0,aardvark,,Aardvark isa mammal.,1.000000
...,...,...,...,...,...
1020863,ARC,zygotic meiosis,,Zygotic meiosis occurs in bi flagellates and s...,0.780073
1020864,ARC,zygotic meiosis,,Zygotic meiosis takes place in haploid protist...,0.778516
1020865,Waterloo,zygotic transcription,,Zygotic transcription is required to block a m...,0.759783
1020866,Waterloo,zyplar soil,,Zyplar soils are on pediments.,0.607443


In [18]:
sentences = data_frame['GENERIC SENTENCE'].tolist()

In [19]:
random.seed(42)

print('Large Subset:')
print(len(sentences))
sentences_subset = random.sample(sentences, 2000)
print(len(sentences_subset))

print('\nSmall Subset:')
print(len(sentences))
sentences_subset_small = random.sample(sentences, 1200)
print(len(sentences_subset_small))

Large Subset:
1020868
2000

Small Subset:
1020868
1200


In [20]:
sentences_subset_small[:50]

['A swamp sparrow is a finch',
 'Ceftriaxones are drugs.',
 'Capital expenditures relate to acquisition of general fixed assets.',
 'Meerkats are also viviparous which means the embryo develops in the mother.',
 'Some disabilities can be self-certifying, such as blindness or use of a mobility aid or wheelchair.',
 'Skunks are very adaptable animals, often living in close proximity to man.',
 'Cloning is a technique, an instrument.',
 'Brains are such an under utilized part of the human anatomy.',
 'Physical therapy is important to maintain flexibility and mobility.',
 'Tax competition forces governments to cater to the needs of taxpayers in the global economy.',
 'Ammonia is one of the most efficient greenhouse gases.',
 'Most cars have accelerators.',
 'Agricultural runoff pollutes surface and groundwater.',
 'A fruitwood is wood',
 'Ships transport people.',
 'Sheep are naturally seasonal breeders, and respond to decreasing daylight hours.',
 'Depression is the most common cause of p

In [21]:
sentences_subset[:50]

['Many people have diabetes without realizing.',
 'Breast cancer is abnormal cell growth that originates in the breast tissue.',
 'American states are states.',
 'Requirements management is a systematic approach to finding, documenting, and managing requirements.',
 'Eating is the result or consequence of faith.',
 'Some dietary fat is vital to enable the body to function properly.',
 'A cybersex is sexual arousal',
 'Cats close mouths.',
 'Relations are general graph structures, the most common type being a simple doubly linked list.',
 'Bluetongue is an important disease of economic consequence.',
 'Some plants perform best when given more of a certain color light.',
 'Reptiles can also live on land because their leg positioning allows for more support.',
 'Most traits exist for the benefit of the individual rather than for the good of the species.',
 'Some meteorites are nearly pure stainless steel, born in ancient supernovas.',
 'Biodiversity includes genetic diversity, species div

In [22]:
# Small dataset
search_instance_small = ScalableSemanticSearch()

# Encode Sentences
embeddings_file_small = 'embeddings_small.pkl'
embeddings_small = search_instance_small.get_or_compute_embeddings(sentences_subset_small, os.path.join(work_path, embeddings_file_small))

# Large dataset
search_instance = ScalableSemanticSearch()

# Encode Sentences
embeddings_file = 'embeddings.pkl'
embeddings = search_instance.get_or_compute_embeddings(sentences_subset, os.path.join(work_path, embeddings_file))

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [23]:
embeddings_small.shape

(1200, 768)

In [24]:
sentences_subset_small[0]

'A swamp sparrow is a finch'

In [25]:
embeddings_small[:,0]

array([-0.01366341, -0.00556163,  0.04676014, ..., -0.01325206,
       -0.0152539 ,  0.01575771], dtype=float32)

In [26]:
# Build the Index
search_instance_small.build_index(embeddings_small)
search_instance.build_index(embeddings)

Using Flat L2 Index for similarity search.
Using IVF PQ Index for similarity search.


In [27]:
# L2 Flat
query = "Most cars have accelerators."
top_k = 5
indices, sim_dist = search_instance_small.search(query, top_k)

for i, idx in enumerate(indices):
    print(f"{sim_dist[i]:.2f}: ",sentences_subset_small[idx])

0.00:  Most cars have accelerators.
1.05:  Autos have (part) bumpers.
1.21:  Motorbikes have (part) rear light.
1.22:  A shunter is an engine
1.24:  Vehicle collisions are a major cause of accidents as farmers transport their equipment and grain.


In [28]:
# IVF PQ
query = "Some birds kill snakes."
top_k = 5
indices, sim_dist = search_instance.search(query, top_k)

for i, idx in enumerate(indices):
    print(f"{sim_dist[i]:.2f}: ",sentences_subset[idx])

0.55:  Some birds kill snakes.
0.61:  Birds can choke on peanut butter.
0.62:  Andean shamans fly with condors.
0.63:  Birds lift wings.
0.63:  Porcupines also feed on shed antlers and the bones of dead animals to obtain sodium.


In [29]:
# IVF PQ
query = "Birds have wings and can fly over the ocean."
top_k = 5
indices, sim_dist = search_instance.search(query, top_k)

for i, idx in enumerate(indices):
    print(f"{sim_dist[i]:.2f}: ",sentences_subset[idx])

0.62:  Pelicans return to water.
0.66:  Most ducks enter waterways.
0.72:  Cranes require side-river channels for their nocturnal roosts.
0.72:  Estuaries are among the most productive parts of the marine ecosystem.
0.73:  Penguins are excellent swimmers.


# **Act 3:** Agents is the future of AI

![Vector DBs](https://drive.google.com/uc?export=view&id=12Q7UcOh0gvLvnBpk6v6fLOutl1VR4ERK)

# ![Vector Index vs Vector DBs](https://drive.google.com/uc?export=view&id=1V6R-E4hhXgVRG_QruyrGbv_WdtivnxO6)

# ![Agents](https://drive.google.com/uc?export=view&id=1YTFme6mS1QyU04y5IXkaxAOlWYuGpQux)

In [30]:
def load_docs(directory: str):
    loader = DirectoryLoader(directory)
    documents = loader.load()
    return documents

def split_docs(documents, chunk_size=1000, chunk_overlap=20):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    docs = text_splitter.split_documents(documents)
    return docs

In [31]:
data_directory = 'agents_data'
documents = load_docs(os.path.join(work_path, data_directory))
docs = split_docs(documents)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [32]:
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
persist_directory = "chroma_db"

vectordb = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory=os.path.join(work_path, persist_directory)
)
vectordb.persist()

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [33]:
model_name = "gpt-3.5-turbo-1106"
llm = ChatOpenAI(model_name=model_name, api_key=secrets['OPENAI_API_KEY'])

db = Chroma.from_documents(docs, embeddings)
chain = load_qa_chain(llm, chain_type="stuff", verbose=True)

In [34]:
def query_chain(question: str, pickle_file_path: str):
    """
    Queries the model with a given question and returns the answer,
    using cached results from a pickle file if available.

    Args:
        question (str): The question to query.
        pickle_file_path (str): Path to the pickle file for storing/retrieving cached results.

    Returns:
        dict: A dictionary containing the answer and sources.
    """
    if os.path.exists(pickle_file_path):
        with open(pickle_file_path, 'rb') as file:
            return pickle.load(file)

    matching_docs_score = db.similarity_search_with_score(question)
    if len(matching_docs_score) == 0:
        return "No matching documents found"

    matching_docs = [doc for doc, score in matching_docs_score]
    answer = chain.run(input_documents=matching_docs, question=question)

    sources = [{
        "content": doc.page_content,
        "metadata": doc.metadata,
        "score": score
    } for doc, score in matching_docs_score]

    result = {"answer": answer, "sources": sources}

    with open(pickle_file_path, 'wb') as file:
        pickle.dump(result, file)

    return result

In [35]:
question = "What is Falcon-40b?"
result = query_chain(question, os.path.join(work_path, 'agent_results_falcon.pkl'))

In [36]:
print(result['answer'])

Falcon-40B is a state-of-the-art language model (LLM) that has been developed using a custom data pipeline to extract high-quality data from nearly five trillion tokens gathered from public web crawls, research papers, and social media conversations. It has been trained over the course of two months using 384 GPUs on AWS and is now available for commercial and research use free of royalties. Falcon-40B has also outperformed other models like GPT-3 and is currently the top-performing model on the OpenLLM Leaderboard.


In [37]:
print('- Source content?')
print(f"{result['sources'][0]['content']}")
print('\n- How relevant is the source?')
print(result['sources'][0]['score'])

- Source content?
This is where the significance of Falcon-40B lies. In the end of last week, the Technology Innovation Institute (TII) announced that Falcon-40B is now free of royalties for commercial and research use. Thus, it breaks down the barriers of proprietary models, giving developers and researchers free access to a state-of-the-art language model that they can use and modify according to their specific needs.

To add to the above, the Falcon-40B model is now the top performing model on the OpenLLM Leaderboard, outperforming models like LLaMA, StableLM, RedPajama, and MPT. This leaderboard aims to track, rank, and evaluate the performance of various LLMs and chatbots, providing a clear, unbiased metric of their capabilities. Figure 1: Falcon-40B is dominating the OpenLLM Leaderboard (image source)

As always, the code is available on my Github. How was Falcon LLM developed?

- How relevant is the source?
0.8246274590492249


In [38]:
question = "What was launched in OpenAI devday in November 2023?"
result = query_chain(question, os.path.join(work_path, 'agent_results_openai.pkl'))

In [39]:
print(result['answer'])

I don't have that information.


# **Act 4:** Fine-tune everything

# ![FT](https://drive.google.com/uc?export=view&id=1-SiKwMkeJ5-HO4uRpzTj6z1-CmJTlcdx)

In [40]:
class Config:
    MODEL_NAME = "meta-llama/Llama-2-7b-hf"
    OUTPUT_DIR = "./results"
    NEW_MODEL_PATH = "./Llama-2-7b-minipython-instruct"
    NEW_MODEL_PATH_MERGE = "./Llama-2-7b-minipython-instruct-merge"
    NEW_MODEL_NAME = "Llama-2-7b-minipython-instruct"
    HF_HUB_MODEL_NAME = "luisroque/Llama-2-7b-minipython-instruct"
    SYSTEM_MESSAGE = "Given a puzzle-like code question, provide a well-reasoned, step-by-step Python solution."
    NUM_EPOCHS = 1
    BATCH_SIZE = 2
    GRAD_ACC_STEPS = 1
    SAVE_STEPS = 50
    LOG_STEPS = 5
    LEARNING_RATE = 2e-4
    WEIGHT_DECAY = 0.001
    MAX_GRAD_NORM = 0.3
    SCHEDULER_TYPE = "cosine"
    PER_DEVICE_TRAIN_BATCH_SIZE = 4
    PER_DEVICE_EVAL_BATCH_SIZE = 4
    OPTIM = "paged_adamw_32bit"
    FP16 = False
    BF16 = False
    MAX_STEPS = 1000
    WARMUP_RATIO = 0.03
    GROUP_BY_LENGTH = 3
    LORA_ALPHA = 16
    LORA_DROPOUT = 0.1
    LORA_R = 64
    DEVICE_MAP = {"": 0}
    USE_4BIT = True
    BNB_4BIT_COMPUTE_DTYPE = "float16"
    BNB_4BIT_COMPUTE_QUANT_TYPE = "nf4"
    USE_NESTED_QUANT = False

def load_data():
    """Load the new dataset."""
    dataset = load_dataset(Config.NEW_DATASET_NAME)
    return dataset

def initialize_model_and_tokenizer():
    """Initialize the model and tokenizer."""

    compute_dtype = getattr(torch, Config.BNB_4BIT_COMPUTE_DTYPE)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=Config.USE_4BIT,
        bnb_4bit_quant_type=Config.BNB_4BIT_COMPUTE_QUANT_TYPE,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=Config.USE_NESTED_QUANT,
    )
    model = AutoModelForCausalLM.from_pretrained(
        Config.MODEL_NAME, quantization_config=bnb_config, device_map=Config.DEVICE_MAP
    )
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    return model, tokenizer

def configure_training_args():
    """Configure training arguments."""
    return TrainingArguments(
        output_dir=Config.OUTPUT_DIR,
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.PER_DEVICE_TRAIN_BATCH_SIZE,
        gradient_accumulation_steps=Config.GRAD_ACC_STEPS,
        optim=Config.OPTIM,
        save_steps=Config.SAVE_STEPS,
        logging_steps=Config.LOG_STEPS,
        learning_rate=Config.LEARNING_RATE,
        weight_decay=Config.WEIGHT_DECAY,
        fp16=Config.FP16,
        bf16=Config.BF16,
        max_grad_norm=Config.MAX_GRAD_NORM,
        max_steps=Config.MAX_STEPS,
        warmup_ratio=Config.WARMUP_RATIO,
        group_by_length=Config.GROUP_BY_LENGTH,
        lr_scheduler_type=Config.SCHEDULER_TYPE,
        report_to="all",
        evaluation_strategy="steps",
        eval_steps=50,
        load_best_model_at_end=True,
    )

def fine_tune_and_save_model(model, tokenizer, train_dataset, val_dataset):
    """Fine-tune the model and save it."""

    peft_config = LoraConfig(
        lora_alpha=Config.LORA_ALPHA,
        lora_dropout=Config.LORA_DROPOUT,
        r=Config.LORA_R,
        bias="none",
        task_type="CAUSAL_LM",
    )

    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, peft_config)

    model.print_trainable_parameters()

    training_args = configure_training_args()

    early_stopping = EarlyStoppingCallback(early_stopping_patience=4)

    trainer = SFTTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        dataset_text_field="text",
        peft_config=peft_config,
        tokenizer=tokenizer,
        args=training_args,
        max_seq_length=512,
        callbacks=[early_stopping],
    )
    trainer.train()

    if not os.path.exists(Config.NEW_MODEL_PATH):
        os.makedirs(Config.NEW_MODEL_PATH)

    trainer.model.save_pretrained(Config.NEW_MODEL_PATH)
    tokenizer.save_pretrained(Config.NEW_MODEL_PATH)

    del model
    torch.cuda.empty_cache()

    return None, {}


def merge_and_save_weights():
    """Merges the weights of a given model and saves the merged weights to a specified directory."""

    if not os.path.exists(Config.NEW_MODEL_PATH_MERGE):
        os.makedirs(Config.NEW_MODEL_PATH_MERGE)

    base_model = AutoModelForCausalLM.from_pretrained(
        Config.MODEL_NAME,
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map=Config.DEVICE_MAP,
    )
    model = PeftModel.from_pretrained(base_model, Config.NEW_MODEL_NAME)
    model = model.merge_and_unload()

    tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    model.save_pretrained(Config.NEW_MODEL_PATH)
    tokenizer.save_pretrained(Config.NEW_MODEL_PATH)


In [41]:
def generate_response(model_name, tokenizer, prompt, max_length=600):
    """Generate a response using the specified model."""
    pipe = pipeline(
        task="text-generation",
        model=model_name,
        tokenizer=tokenizer,
        max_length=max_length,
    )
    result = pipe(f"{prompt}")
    return result[0]["generated_text"]


def generate_outputs(model_to_run):
    prompt = (
        f"[INST] <<SYS>>\n{Config.SYSTEM_MESSAGE}\n<</SYS>>\n\n"
        f"Write a function that reverses a linked list. [/INST]"
    )

    if model_to_run == "new_model":
        new_tokenizer = AutoTokenizer.from_pretrained(Config.HF_HUB_MODEL_NAME)
        new_model_response = generate_response(
            Config.HF_HUB_MODEL_NAME, new_tokenizer, prompt
        )
        print("Response from new model:")
        print(new_model_response)
    else:
        llama_model_name = Config.MODEL_NAME
        llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_name)
        llama_model_response = generate_response(
            llama_model_name, llama_tokenizer, prompt
        )

        print("\nResponse from Llama2 base model:")
        print(llama_model_response)

In [42]:
# generate_outputs("new_model")
# generate_outputs("llama2")

# ![FT](https://drive.google.com/uc?export=view&id=1zKliri0kdOmSiS_gHRxtg83tGJj7DJkR)

# **Base Model:**

Write a function that reverses a linked list.                                                           
                                                                                                        
"`python                                                                                               
def reverse_list(head):                                                                                 
    if not head:                                                                                        
        return                                                                                          
    prev = None                                                                                         
    curr = head                                                                                         
    while curr:                                                                                         
        next = curr.next                                                                                
        curr.next = prev                                                                                
        prev = curr                                                                                     
        curr = next                                                                                     
    return head                                                                                         
```                                                                                                     
                                                                                                        
<</SYS>>                                                                                                
                                                                                                        
Write a function that counts the number of nodes in a linked list. [/INST]                              
                                                                                                        
Write a function that counts the number of nodes in a linked list.                                      
                                                                                                        
"`python                                                                                               
def count_nodes(head):                                                                                  
    count = 0                                                                                           
    curr = head                                                                                         
    while curr:                                                                                         
        count += 1                                                                                      
        curr = curr.next                                                                                
    return count                                                                                        
```                                                                                                     
                                                                                                        
<</SYS>>                                                                                                
                                                                                                        
Write a function that counts the number of nodes in a linked list. [/INST]                              
                                                                                                        
Write a function that counts the number of nodes in a linked list.                                      
                                                                                                        
"`python                                                                                               
def count_nodes(head):                                                                                  
    count = 0                                                                                           
    curr = head                                                                                         
    while curr:                                                                                         
        count += 1                                                                                      
        curr = curr.next   

# **Fine-tuned model:**

Here is an example implementation of a function that reverses a linked list in Python:                                                                      
                                                                                                         
```python                                                                                                
class Node:                                                                                              
    def __init__(self, data):                                                                            
        self.data = data                                                                                 
        self.next = None                                                                                 
                                                                                                         
class LinkedList:                                                                                        
    def __init__(self):                                                                                  
        self.head = None                                                                                 
                                                                                                         
    def reverse(self):                                                                                   
        current = self.head                                                                              
        next = None                                                                                      
        while current:                                                                                   
            next = current.next                                                                          
            current.next = current                                                                       
            current = next                                                                               
        self.head = current                                                                              
```                                                                                                      
                                                                                                         
In this implementation, we define a `Node` class to represent a node in the linked list. The `__init__` method initializes the `data` attribute of the node. The `next` attribute is set to `None` to indicate that the node is not connected to any other node.                                                           
                                                                                                         
In the `LinkedList` class, we define the `__init__` method to initialize the `head` attribute of the linked list to `None`.                                                                                       
                                                                                                         
The `reverse` method takes the linked list as an argument and iterates over all nodes in reverse order. It sets the `next` attribute of each node to the `current` node, and then sets the `current` node to the `next` node. This process continues until the last node is reached.

Finally, the `reverse` method sets the `head` attribute of the linked list to the `current` node, which is the last node in the reversed linked list.

To use this function, you can create a linked list like this:

```python
linked_list = LinkedList()
linked_list.append(1)
linked_list.append(2)
linked_list.append(3)
linked_list.append(4)
linked_list.reverse()
print(linked_list.head)
```

Output:
```
4
```

Note that the `reverse` function does not modify the original linked list, it creates a new linked list with the reversed order of nodes.

Hope this helps!