In [1]:
import re
# Read the complete short story into a single string; the context manager
# closes the file handle as soon as the read has finished.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Quick sanity check: overall length plus a preview of the opening characters.
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


: 

### Tokenization

In [None]:
# The capture group makes re.split keep the delimiters (punctuation, "--",
# whitespace) in its output; stripping each piece and skipping the empty ones
# then removes the whitespace-only entries.
text = "Hello, world. this is -- a test"
result = []
for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
    piece = piece.strip()
    if piece:
        result.append(piece)
print(result)

# Tokenize the whole story the same way and report how many tokens it yields.
preprocessed = []
for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text):
    piece = piece.strip()
    if piece:
        preprocessed.append(piece)
print(len(preprocessed))

In [None]:
# Assign an integer ID to every distinct token; sorting first makes the
# token -> ID assignment deterministic.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
vocab = dict(zip(all_words, range(vocab_size)))

# Preview the first 21 (token, id) pairs.
for entry in list(vocab.items())[:21]:
    print(entry)

In [None]:
class TokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-ID vocabulary.

    Text is split on punctuation, double dashes and whitespace with the same
    pattern the notebook uses to build the vocabulary, so ``:`` and ``;``
    become tokens of their own instead of staying glued to the preceding word.
    There is no fallback for tokens that are missing from the vocabulary.
    """

    def __init__(self, vocab):
        """Store ``vocab`` (token -> ID) and derive the inverse ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Raises:
            KeyError: if any token of ``text`` is not in the vocabulary.
        """
        # The capture group makes re.split keep the delimiters as tokens.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty and whitespace-only pieces.
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        return [self.str_to_int[s] for s in preprocessed]

    def decode(self, ids):
        """Return the text for ``ids``, with tokens joined by single spaces.

        Raises:
            KeyError: if any ID is not in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # Remove the space that the join inserted in front of punctuation.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)

In [None]:
# Round-trip a short passage: encode it to token IDs, then decode the IDs
# back into text.
tokenizer = TokenizerV1(vocab)
text = """"It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)
decoded_text = tokenizer.decode(ids)
print(decoded_text)

In [None]:
# TokenizerV1 looks every token up directly in the vocabulary, so any word
# that is missing from it raises KeyError. Catch and report the error so a
# full "Run All" does not stop at this cell.
text = "Hello, do you like tea?"
try:
    print(tokenizer.encode(text))
except KeyError as missing_token:
    print(f"KeyError: token {missing_token} is not in the vocabulary")

In [None]:
class TokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The vocabulary must contain ``<|unk|>`` for the fallback to work;
    otherwise encoding an unknown token raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> ID) and build the reverse ID -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return their IDs."""
        ids = []
        # The capture group keeps punctuation, "--" and whitespace as pieces.
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Skip empty and whitespace-only pieces.
                continue
            if token not in self.str_to_int:
                # Unknown words all share the <|unk|> placeholder.
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and tidy the punctuation."""
        words = (self.int_to_str[i] for i in ids)
        joined = " ".join(words)
        # Remove the space that the join placed in front of punctuation marks.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [None]:
# Rebuild the vocabulary with two special tokens appended after the sorted
# corpus tokens, so they are the last entries to be numbered.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab = {}
for integer, token in enumerate(all_tokens):
    vocab[token] = integer

print(len(vocab))

In [None]:
# Show the last five (token, id) pairs of the vocabulary.
for entry in list(vocab.items())[-5:]:
    print(entry)

In [None]:
# Two unrelated snippets separated by the end-of-text marker.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = f"{text1} <|endoftext|> {text2}"
print(text)

# Words missing from the vocabulary are encoded as <|unk|>, so the decoded
# text shows the placeholder instead of the original word.
tokenizer = TokenizerV2(vocab)
encoded_ids = tokenizer.encode(text)
print(encoded_ids)
print(tokenizer.decode(encoded_ids))

### Embedding

In [None]:
import torch

In [None]:
# Fix the RNG so the randomly initialized weights, and therefore every tensor
# printed below, are identical on each run.
torch.manual_seed(123)

# One sequence of five token IDs; each ID indexes a row of the embedding
# table, so it must be smaller than vocab_size.
input_ids = torch.tensor([[2, 3, 5, 1, 6]])
vocab_size = 12  # number of rows in the lookup table (distinct token IDs)
output_dim = 8   # length of each embedding vector

# torch.nn.Embedding starts from randomly initialized weights; no pretrained
# vectors (such as Word2Vec) are loaded in this cell.
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)
print(embedding_layer.weight.shape)

# The lookup appends an output_dim axis to the shape of input_ids.
embeddings = embedding_layer(input_ids)
print(embeddings)
print(embeddings.shape)

In [None]:
import torch
import torch.nn as nn
import tiktoken
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is encoded once with ``tokenizer.encode``. A window of
    ``max_length`` tokens is then moved over the IDs in steps of ``stride``;
    each target is its input window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt)

        # Stop early enough that the shifted target window is always full.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) tensor pair."""
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are passed on to ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

# With stride=1 and no shuffling, consecutive batches are windows that start
# one token apart.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
second_batch = next(data_iter)
print(first_batch)
print(second_batch)

In [None]:
# Embedding table for the token IDs produced by the data loader.
# NOTE(review): 50257 is presumably the size of the "gpt2" tiktoken
# vocabulary — confirm against the tokenizer.
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

In [None]:
# stride == max_length, so consecutive windows do not overlap.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)
data_iter = iter(dataloader)
first_pair = next(data_iter)
inputs, targets = first_pair
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

# Look up one embedding vector per token ID in the batch.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

In [None]:
# n gram
CONTEXT_SIZE = 2
test_sentence = """We are extremely agile, with very fast decision-making processes 
– that’s the culture of the business. We really spend the time to understand how our customers work, 
and their aspirations within supply chain and logistics. It’s all about a collaborative approach, 
partnering with them and working beyond just moving their manufactured items, products, or components. 
We’re part of the overall strategy and planning process with our customers
."""

# Uses whichever `tokenizer` is currently bound in the kernel.
ids = tokenizer.encode(test_sentence)
print(ids)

test_sentence_split = test_sentence.split()


def _context_target_pairs(sequence, context_size):
    """Pair each element with the `context_size` elements before it, nearest first."""
    pairs = []
    for position in range(context_size, len(sequence)):
        preceding = [
            sequence[position - offset]
            for offset in range(1, context_size + 1)
        ]
        pairs.append((preceding, sequence[position]))
    return pairs


# Same construction applied to the whitespace-split words and to the token IDs.
ngrams = _context_target_pairs(test_sentence_split, CONTEXT_SIZE)
ngrams_ids = _context_target_pairs(ids, CONTEXT_SIZE)

# Print the first 3, just so you can see what they look like.
print(f"{ngrams[:3]=}")
print(f"{ngrams_ids[:3]=}")

In [None]:
# CBOW
CONTEXT_SIZE = 2
test_sentence = """We are extremely agile, with very fast decision-making processes 
– that’s the culture of the business. We really spend the time to understand how our customers work, 
and their aspirations within supply chain and logistics. It’s all about a collaborative approach, 
partnering with them and working beyond just moving their manufactured items, products, or components. 
We’re part of the overall strategy and planning process with our customers
.""".split()

# Each sample pairs a centre word with CONTEXT_SIZE words on either side:
# the preceding words first (nearest first), then the following words.
data = []
for centre in range(CONTEXT_SIZE, len(test_sentence) - CONTEXT_SIZE):
    before = [test_sentence[centre - offset] for offset in range(1, CONTEXT_SIZE + 1)]
    after = [test_sentence[centre + offset] for offset in range(1, CONTEXT_SIZE + 1)]
    data.append((before + after, test_sentence[centre]))

print(f"{data[:5]=}")

In [None]:
# embedding model
from transformers import AutoTokenizer, AutoModel
import torch

# sentences
sentences = [
    "AI technology is transforming industries.",
    "Learning new languages can be fun and rewarding.",
    "The weather today is sunny with a slight breeze.",
    "Python is a versatile programming language.",
    "Mountains provide a breathtaking view of nature.",
    "Music can uplift the mood and energize the spirit.",
    "Reading books expands knowledge and imagination.",
    "Cooking a delicious meal brings joy to many people.",
    "Traveling to new places broadens one's perspective.",
    "Exercise is essential for maintaining good health.",
    "AI advancements are revolutionizing various sectors.",
    "Discovering new languages is both enjoyable and beneficial.",
    "Today's weather is bright and breezy.",
    "Python is a powerful and flexible programming language.",
    "Hiking in the mountains offers stunning natural vistas.",
    "Listening to music can elevate your mood and energy levels.",
    "Books open up new worlds and enhance imagination.",
    "Preparing a tasty meal brings happiness to many.",
    "Exploring new destinations expands your worldview.",
    "Regular exercise is crucial for staying healthy."
]

# Load the sub-word tokenizer and the model from the HuggingFace Hub.
# Note: this rebinds the notebook-level name `tokenizer`.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Tokenize all sentences as one padded batch of PyTorch tensors.
encoded_input = tokenizer(
    sentences,
    padding=True,
    return_tensors='pt',
)

# Forward pass without gradient tracking.
with torch.no_grad():
    model_output = model(**encoded_input)

# Prints the raw model output and the shape of its last_hidden_state.
print("Sentence embeddings:")
print(model_output)
print(model_output.last_hidden_state.shape)


In [None]:
# Display the loaded model object as the cell output.
model

In [None]:
# Vocabulary size reported by the tokenizer currently bound to `tokenizer`.
tokenizer.vocab_size

In [None]:
# The raw sentence at index 8.
sentences[8]

In [None]:
# Token IDs of the sentence at index 8 (a row of the padded batch).
encoded_input['input_ids'][8]

In [None]:
# Map the token IDs of one sentence back to their token strings.
# NOTE(review): this inspects index 7 while the neighbouring cells inspect
# index 8 — confirm which sentence is intended.
tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][7])

In [None]:
# Decode the token IDs of the sentence at index 8 back into a string.
tokenizer.decode(encoded_input['input_ids'][8])

In [None]:
# Attention mask of the sentence at index 8 (a row of the padded batch).
encoded_input['attention_mask'][8]

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from adjustText import adjust_text

# Mean-pool the token embeddings into one vector per sentence. The batch was
# tokenized with padding=True, so weight every position by the attention mask
# to keep padding tokens out of the average.
hidden_states = model_output.last_hidden_state
padding_mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
embeddings = (
    (hidden_states * padding_mask).sum(dim=1)
    / padding_mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
).numpy()

# Perform PCA to reduce dimensionality to 2D
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)

# Plot the embeddings, one labelled point per sentence
plt.figure(figsize=(10, 8))
texts = []
for i, sentence in enumerate(sentences):
    plt.scatter(reduced_embeddings[i, 0], reduced_embeddings[i, 1], label=sentence)
    texts.append(plt.text(reduced_embeddings[i, 0], reduced_embeddings[i, 1], sentence, fontsize=10))

# Adjust text to prevent overlapping
adjust_text(texts, only_move={'points':'y', 'texts':'y'}, arrowprops=dict(arrowstyle='->', color='red'))

plt.title('Sentence Embeddings Visualized using PCA')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

# Mean-pool the token embeddings into one vector per sentence. The batch was
# tokenized with padding=True, so weight every position by the attention mask
# to keep padding tokens out of the average.
hidden_states = model_output.last_hidden_state
padding_mask = encoded_input['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
embeddings = (
    (hidden_states * padding_mask).sum(dim=1)
    / padding_mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
).numpy()

# Pairwise cosine similarity between all sentence vectors.
similarity_matrix = cosine_similarity(embeddings)

plt.figure(figsize=(12, 10))
sns.heatmap(
    similarity_matrix,
    xticklabels=sentences,
    yticklabels=sentences,
    annot=True,
    cmap='coolwarm',
)
plt.title('Sentence Embeddings Similarity Heatmap')
plt.show()

In [None]:
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.document_loaders import TextLoader

# Initialize OpenAI models.
# Both constructors need an OpenAI API key: set the OPENAI_API_KEY environment
# variable (or pass `openai_api_key`), otherwise construction fails with a
# ValidationError.
# NOTE(review): `OpenAI` is presumably LangChain's completion-style wrapper,
# while "gpt-4o-mini" looks like a chat model — confirm whether a chat model
# class should be used instead.
llm = OpenAI(model="gpt-4o-mini", temperature=0)
embedding_model = OpenAIEmbeddings()

# Load and preprocess documents
documents = [
    "AI is transforming industries by automating tasks and improving efficiency.",
    "Machine learning is a core subset of AI that uses data to make predictions.",
    "Natural Language Processing (NLP) allows computers to understand and generate human language."
]

# Split documents into chunks for vector storage.
# create_documents accepts the whole list of texts and returns one flat list
# of Document chunks, which is the shape FAISS.from_documents expects; calling
# it once per text would yield a list of lists instead.
text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=20)
docs = text_splitter.create_documents(documents)

# Create a vector database using FAISS
vector_db = FAISS.from_documents(docs, embedding_model)

# Create a conversation chain with retrieval capabilities
retrieval_chain = ConversationalRetrievalChain.from_llm(llm, vector_db.as_retriever())

  llm = OpenAI(model="gpt-4", temperature=0)


ValidationError: 1 validation error for OpenAI
  Value error, Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. [type=value_error, input_value={'temperature': 0, 'model...ne, 'http_client': None}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.8/v/value_error