<a href="https://colab.research.google.com/github/Namrahh/AI-Projects/blob/main/clean_LLM_searchengine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files

# Upload your PDF file(s) through Colab's upload widget.
# The keys of `uploaded` are used below as the paths of the PDFs to read.
uploaded = files.upload()

In [None]:
!pip install PyPDF2

In [None]:
import PyPDF2

# Names of the uploaded files; each one is opened below as a PDF path
pdf_paths = list(uploaded)

# Extract text from each PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open (read in binary mode).

    Returns:
        A single string holding the extracted text of all pages, with no
        separator inserted between pages.
    """
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        # Pages must be read while the file is still open.
        return "".join(page.extract_text() for page in reader.pages)

# Dictionary to store extracted text from each PDF (file name -> extracted text)
pdf_texts = {}

# Loop through each uploaded PDF file
for pdf_path in pdf_paths:
    pdf_texts[pdf_path] = extract_text_from_pdf(pdf_path)
    print(f"Extracted Text from {pdf_path} (sample):", pdf_texts[pdf_path][:500])  # Print the first 500 characters as a sanity check

# Optionally combine the text from all PDFs into one newline-separated corpus.
# NOTE(review): `combined_text` is not referenced again in the visible notebook;
# the chunking cell builds its own space-joined string instead.
combined_text = "\n".join(pdf_texts.values())

# Further processing can use either `pdf_texts` or `combined_text`


In [None]:
# Generate chunks from the PDF text
# Concatenate the text of every PDF (separated by a single space) into one
# string so it can be cut into fixed-size chunks below.
all_text = " ".join(pdf_texts.values())

def split_text_into_chunks(text, chunk_size=1000):
    """Split text into consecutive, non-overlapping chunks of characters.

    Args:
        text: The text to split.
        chunk_size: Maximum number of characters per chunk. Must be a
            positive integer. Defaults to 1000.

    Returns:
        A list of text chunks in their original order. Every chunk holds
        exactly ``chunk_size`` characters except possibly the last one,
        which holds the remainder. An empty ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would make range() empty and silently drop all text,
    # and a zero step raises an unrelated-looking error; reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# split_text_into_chunks already returns a list, so no extra copy is needed
chunks = split_text_into_chunks(all_text)
print("Number of Chunks:", len(chunks))
# Guard the preview: when no text was extracted there is no first chunk to show
print("Sample Chunk:", chunks[0] if chunks else "<no text extracted>")

In [None]:
from gensim.utils import simple_preprocess

# Run every chunk through gensim's simple_preprocess (one entry per chunk)
preprocessed_corpus = list(map(simple_preprocess, chunks))
print("Preprocessed Corpus Sample:", preprocessed_corpus[:2])  # Show the first two processed chunks

In [None]:
from gensim.models import Word2Vec

# Train a Word2Vec model on the preprocessed chunks.
# NOTE(review): no `seed` is passed and workers=4 is used, so the learned
# embeddings (and therefore search rankings) may differ between runs — confirm
# whether reproducibility matters here.
word2vec_model = Word2Vec(sentences=preprocessed_corpus, vector_size=100, window=5, min_count=1, workers=4)
print("Word2Vec Model Trained")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Generate document vectors by averaging word embeddings
def compute_doc_vector(doc, model):
    """Embed a tokenized document as the mean of its known word vectors.

    Args:
        doc: Iterable of tokens.
        model: Object exposing ``wv`` (token -> vector lookup supporting
            ``in``) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.vector_size`` when
        none of the tokens is known.
    """
    embeddings = model.wv
    known = [embeddings[token] for token in doc if token in embeddings]
    if not known:
        # No token is in the vocabulary: fall back to an all-zero vector
        return np.zeros(model.vector_size)
    return np.mean(known, axis=0)

# Generate one vector per chunk; the list is index-aligned with
# `preprocessed_corpus`, which was built from `chunks` in the same order.
document_vectors = [compute_doc_vector(doc, word2vec_model) for doc in preprocessed_corpus]

# Search Function
def search(query, model, document_vectors, original_chunks, top_k=5):
    """Rank chunks by cosine similarity to the query and return the best ones.

    Args:
        query: Free-text query string.
        model: Trained embedding model, passed through to
            ``compute_doc_vector``.
        document_vectors: One vector per chunk, index-aligned with
            ``original_chunks``.
        original_chunks: The chunk texts to return.
        top_k: Maximum number of results to return. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        A list of ``(chunk_text, similarity)`` tuples ordered from most to
        least similar, at most ``top_k`` long. Empty when there are no
        document vectors.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k!r}")
    # Nothing has been indexed: return early rather than handing an empty
    # matrix to cosine_similarity.
    if len(document_vectors) == 0:
        return []

    # Preprocess and vectorize the query the same way the chunks were
    query_vector = compute_doc_vector(simple_preprocess(query), model)

    # One row of similarities: the query against every chunk
    scores = cosine_similarity([query_vector], document_vectors)[0]

    # argsort is ascending, so reverse it to get the highest similarity first
    best_first = np.argsort(scores)[::-1][:top_k]
    return [(original_chunks[i], scores[i]) for i in best_first]

# Test the search: run one sample query against the indexed chunks and print
# each (chunk, similarity) tuple returned by `search`, best match first.
query = "What is deep learning?"
results = search(query, word2vec_model, document_vectors, chunks)
print("Search Results:")
for result in results:
    print(result)


In [None]:
!pip install datasets

In [None]:
!pip install datasets transformers

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# `Dataset` is otherwise only imported in a later cell; import it here so this
# cell works when the notebook is run from top to bottom.
from datasets import Dataset

# Wrap the text chunks in a Dataset. Every row gets the placeholder label 0.
custom_dataset = Dataset.from_dict({"text": chunks, "label": [0] * len(chunks)})
# Get the indices for the train/val split
indices = np.arange(len(custom_dataset))
train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)  # 80-20 split, adjust as necessary

# Create the train and validation datasets using the indices
train_data = custom_dataset.select(train_indices)
val_data = custom_dataset.select(val_indices)

In [None]:
# Display both splits to check that they were built as expected
print(train_data)
print(val_data)

In [None]:
# Report how many rows ended up in each split
print(f"Training samples: {len(train_data)}")
print(f"Validation samples: {len(val_data)}")

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,  EvalPrediction, EarlyStoppingCallback

# Tokenizer: loaded from the same "bert-base-uncased" checkpoint that the
# classification model below is loaded from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch when used with
            ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer returns for that text, with truncation
        enabled and padding to a maximum length of 512.
    """
    encode_options = {"truncation": True, "padding": "max_length", "max_length": 512}
    return tokenizer(examples["text"], **encode_options)

# Tokenize the full (unsplit) dataset.
# NOTE(review): `tokenized_dataset` is not referenced again in the visible
# notebook — training uses the separately tokenized train/val splits.
tokenized_dataset = custom_dataset.map(tokenize_function, batched=True)

# Define compute_metrics function
def compute_metrics(eval_pred: EvalPrediction):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        A dictionary with a single ``'accuracy'`` entry: the fraction of
        rows whose highest-scoring class equals the label.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis
    predicted = np.argmax(scores, axis=-1)
    return {'accuracy': np.mean(predicted == gold)}

# Load pre-trained model with a 2-class classification head.
# NOTE(review): the dataset built earlier assigns label 0 to every row, so
# only one of the two classes is ever seen — confirm this is intended.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenized train and validation datasets
tokenized_train = train_data.map(tokenize_function, batched=True)
tokenized_val = val_data.map(tokenize_function, batched=True)

# Set format for Trainer compatibility
tokenized_train = tokenized_train.with_format("torch")
tokenized_val = tokenized_val.with_format("torch")

# NOTE(review): `evaluation_strategy` is reportedly renamed `eval_strategy` in
# newer transformers releases — confirm against the installed version.
# NOTE(review): warmup_steps=500 may exceed the total number of optimizer steps
# for a small corpus (batch size 4, 5 epochs) — confirm.
training_args = TrainingArguments(
    output_dir="./results",               # Output directory
    evaluation_strategy="epoch",          # Evaluate every epoch
    save_strategy="epoch",                # Save model after every epoch
    num_train_epochs=5,                   # Number of epochs
    per_device_train_batch_size=4,        # Batch size
    per_device_eval_batch_size=4,         # Eval batch size
    save_total_limit=2,                   # Limit the number of saved models
    load_best_model_at_end=True,          # Load the best model based on validation loss
    metric_for_best_model="eval_loss",    # Metric to choose the best model
    greater_is_better=False,              # Lower loss is better
    logging_dir='./logs',                 # Directory to save logs
    logging_steps=100,                    # Frequency of logging
    weight_decay=0.01,                    # Add weight decay (regularization)
    warmup_steps=500,                     # Learning rate warmup
    report_to="none",                     # Disable wandb logging
    disable_tqdm=True                     # Disable progress bar
)

# NOTE(review): with evaluation once per epoch and 5 epochs, an early-stopping
# patience of 5 presumably can never trigger — confirm the intended value.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

# Fine-tune the model on the training split
trainer.train()


In [None]:
# Evaluate the model on the validation set using the tokenized validation data.
# Note: this rebinds `results`, which earlier held the search results.
results = trainer.evaluate(eval_dataset=tokenized_val)

# Print the evaluation results
print(f"Validation Results: {results}")

In [None]:
!pip install update gpt4all

In [None]:
# Smoke-test the local LLM with a single prompt inside a chat session.
# Note: this rebinds `model`, which earlier referred to the BERT classifier.
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
with model.chat_session():
    print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))

In [None]:
# Summarize search results
# Loaded GPT4All models keyed by model file name, so the multi-gigabyte model
# is loaded once and reused instead of being reloaded on every call.
_SUMMARIZER_CACHE = {}


def _get_summarizer(model_name="Meta-Llama-3-8B-Instruct.Q4_0.gguf"):
    """Return a cached GPT4All instance for ``model_name``, loading it on first use."""
    if model_name not in _SUMMARIZER_CACHE:
        # Imported lazily so the function can be defined before gpt4all is installed
        from gpt4all import GPT4All
        _SUMMARIZER_CACHE[model_name] = GPT4All(model_name)
    return _SUMMARIZER_CACHE[model_name]


def summarize_results(results, max_tokens=1024):
    """Summarize the text of search results with the local GPT4All model.

    Args:
        results: Iterable of tuples whose first element is the chunk text
            (any further elements, such as a similarity score, are ignored).
        max_tokens: Upper bound on the number of generated tokens.
            Defaults to 1024.

    Returns:
        The generated summary, or an empty string when ``results`` contains
        no text (the model is not loaded in that case).
    """
    relevant_text = " ".join(result[0] for result in results)
    # Nothing to summarize: skip the LLM rather than prompting it with no content
    if not relevant_text.strip():
        return ""

    gpt4_model = _get_summarizer()
    with gpt4_model.chat_session():
        summary = gpt4_model.generate(f"Summarize this: {relevant_text}", max_tokens=max_tokens)
    return summary


# Example dummy results for testing. Each entry is a 1-tuple because
# summarize_results only reads the first element of every result.
# Note: this rebinds `results` again.
results = [("This is the first document text.",), ("This is the second document text.",)]

# Get and summarize search results
summary = summarize_results(results)
print("Summary:", summary)

In [None]:
def extract_text_from_pdf(pdf_path):
    """Best-effort extraction of the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open (read in binary mode).

    Returns:
        The extracted text of all pages concatenated in page order, or an
        empty string when the file cannot be opened or parsed, or when no
        text could be extracted. In those cases a message naming the file
        is printed instead of raising.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # `or ""` keeps the join safe if a page yields no text at all
            text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        # Deliberate best-effort boundary: one unreadable PDF should not
        # abort processing of the others. The path is reported so the
        # failing file can be identified.
        print(f"Error reading {pdf_path}: {e}")
        return ""
    if not text:
        print(f"Error reading {pdf_path}: No text found in the PDF")
    return text


In [None]:
def custom_search_engine(query):
    """Answer a query by retrieving matching chunks and summarizing them.

    Args:
        query: Free-text query string.

    Returns:
        The summary produced by ``summarize_results`` for the chunks that
        ``search`` returns for the query.
    """
    # Retrieve the best-matching chunks, then condense them with the local LLM
    top_chunks = search(query, word2vec_model, document_vectors, chunks)
    return summarize_results(top_chunks)

# Testing the workflow end to end: retrieve chunks for a sample query and
# print the generated summary.
query = "Explain the role of AI in media."
response = custom_search_engine(query)
print("Response:", response)
