In [None]:
# Install libraries
!pip install faiss-cpu sentence-transformers transformers




In [None]:
# Upload the knowledge-base file through Colab. `uploaded` is keyed by the
# name each file was saved under (the next cell prints those keys).
from google.colab import files
uploaded = files.upload()


Saving preprocessing_knowledge_base.json to preprocessing_knowledge_base (1).json


In [None]:
# Colab can save a re-uploaded file under a changed name (the recorded run
# shows a " (1)" suffix), so confirm the actual name before loading it.
print(uploaded.keys())  # Should display the uploaded file's name


dict_keys(['preprocessing_knowledge_base (1).json'])


In [None]:
import json

file_path = "preprocessing_knowledge_base (1).json"  # Replace with the correct file name

# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# load does not depend on the platform's default locale encoding.
with open(file_path, "r", encoding="utf-8") as file:
    knowledge_base = json.load(file)

# Verify the structure of the loaded knowledge base
print(knowledge_base.keys())  # Should display the top-level keys


dict_keys(['missing_values', 'outliers', 'scaling', 'encoding', 'domain_specific'])


In [None]:
def _flatten_knowledge_base(kb):
    """Flatten the nested knowledge base into one record per technique.

    The expected nesting is category -> sub_category -> technique -> details,
    where ``details`` is a dict. Any level that is not a dict is skipped
    rather than raising, and a technique without a non-empty "description"
    is skipped (with a message) because that text is what gets embedded.

    Args:
        kb: The parsed knowledge-base dict.

    Returns:
        A list of dicts with keys "content", "category", "sub_category",
        "technique", "when_to_use", "how_to_use" and "limitations"; the
        last three default to "N/A" when absent.
    """
    records = []
    for category, subcategories in kb.items():
        if not isinstance(subcategories, dict):
            continue
        for sub_category, techniques in subcategories.items():
            if not isinstance(techniques, dict):
                continue
            for technique, details in techniques.items():
                if not isinstance(details, dict):
                    continue
                description = details.get("description")
                if not description:
                    # Nothing meaningful to embed for this entry.
                    print(f"Skipping {category}/{sub_category}/{technique}: no description.")
                    continue
                records.append({
                    "content": description,
                    "category": category,
                    "sub_category": sub_category,
                    "technique": technique,
                    "when_to_use": details.get("when_to_use", "N/A"),
                    "how_to_use": details.get("how_to_use", "N/A"),
                    "limitations": details.get("limitations", "N/A"),
                })
    return records


documents = _flatten_knowledge_base(knowledge_base)

# Verify the flattened structure
print(f"Flattened {len(documents)} documents.")
print(documents[:3])  # Display the first 3 documents for verification


Flattened 8 documents.
[{'content': 'Replace missing numeric values with the mean of the column.', 'category': 'missing_values', 'sub_category': 'numeric', 'technique': 'mean_imputation', 'when_to_use': 'When the data is symmetrically distributed without significant outliers.', 'how_to_use': 'Use tools like SimpleImputer in scikit-learn or pandas fillna().', 'limitations': 'Not suitable for skewed distributions as it can distort data.'}, {'content': 'Replace missing numeric values with the median of the column.', 'category': 'missing_values', 'sub_category': 'numeric', 'technique': 'median_imputation', 'when_to_use': 'When the data is skewed or contains outliers.', 'how_to_use': "Use SimpleImputer with strategy='median' in scikit-learn.", 'limitations': 'May not capture the central tendency for multimodal distributions.'}, {'content': 'Fill missing values using the k-nearest neighbors algorithm.', 'category': 'missing_values', 'sub_category': 'numeric', 'technique': 'knn_imputation', '

In [None]:
# Pull out each document's "content" string, preserving document order,
# as the input for the embedding model
texts = list(map(lambda entry: entry["content"], documents))
print(f"Prepared {len(texts)} documents for embedding.")

Prepared 8 documents for embedding.


In [None]:
# Step 3: Generate Embeddings for Documents
from sentence_transformers import SentenceTransformer

# Load embedding model. The same `embedding_model` object is reused by the
# query cell, so documents and queries are encoded by one model.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the documents. Later cells read `.shape[1]` as the
# vector size and map FAISS result ids back into `texts`, i.e. they rely on
# row i of this array corresponding to texts[i].
document_embeddings = embedding_model.encode(texts)
print("Generated embeddings for the knowledge base.")


Generated embeddings for the knowledge base.


In [None]:
#Create and Save a FAISS Index
import faiss
import numpy as np

# FAISS expects a C-contiguous float32 matrix. Coerce explicitly instead of
# relying on whatever dtype/layout the encoder happened to return.
embedding_matrix = np.ascontiguousarray(document_embeddings, dtype=np.float32)

# Create FAISS index
dimension = embedding_matrix.shape[1]  # Size of embedding vector
index = faiss.IndexFlatL2(dimension)   # Use L2 distance metric
index.add(embedding_matrix)            # Row i of the index corresponds to texts[i]

# Save the FAISS index for reuse
faiss.write_index(index, "knowledge_base_index.faiss")
print("FAISS index created and saved.")

# Optional: Download the index
from google.colab import files
files.download("knowledge_base_index.faiss")


FAISS index created and saved.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Query the Knowledge Base
# Encode the query (FAISS expects a C-contiguous float32 matrix)
query = "How should I handle missing values in numeric data?"
query_embedding = np.ascontiguousarray(embedding_model.encode([query]), dtype=np.float32)

# Search the FAISS index
k = 3  # Number of top results
distances, indices = index.search(query_embedding, k)

# Retrieve the corresponding documents.
# When the index holds fewer than k vectors FAISS pads the result with id -1;
# without the filter, texts[-1] would silently return the last document.
retrieved_docs = [texts[i] for i in indices[0] if i >= 0]
print("Relevant Knowledge:")
for i, doc in enumerate(retrieved_docs, start=1):
    print(f"{i}. {doc}")

Relevant Knowledge:
1. Use domain-specific knowledge to impute missing values.
2. Replace missing numeric values with the mean of the column.
3. Replace missing numeric values with the median of the column.


In [None]:
!pip install optimum

Collecting optimum
  Downloading optimum-1.23.3-py3-none-any.whl.metadata (20 kB)
Collecting coloredlogs (from optimum)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting datasets (from optimum)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->optimum)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->optimum)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->optimum)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets->optimum)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.8.0->optimum)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading optimum-1.23.3-py3-none-any.whl (424 kB)
[2K   

In [None]:

!pip install transformers accelerate bitsandbytes auto-gptq


Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.2.1-py3-none-any.whl.metadata (3.0 kB)
Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gekko-1.2.1-py3-none-any.whl (13.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge, gekko, auto-gptq
Successfully installed auto-gptq-0.7.1 gekko-1.2.1 rouge-1.0.1


In [None]:
!pip install ctransformers


Collecting ctransformers
  Downloading ctransformers-0.2.27-py3-none-any.whl.metadata (17 kB)
Downloading ctransformers-0.2.27-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ctransformers
Successfully installed ctransformers-0.2.27


In [None]:
!wget https://huggingface.co/TheBloke/Tinyllama-2-1b-miniguanaco-GGUF/resolve/main/tinyllama-2-1b-miniguanaco.Q4_K_M.gguf?download=true -O tinyllama-2-1b-miniguanaco.Q4_K_M.gguf

--2024-11-18 15:02:39--  https://huggingface.co/TheBloke/Tinyllama-2-1b-miniguanaco-GGUF/resolve/main/tinyllama-2-1b-miniguanaco.Q4_K_M.gguf?download=true
Resolving huggingface.co (huggingface.co)... 13.35.210.61, 13.35.210.66, 13.35.210.114, ...
Connecting to huggingface.co (huggingface.co)|13.35.210.61|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/repos/f1/9b/f19bdf96e49f3feff6d726558f834c9f3fc8bf8505741527c079d3663b0455b0/56c7638cbc63f03895ac817c6b3d7fba31fc5c8765580cf51dc9e4832cdd56a0?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27tinyllama-2-1b-miniguanaco.Q4_K_M.gguf%3B+filename%3D%22tinyllama-2-1b-miniguanaco.Q4_K_M.gguf%22%3B&Expires=1732201359&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMjIwMTM1OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9mMS85Yi9mMTliZGY5NmU0OWYzZmVmZjZkNzI2NTU4ZjgzNGM5ZjNmYzhiZjg1MDU3NDE1MjdjMDc5ZDM2NjNiMDQ1NWIwLzU2Yzc2MzhjYm

In [None]:
#Generate Responses with TinyLLaMA (GGUF)
from ctransformers import AutoModelForCausalLM

# Download GGUF Model
!wget https://huggingface.co/TheBloke/Tinyllama-2-1b-miniguanaco-GGUF/resolve/main/tinyllama-2-1b-miniguanaco.Q4_K_M.gguf -O tinyllama-gguf-model.gguf

# Load the GGUF model
model_path = "tinyllama-gguf-model.gguf"
model = AutoModelForCausalLM.from_pretrained(model_path, model_type="llama")

# Generate response
input_text = f"Question: {query}\n\nRelevant Knowledge:\n" + "\n".join(retrieved_docs)
response = model(input_text, max_new_tokens=300)
print("\nGenerated Response:")
print(response)

--2024-11-19 14:48:00--  https://huggingface.co/TheBloke/Tinyllama-2-1b-miniguanaco-GGUF/resolve/main/tinyllama-2-1b-miniguanaco.Q4_K_M.gguf
Resolving huggingface.co (huggingface.co)... 3.165.160.59, 3.165.160.11, 3.165.160.12, ...
Connecting to huggingface.co (huggingface.co)|3.165.160.59|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.hf.co/repos/f1/9b/f19bdf96e49f3feff6d726558f834c9f3fc8bf8505741527c079d3663b0455b0/56c7638cbc63f03895ac817c6b3d7fba31fc5c8765580cf51dc9e4832cdd56a0?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27tinyllama-2-1b-miniguanaco.Q4_K_M.gguf%3B+filename%3D%22tinyllama-2-1b-miniguanaco.Q4_K_M.gguf%22%3B&Expires=1732286880&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMjI4Njg4MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9yZXBvcy9mMS85Yi9mMTliZGY5NmU0OWYzZmVmZjZkNzI2NTU4ZjgzNGM5ZjNmYzhiZjg1MDU3NDE1MjdjMDc5ZDM2NjNiMDQ1NWIwLzU2Yzc2MzhjYmM2M2YwMzg5NWFjODE3Y

In [None]:
# Automate Preprocessing Tasks
import pandas as pd

# Prompt for a CSV upload, then read the first uploaded file into a DataFrame
uploaded_dataset = files.upload()
dataset_path = list(uploaded_dataset)[0]
df = pd.read_csv(dataset_path)

# Automate preprocessing based on LLM response
def apply_preprocessing(df, response):
    """Apply the missing-value strategy named in the LLM response.

    The response is matched case-insensitively against three phrases, checked
    in this order; only the first match is applied:

    * "mean imputation"     -> fill NaNs with each numeric column's mean
    * "drop missing values" -> drop every row that contains a NaN
    * "median imputation"   -> fill NaNs with each numeric column's median

    Args:
        df: DataFrame to clean. It is modified in place.
        response: Text generated by the model.

    Returns:
        The same ``df`` object. It is left unchanged when no phrase matches.
    """
    instruction = response.lower()
    if "mean imputation" in instruction:
        # numeric_only=True: without it, recent pandas raises TypeError as soon
        # as the frame contains a non-numeric (e.g. string) column.
        df.fillna(df.mean(numeric_only=True), inplace=True)
    elif "drop missing values" in instruction:
        df.dropna(inplace=True)
    elif "median imputation" in instruction:
        df.fillna(df.median(numeric_only=True), inplace=True)
    return df

# Apply whichever strategy the LLM response names. If the response matches
# none of the phrases apply_preprocessing checks for, the data is returned
# (and therefore saved) unchanged.
df = apply_preprocessing(df, response)
# index=False keeps the DataFrame index out of the CSV.
df.to_csv("cleaned_dataset.csv", index=False)
# Hand the cleaned file to Colab's download helper.
files.download("cleaned_dataset.csv")
print("Preprocessing complete. Cleaned dataset saved.")

Saving fraud_data.csv to fraud_data (1).csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Preprocessing complete. Cleaned dataset saved.


In [None]:
!zip -r rag-data-preprocessing.zip .


  adding: .config/ (stored 0%)
  adding: .config/config_sentinel (stored 0%)
  adding: .config/default_configs.db (deflated 98%)
  adding: .config/active_config (stored 0%)
  adding: .config/gce (stored 0%)
  adding: .config/hidden_gcloud_config_universe_descriptor_data_cache_configs.db (deflated 97%)
  adding: .config/.last_update_check.json (deflated 22%)
  adding: .config/.last_survey_prompt.yaml (stored 0%)
  adding: .config/.last_opt_in_prompt.yaml (stored 0%)
  adding: .config/configurations/ (stored 0%)
  adding: .config/configurations/config_default (deflated 15%)
  adding: .config/logs/ (stored 0%)
  adding: .config/logs/2024.11.15/ (stored 0%)
  adding: .config/logs/2024.11.15/14.18.45.909469.log (deflated 93%)
  adding: .config/logs/2024.11.15/14.19.07.543552.log (deflated 58%)
  adding: .config/logs/2024.11.15/14.19.18.989337.log (deflated 85%)
  adding: .config/logs/2024.11.15/14.19.20.453949.log (deflated 58%)
  adding: .config/logs/2024.11.15/14.19.33.079327.log (deflate