In [None]:
"""The aim of this project is to build a chatbot that lets users chat with GitHub repositories.
It involves the following steps:
1. Processing the repository files
2. Saving the embeddings in a Deep Lake database
3. Retrieving from the database based on the user's query"""

# Processing the Repository Files

In [2]:
import subprocess

def clone_repo(repo_url, local_path):
    """Clone a git repository into ``local_path``.

    The clone is skipped when ``local_path`` already contains a ``.git``
    directory, so re-running the cell does not fail on an existing checkout.

    Args:
        repo_url: URL (or path) of the repository handed to ``git clone``.
        local_path: Destination directory for the working copy.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero
            status.
        FileNotFoundError: If the ``git`` executable cannot be found.
    """
    import os  # local import: this cell only imports subprocess at the top

    if os.path.isdir(os.path.join(local_path, '.git')):
        # Already cloned; a second `git clone` into a non-empty directory fails.
        return

    # check=True surfaces a failed clone here instead of letting later cells
    # silently work on an empty directory.
    subprocess.run(['git', 'clone', repo_url, local_path], check=True)


# Repository to index and the local directory it is cloned into.
repo_url = "https://github.com/Instein125/Speech-Denoiser-using-Deep-Learning"
local_path = 'repos/'

# Clone repo_url into local_path.
clone_repo(repo_url, local_path)

In [11]:
import os
from langchain.document_loaders import TextLoader

def load_docs(root_dir, file_extensions = None):
    """
    Load documents from the non-hidden files under ``root_dir``.

    Hidden files and hidden directories (names starting with ".") are skipped,
    so repository metadata and tool caches such as ``.git`` or
    ``.ipynb_checkpoints`` are never walked or loaded.

    Args:
        root_dir: Directory to walk recursively.
        file_extensions: Optional collection of extensions including the
            leading dot (e.g. ``['.md', '.py']``). When given, only files
            whose extension is in the collection are loaded; when ``None``
            or empty, every non-hidden file is loaded.

    Returns:
        A list with the results of ``TextLoader(...).load_and_split()`` for
        every selected file, in sorted traversal order.

    Raises:
        Whatever ``TextLoader`` raises for a file it cannot read; errors are
        not caught here.
    """

    docs = []

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Prune hidden directories in place so os.walk never descends into
        # them; sorting keeps the traversal (and document) order deterministic.
        dirnames[:] = sorted(d for d in dirnames if not d.startswith("."))

        for file in sorted(filenames):
            # Skip dotfiles
            if file.startswith("."):
                continue

            if file_extensions and os.path.splitext(file)[1] not in file_extensions:
                continue

            file_path = os.path.join(dirpath, file)
            loader = TextLoader(file_path, encoding='utf-8')
            docs.extend(loader.load_and_split())

    return docs

# Directory walked for documents; same path as the clone target ('repos/').
root_dir = "repos/"
# Only files with one of these extensions are loaded.
file_extensions=['.md', '.txt', '.py']
docs = load_docs(root_dir, file_extensions)


In [12]:
# Summarise the load instead of dumping every document into the cell output.
print(f"Loaded {len(docs)} documents")
print(docs[:3])



In [13]:
from langchain.text_splitter import CharacterTextSplitter

def split_docs(docs, chunk_size=1000, chunk_overlap=0):
    """Split the documents into chunks.

    Args:
        docs: Documents to split, as accepted by
            ``CharacterTextSplitter.split_documents``.
        chunk_size: Target chunk size passed to the splitter. The splitter may
            still emit larger chunks (it logs "Created a chunk of size ...,
            which is longer than the specified ..." when it does).
        chunk_overlap: Overlap between consecutive chunks passed to the
            splitter.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_documents(docs)


# Chunk the loaded documents; these chunks are what gets embedded and stored later.
texts = split_docs(docs)

Created a chunk of size 1027, which is longer than the specified 1000
Created a chunk of size 1137, which is longer than the specified 1000
Created a chunk of size 1366, which is longer than the specified 1000
Created a chunk of size 1018, which is longer than the specified 1000
Created a chunk of size 1297, which is longer than the specified 1000


# Saving the Embeddings

In [16]:
from langchain.vectorstores import DeepLake
from langchain.embeddings import GooglePalmEmbeddings

def create_database_and_store_texts(dataset_path, texts, overwrite=False):
    """Open (or create) a Deep Lake dataset and add ``texts`` to it.

    The documents are embedded with ``GooglePalmEmbeddings`` and added on top
    of whatever the dataset at ``dataset_path`` already holds, so calling this
    again with the same inputs adds the documents again unless ``overwrite``
    is set.

    Args:
        dataset_path: Deep Lake dataset location (e.g. a ``hub://org/name``
            path).
        texts: Documents to embed and store via ``add_documents``.
        overwrite: When true, ``overwrite=True`` is forwarded to ``DeepLake``
            so an existing dataset is replaced rather than appended to.
            Defaults to ``False``, which leaves the original call unchanged.

    Returns:
        The ``DeepLake`` vector store instance.
    """
    embeddings = GooglePalmEmbeddings()
    # Forward `overwrite` only when requested so the default call is exactly
    # the one the notebook made before.
    extra_kwargs = {"overwrite": True} if overwrite else {}
    db = DeepLake(dataset_path, embedding=embeddings, **extra_kwargs)
    db.add_documents(texts)

    return db

# Activeloop organization id and dataset name that make up the hub:// dataset path.
# NOTE(review): the organization id is hard-coded; presumably it must match the
# Activeloop account whose credentials are available at runtime — confirm before running.
my_activeloop_org_id = "samman"
my_activeloop_dataset_name = "langchain_course_chat_with_gh"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

# Embed the chunks and store them in the dataset; `db` is reused by the retrieval cells.
db = create_database_and_store_texts(dataset_path, texts)

Deep Lake Dataset in hub://samman/langchain_course_chat_with_gh already exists, loading from the storage


Creating 58 embeddings in 1 batches of size 58:: 100%|██████████| 1/1 [01:27<00:00, 87.25s/it]

Dataset(path='hub://samman/langchain_course_chat_with_gh', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
 embedding  embedding  (58, 768)  float32   None   
    id        text      (58, 1)     str     None   
 metadata     json      (58, 1)     str     None   
   text       text      (58, 1)     str     None   





# Retrieving from Database

In [25]:
from langchain_google_genai import GoogleGenerativeAI
from langchain.chains import RetrievalQA

def search_db(db, query):
    """Answer ``query`` with a RetrievalQA chain backed by the vector store ``db``.

    A retriever is obtained from ``db``, its ``search_kwargs`` are filled in,
    and a ``gemini-pro`` model (temperature 0) is combined with it through
    ``RetrievalQA.from_llm``.

    Args:
        db: Vector store exposing ``as_retriever()`` (the DeepLake instance in
            this notebook).
        query: Question to run through the chain.

    Returns:
        Whatever ``RetrievalQA.run`` returns for ``query``.
    """
    doc_retriever = db.as_retriever()

    # Search settings stored on the retriever; how each key is interpreted is
    # up to the vector store's search implementation.
    doc_retriever.search_kwargs.update({
        "distance_metric": "cos",
        "fetch_k": 100,
        "maximal_marginal_relevance": True,
        "k": 10,
    })

    llm = GoogleGenerativeAI(model='gemini-pro', temperature=0)

    # Wire the model and the retriever together, then run the query.
    chain = RetrievalQA.from_llm(llm, retriever=doc_retriever)
    return chain.run(query)


In [26]:
# Example question answered from the indexed repository via search_db.
query = "Explain architecture of model used in this project"
response = search_db(db, query)
print(response)

The model used in this project is a UNet architecture, which is a convolutional neural network (CNN) commonly used for image segmentation. It has been adapted here for denoising audio signals. The UNet architecture consists of a downsampling path and an upsampling path. The downsampling path extracts features from the input audio spectrograms, while the upsampling path reconstructs the denoised audio signal spectrograms. Skip connections are formed by concatenating the upsampled layer with the corresponding downsampling block's output. This structure enables the model to retain fine-grained details during reconstruction. The final layer of the UNet architecture contains a single filter with a sigmoid activation function, making it suitable for binary classification tasks, which is appropriate for the denoising objective. The model outputs denoised log spectrograms.
