In [None]:
import torch
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path
from transformers import AutoTokenizer, TextStreamer, pipeline
from transformers import AutoModelForCausalLM
from langchain_community.document_loaders.csv_loader import CSVLoader

In [None]:
# Load the summary-statistics CSV through LangChain's CSVLoader.
# (CSVLoader is already imported in the first cell; this re-import is redundant.)
from langchain_community.document_loaders.csv_loader import CSVLoader
loader = CSVLoader(file_path="Knowledge/categorical/xls/Combined_Combined_combined_data (1).csv/csv/summary_statistics.csv")

data = loader.load()

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# NOTE(review): this rebinds `loader` and `data`. When the cells run top to bottom,
# the CSV documents loaded in the previous cell are discarded and the splitter below
# receives the PDF pages instead -- confirm which source is intended.
loader = PyPDFLoader("Bussiness_facility/longnet_paper.pdf")

data = loader.load()

In [None]:
# Embedding model used to index the document chunks; requested on the CUDA device.
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl", model_kwargs={"device": "cuda"}
)

In [None]:
# Split the loaded documents into chunks (chunk_size=1024, chunk_overlap=64)
# and display how many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(data)
len(texts)

In [None]:
# NOTE(review): rebinds `Chroma`, which the first cell imported from
# langchain.vectorstores, to the langchain_community implementation.
from langchain_community.vectorstores import Chroma
# Build the vector store from the chunks using the embedding model defined above.
db = Chroma.from_documents(texts, embeddings)

In [None]:
import os

model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
model_basename = "model"

# SECURITY: never commit an access token to a notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, None is passed and the library
# falls back to any locally cached Hugging Face login.
# The token that used to be hard-coded here must be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")

# `token` is the credential argument documented for from_pretrained; the previous
# `auth_token` keyword is not a documented credential parameter.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, token=hf_token)

# The model download needs the same credential as the tokenizer download.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    torch_dtype=torch.float16,
    token=hf_token,
)

In [None]:
# Default system message: two paragraphs separated by a blank line.
DEFAULT_SYSTEM_PROMPT = (
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. "
    "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
    "Please ensure that your responses are socially unbiased and positive in nature."
    "\n\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. "
    "If you don't know the answer to a question, please don't share false information."
)


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    """Wrap a user prompt and a system prompt in the ``[INST] <<SYS>> ... [/INST]`` layout.

    Args:
        prompt: Text placed after the system block, directly before ``[/INST]``.
        system_prompt: Text placed between ``<<SYS>>`` and ``<</SYS>>``.

    Returns:
        The assembled prompt string with no leading or trailing newline.
    """
    system_block = f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>"
    return f"{system_block}\n\n{prompt} [/INST]"

In [None]:
# Experimental override: redefines DEFAULT_SYSTEM_PROMPT and generate_prompt,
# replacing the definitions from the previous cell once this cell is run.

DEFAULT_SYSTEM_PROMPT = "#"


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    """Assemble an ``[INST] <<SYS>> ... <</SYS>> ... [/INST]`` prompt string.

    Args:
        prompt: User-side text, emitted right before the closing ``[/INST]``.
        system_prompt: Text emitted between the ``<<SYS>>`` markers.

    Returns:
        The five prompt lines joined by newlines.
    """
    lines = (
        "[INST] <<SYS>>",
        f"{system_prompt}",
        "<</SYS>>",
        "",
        f"{prompt} [/INST]",
    )
    return "\n".join(lines)

In [None]:
# Streamer handed to the generation pipeline; configured to leave the prompt and
# special tokens out of the streamed text.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [None]:
# Text-generation pipeline around the loaded model and tokenizer.
# Generation settings are fixed here (max_new_tokens, temperature, top_p,
# repetition_penalty) and output is routed through `streamer`.
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    temperature=0.5,
    top_p=0.95,
    repetition_penalty=1.15,
    streamer=streamer,
)

In [None]:
# Expose the transformers pipeline as a LangChain LLM.
# NOTE(review): temperature is 0.6 here but 0.5 in the pipeline() call above --
# confirm which value is intended and which one actually takes effect.
llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0.6})

In [None]:
# System message for the retrieval QA prompt. Written line by line so that the
# leading/trailing newlines and the trailing spaces of the text stay visible.
SYSTEM_PROMPT = (
    "\n"
    "As a seasoned Data Scientist, your role is to provide a clear and concise summery statistics of the dataset \n"
    "based on user prompts,  ensuring a focus on relevant insights. \n"
    "Please don't provide the false information.\n"
)

# User-side part of the prompt; {context} and {question} stay as literal
# placeholders to be filled in later by the prompt template.
template = generate_prompt(
    "\n{context}\n\nQuestion: {question}\n",
    system_prompt=SYSTEM_PROMPT,
)

In [None]:
# Prompt template with the two placeholders contained in `template`.
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [None]:
# Retrieval QA chain: "stuff" chain type, retriever over `db` with k=2,
# source documents included in the result, custom prompt passed through.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

In [None]:
# Keep the question in its own variable. Rebinding `prompt` to a plain string here
# would replace the PromptTemplate that the chain-building cell passes via
# chain_type_kwargs, so re-running that cell afterwards would break.
question = "can you able to tell me the summary statistic"
result = qa_chain(question)

# With Multiple files

In [None]:
import torch
from langchain import HuggingFacePipeline, PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from pdf2image import convert_from_path
from transformers import AutoTokenizer, TextStreamer, pipeline

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders.csv_loader import CSVLoader
# Load every file matching **/*.csv under the directory, each through CSVLoader.
loader = DirectoryLoader("Knowledge/categorical/xls/Combined_combined_data.csv/csv", glob='**/*.csv', loader_cls=CSVLoader)

data = loader.load()

In [None]:
# Number of documents returned by the directory loader.
len(data)

In [None]:
# Embedding model used to index the document chunks; requested on the CUDA device.
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-xl", model_kwargs={"device": "cuda"}
)

In [None]:
# Split the loaded documents into chunks (chunk_size=1024, chunk_overlap=64)
# and display how many chunks were produced.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
texts = text_splitter.split_documents(data)
len(texts)

In [None]:
# NOTE(review): rebinds `Chroma`, imported from langchain.vectorstores at the top
# of this section, to the langchain_community implementation.
from langchain_community.vectorstores import Chroma
# Build the vector store from the chunks using the embedding model defined above.
db = Chroma.from_documents(texts, embeddings)

In [None]:
import os

# This section's import cell does not import AutoModelForCausalLM; import it here
# so the cell does not depend on the first section having been run.
from transformers import AutoModelForCausalLM

model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
model_basename = "model"

# SECURITY: never commit an access token to a notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, None is passed and the library
# falls back to any locally cached Hugging Face login.
# The token that used to be hard-coded here must be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN")

# `token` is the credential argument documented for from_pretrained; the previous
# `auth_token` keyword is not a documented credential parameter.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, token=hf_token)

# The model download needs the same credential as the tokenizer download.
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    torch_dtype=torch.float16,
    token=hf_token,
)

In [None]:
# Experimental prompt helper for this section: the system prompt defaults to "#".

DEFAULT_SYSTEM_PROMPT = "#"

# Fixed layout of the prompt; the two slots are system prompt and user prompt.
_PROMPT_LAYOUT = "[INST] <<SYS>>\n{}\n<</SYS>>\n\n{} [/INST]"


def generate_prompt(prompt: str, system_prompt: str = DEFAULT_SYSTEM_PROMPT) -> str:
    """Fill the ``[INST] <<SYS>> ... [/INST]`` layout with the given texts.

    Args:
        prompt: User-side text placed before ``[/INST]``.
        system_prompt: Text placed between ``<<SYS>>`` and ``<</SYS>>``.

    Returns:
        The formatted prompt string.
    """
    return _PROMPT_LAYOUT.format(system_prompt, prompt)

In [None]:
# Streamer handed to the generation pipeline; configured to leave the prompt and
# special tokens out of the streamed text.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [None]:
# Text-generation pipeline around the loaded model and tokenizer.
# Generation settings are fixed here (max_new_tokens, temperature, top_p,
# repetition_penalty) and output is routed through `streamer`.
text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1024,
    temperature=0.5,
    top_p=0.95,
    repetition_penalty=1.15,
    streamer=streamer,
)

In [None]:
# Expose the transformers pipeline as a LangChain LLM.
# NOTE(review): temperature is 0.6 here but 0.5 in the pipeline() call above --
# confirm which value is intended and which one actually takes effect.
llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0.6})

In [None]:
# System message for the retrieval QA prompt, assembled from its individual lines
# (each piece carries its own trailing space / newline exactly as in the text).
_system_lines = [
    "",
    "As a seasoned Data Scientist, your role is to provide a clear and concise summery statistics of the dataset ",
    "based on user prompts,  ensuring a focus on relevant insights. ",
    "Please don't provide the false information.",
    "",
]
SYSTEM_PROMPT = "\n".join(_system_lines)
del _system_lines  # helper only; keep the notebook namespace unchanged

# {context} and {question} remain literal placeholders for the prompt template.
template = generate_prompt(
    "\n" + "{context}" + "\n\n" + "Question: {question}" + "\n",
    system_prompt=SYSTEM_PROMPT,
)

In [None]:
# Prompt template with the two placeholders contained in `template`.
prompt = PromptTemplate(template=template, input_variables=["context", "question"])

In [None]:
# Retrieval QA chain: "stuff" chain type, retriever over `db` with k=2,
# source documents included in the result, custom prompt passed through.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=db.as_retriever(search_kwargs={"k": 2}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt},
)

In [None]:
# Keep the question in its own variable. Rebinding `prompt` to a plain string here
# would replace the PromptTemplate that the chain-building cell passes via
# chain_type_kwargs, so re-running that cell afterwards would break.
question = "can you able to tell me the summary statistic"
result = qa_chain(question)

# OOP (class-based) version

In [None]:
import os
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from llama_setup import LlamaLanguageModel
import torch
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
class Chatbot:
    """Retrieval-augmented question answering over CSV documents.

    Intended call order: ``load_summary_statistics`` -> ``setup_chroma_db`` ->
    ``setup_qa_chain`` -> ``apply_rag``.
    """

    # System prompt used by ``generate_prompt`` when the caller supplies none.
    DEFAULT_SYSTEM_PROMPT = (
        "As a seasoned Data Scientist, your role is to provide a clear and concise "
        "summary statistics of the dataset\n"
        "based on user prompts, ensuring a focus on relevant insights.\n"
        "Please don't provide the false information."
    )

    def __init__(self):
        """Create the LLM and the prompt template; store and chain are built later."""
        self.data = []  # documents accumulated by load_summary_statistics
        self.llm = LlamaLanguageModel().llm
        self.qa_chain = None  # set by setup_qa_chain
        self.db = None  # set by setup_chroma_db
        self.template = self.generate_prompt("{context}\nQuestion: {question}")
        self.prompt = PromptTemplate(template=self.template, input_variables=["context", "question"])

    def load_data_from_csv(self, file_path):
        """Load a CSV file into documents.

        Args:
            file_path: Path of the CSV file.

        Returns:
            The documents returned by ``CSVLoader.load``; an empty list (after
            printing a warning) when the file does not exist.
        """
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}")
            return []
        return CSVLoader(file_path=file_path).load()

    def load_summary_statistics(self, file_path):
        """Append the documents loaded from ``file_path`` to ``self.data``."""
        self.data.extend(self.load_data_from_csv(file_path))

    def setup_qa_chain(self):
        """Build the RetrievalQA chain on top of the vector store.

        Raises:
            RuntimeError: If ``setup_chroma_db`` has not been called yet.
        """
        if self.db is None:
            raise RuntimeError("Vector store is not initialised; call setup_chroma_db() first.")
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.db.as_retriever(search_kwargs={"k": 2}),
            return_source_documents=True,
            chain_type_kwargs={"prompt": self.prompt},
        )

    def apply_rag(self, question):
        """Run ``question`` through the QA chain and return the chain's result.

        Raises:
            RuntimeError: If ``setup_qa_chain`` has not been called yet.
        """
        if self.qa_chain is None:
            raise RuntimeError("QA chain is not initialised; call setup_qa_chain() first.")
        # Previously the result was dropped, so callers always received None.
        return self.qa_chain(question)

    def generate_prompt(self, prompt: str, system_prompt: str = None) -> str:
        """Wrap ``prompt`` in the ``[INST] <<SYS>> ... [/INST]`` layout.

        Args:
            prompt: User-side text placed before ``[/INST]``.
            system_prompt: Text for the ``<<SYS>>`` block; falls back to
                ``DEFAULT_SYSTEM_PROMPT`` when ``None``.

        Returns:
            The assembled prompt without method-level indentation leaking into it.
        """
        if system_prompt is None:
            system_prompt = self.DEFAULT_SYSTEM_PROMPT
        return f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]"

    def setup_chroma_db(self):
        """Chunk ``self.data`` and index the chunks in a Chroma vector store."""
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
        texts = text_splitter.split_documents(self.data)
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", model_kwargs={"device": "cuda"})
        self.db = Chroma.from_documents(texts, embeddings)


In [None]:
# NOTE(review): this import rebinds `Chatbot` to the class from the external
# `chatbot` module, replacing the class defined in the previous cell.
from chatbot import Chatbot
# Example usage
chatbot = Chatbot()

# Load summary statistics data
summary_statistics_file_path = "Knowledge/categorical/xls/Combined_Combined_combined_data (1).csv/csv/summary_statistics.csv"
chatbot.load_summary_statistics(summary_statistics_file_path)

# Setup Chroma DB
chatbot.setup_chroma_db()

# Setup QA chain for RAG
chatbot.setup_qa_chain()

# Interactive chat loop, currently disabled:
# while True:
#     user_input = input("You: ")
#     if user_input.lower() == "exit":
#         print("Chat ended.")
#         break
#     else:
#         answer = chatbot.apply_rag(user_input)

In [None]:
import os
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, pipeline
from langchain import HuggingFacePipeline
from langchain_community.document_loaders.csv_loader import CSVLoader
import torch
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings

class Chatbot:
    """Retrieval-augmented question answering over CSV documents.

    This variant builds its own Llama-2 pipeline in ``setup_language_model``.
    Intended call order: ``load_summary_statistics`` -> ``setup_chroma_db`` ->
    ``setup_qa_chain`` -> ``apply_rag``.
    """

    # System prompt used by ``generate_prompt`` when the caller supplies none.
    DEFAULT_SYSTEM_PROMPT = (
        "As a seasoned Data Scientist, your role is to provide a clear and concise "
        "summary statistics of the dataset\n"
        "based on user prompts, ensuring a focus on relevant insights.\n"
        "Please don't provide the false information."
    )

    def __init__(self):
        """Create the LLM and the prompt template; store and chain are built later."""
        self.data = []  # documents accumulated by load_summary_statistics
        self.llm = self.setup_language_model()
        self.qa_chain = None  # set by setup_qa_chain
        self.db = None  # set by setup_chroma_db
        self.template = self.generate_prompt("{context}\nQuestion: {question}")
        self.prompt = PromptTemplate(template=self.template, input_variables=["context", "question"])

    def load_data_from_csv(self, file_path):
        """Load a CSV file into documents.

        Args:
            file_path: Path of the CSV file.

        Returns:
            The documents returned by ``CSVLoader.load``; an empty list (after
            printing a warning) when the file does not exist.
        """
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}")
            return []
        return CSVLoader(file_path=file_path).load()

    def load_summary_statistics(self, file_path):
        """Append the documents loaded from ``file_path`` to ``self.data``."""
        self.data.extend(self.load_data_from_csv(file_path))

    def setup_qa_chain(self):
        """Build the RetrievalQA chain on top of the vector store.

        Raises:
            RuntimeError: If ``setup_chroma_db`` has not been called yet.
        """
        if self.db is None:
            raise RuntimeError("Vector store is not initialised; call setup_chroma_db() first.")
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.db.as_retriever(search_kwargs={"k": 2}),
            return_source_documents=True,
            chain_type_kwargs={"prompt": self.prompt},
        )

    def apply_rag(self, question):
        """Run ``question`` through the QA chain and return the chain's result.

        Raises:
            RuntimeError: If ``setup_qa_chain`` has not been called yet.
        """
        if self.qa_chain is None:
            raise RuntimeError("QA chain is not initialised; call setup_qa_chain() first.")
        # Previously the result was dropped, so callers always received None.
        return self.qa_chain(question)

    def generate_prompt(self, prompt: str, system_prompt: str = None) -> str:
        """Wrap ``prompt`` in the ``[INST] <<SYS>> ... [/INST]`` layout.

        Args:
            prompt: User-side text placed before ``[/INST]``.
            system_prompt: Text for the ``<<SYS>>`` block; falls back to
                ``DEFAULT_SYSTEM_PROMPT`` when ``None``.

        Returns:
            The assembled prompt without method-level indentation leaking into it.
        """
        if system_prompt is None:
            system_prompt = self.DEFAULT_SYSTEM_PROMPT
        return f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]"

    def setup_chroma_db(self):
        """Chunk ``self.data`` and index the chunks in a Chroma vector store."""
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
        texts = text_splitter.split_documents(self.data)
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", model_kwargs={"device": "cuda"})
        self.db = Chroma.from_documents(texts, embeddings)

    def setup_language_model(self):
        """Load Llama-2-7b-chat and wrap it as a LangChain ``HuggingFacePipeline``.

        The Hugging Face access token is read from the ``HF_TOKEN`` environment
        variable instead of being hard-coded; when unset, ``None`` is passed and
        the library falls back to any locally cached login. The token that used
        to be embedded here must be treated as leaked and revoked.

        Returns:
            The ``HuggingFacePipeline`` wrapping the text-generation pipeline.
        """
        model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
        hf_token = os.environ.get("HF_TOKEN")
        # `token` is the documented credential argument; both downloads need it.
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True, token=hf_token)
        model = AutoModelForCausalLM.from_pretrained(
            model_name_or_path,
            device_map="auto",
            torch_dtype=torch.float16,
            token=hf_token,
        )
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        text_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=1024,
            temperature=0.5,
            top_p=0.95,
            repetition_penalty=1.15,
            streamer=streamer,
        )
        return HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0.6})


In [None]:
# NOTE(review): this import rebinds `Chatbot` to the class from the external
# `chatbot` module, replacing the class defined in the previous cell.
from chatbot import Chatbot
chatbot = Chatbot()
# Load the summary-statistics CSV, then index it.
summary_statistics_file_path = "Knowledge/categorical/xls/Combined_Combined_combined_data (1).csv/csv/summary_statistics.csv"
chatbot.load_summary_statistics(summary_statistics_file_path)
chatbot.setup_chroma_db()

# Setup QA chain for RAG
chatbot.setup_qa_chain()


In [None]:
# from chatbot import Chatbot
import os

from Llama import LlamaInference
# Example usage
# chatbot = Chatbot()
# SECURITY: the Hugging Face token is read from the HF_TOKEN environment variable
# instead of being hard-coded (raises KeyError when the variable is not set).
# The token that used to be embedded here must be treated as leaked and revoked.
llama_inference = LlamaInference(os.environ["HF_TOKEN"])
# Load summary statistics data
summary_statistics_file_path = "Knowledge/categorical/xls/Combined_Combined_combined_data (1).csv/csv/summary_statistics.csv"
llama_inference.load_summary_statistics(summary_statistics_file_path)

# Setup Chroma DB
llama_inference.setup_chroma_db()

# Setup QA chain for RAG
llama_inference.setup_qa_chain()

# Interactive chat loop, currently disabled:
# while True:
#     user_input = input("You: ")
#     if user_input.lower() == "exit":
#         print("Chat ended.")
#         break
#     else:
#         answer = chatbot.apply_rag(user_input)

In [None]:
# Ask one question through the chatbot's RAG chain.
# NOTE(review): uses `chatbot`, not the `llama_inference` object built in the
# previous cell -- confirm this is the intended object.
user_input =  "tell me the bg value mean"
answer=chatbot.apply_rag(user_input)

In [None]:
# NOTE(review): none of the Chatbot classes defined in this notebook has a
# generate_questions method; this presumably relies on a Chatbot imported from an
# external module -- confirm, otherwise this cell raises AttributeError.
prompt = "tell me the risk mAXvalue"
# Generate questions based on the data from the CSV file
generated_questions = chatbot.generate_questions(prompt)

# With the llama_setup_new file

In [None]:
import os
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from llama_setup_new import LlamaLanguageModel
import torch
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
class Chatbot:
    """Retrieval-augmented question answering over CSV documents.

    The LLM comes from ``LlamaLanguageModel`` (imported from ``llama_setup_new``).
    Intended call order: ``load_summary_statistics`` -> ``setup_chroma_db`` ->
    ``setup_qa_chain`` -> ``apply_rag``.
    """

    # System prompt used by ``generate_prompt`` when the caller supplies none.
    DEFAULT_SYSTEM_PROMPT = (
        "As a seasoned Data Scientist, your role is to provide a clear and concise "
        "summary statistics of the dataset\n"
        "based on user prompts, ensuring a focus on relevant insights.\n"
        "Please don't provide the false information."
    )

    def __init__(self):
        """Create the LLM and the prompt template; store and chain are built later."""
        self.data = []  # documents accumulated by load_summary_statistics
        self.llm = LlamaLanguageModel().llm
        self.qa_chain = None  # set by setup_qa_chain
        self.db = None  # set by setup_chroma_db
        self.template = self.generate_prompt("{context}\nQuestion: {question}")
        self.prompt = PromptTemplate(template=self.template, input_variables=["context", "question"])

    def load_data_from_csv(self, file_path):
        """Load a CSV file into documents.

        Args:
            file_path: Path of the CSV file.

        Returns:
            The documents returned by ``CSVLoader.load``; an empty list (after
            printing a warning) when the file does not exist.
        """
        if not os.path.exists(file_path):
            print(f"Warning: File not found at {file_path}")
            return []
        return CSVLoader(file_path=file_path).load()

    def load_summary_statistics(self, file_path):
        """Append the documents loaded from ``file_path`` to ``self.data``."""
        self.data.extend(self.load_data_from_csv(file_path))

    def setup_qa_chain(self):
        """Build the RetrievalQA chain on top of the vector store.

        Raises:
            RuntimeError: If ``setup_chroma_db`` has not been called yet.
        """
        if self.db is None:
            raise RuntimeError("Vector store is not initialised; call setup_chroma_db() first.")
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.db.as_retriever(search_kwargs={"k": 2}),
            return_source_documents=True,
            chain_type_kwargs={"prompt": self.prompt},
        )

    def apply_rag(self, question):
        """Run ``question`` through the QA chain and return the chain's result.

        Raises:
            RuntimeError: If ``setup_qa_chain`` has not been called yet.
        """
        if self.qa_chain is None:
            raise RuntimeError("QA chain is not initialised; call setup_qa_chain() first.")
        # Previously the result was dropped, so callers always received None.
        return self.qa_chain(question)

    def generate_prompt(self, prompt: str, system_prompt: str = None) -> str:
        """Wrap ``prompt`` in the ``[INST] <<SYS>> ... [/INST]`` layout.

        Args:
            prompt: User-side text placed before ``[/INST]``.
            system_prompt: Text for the ``<<SYS>>`` block; falls back to
                ``DEFAULT_SYSTEM_PROMPT`` when ``None``.

        Returns:
            The assembled prompt without method-level indentation leaking into it.
        """
        if system_prompt is None:
            system_prompt = self.DEFAULT_SYSTEM_PROMPT
        return f"[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{prompt} [/INST]"

    def setup_chroma_db(self):
        """Chunk ``self.data`` and index the chunks in a Chroma vector store."""
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=64)
        texts = text_splitter.split_documents(self.data)
        embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-xl", model_kwargs={"device": "cuda"})
        self.db = Chroma.from_documents(texts, embeddings)


In [None]:
# Build the chatbot, feed it the summary-statistics CSV, index it, wire the chain.
chatbot = Chatbot()

summary_statistics_file_path = "Knowledge/categorical/xls/Combined_Combined_combined_data (1).csv/csv/summary_statistics.csv"
chatbot.load_summary_statistics(summary_statistics_file_path)

chatbot.setup_chroma_db()  # vector store

chatbot.setup_qa_chain()  # RAG chain

# Interactive loop: every line goes to the chatbot until the user types "exit"
# (case-insensitive).
while True:
    user_input = input("You: ")
    if user_input.lower() != "exit":
        answer = chatbot.apply_rag(user_input)
        continue
    print("Chat ended.")
    break

In [None]:
from Llama import LlamaInference

In [None]:
# Load the Combined_Diabeties summary-statistics CSV into `data`.
from langchain_community.document_loaders.csv_loader import CSVLoader
loader = CSVLoader(file_path='Knowledge/categorical/xls/Combined_Diabeties.csv/csv/summary_statistics.csv')
data = loader.load()

In [None]:
import os

# SECURITY: the Hugging Face token is read from the HF_TOKEN environment variable
# instead of being hard-coded (raises KeyError when the variable is not set).
# The token that used to be embedded here must be treated as leaked and revoked.
llama_inference = LlamaInference(os.environ["HF_TOKEN"], data)

In [None]:
# Setup step 1: language model (implemented in the external Llama module).
llama_inference.setup_language_model()

In [None]:
# Setup step 2: Chroma DB (implemented in the external Llama module).
llama_inference.setup_chroma_db()

In [None]:
# Setup step 3: QA chain (implemented in the external Llama module).
llama_inference.setup_qa_chain()

### chatbot_Test file

In [None]:
# Instantiate the Chatbot class with the path to your CSV file
# (this Chatbot comes from the external chatbot_Test module and takes the path
# as a constructor argument).
from chatbot_Test import Chatbot
csv_file_path = "Knowledge/categorical/xls/Combined_Combined_combined_data (1).csv/csv/summary_statistics.csv"
chatbot = Chatbot(csv_file_path)

In [None]:
# Ask one question through the chatbot's RAG chain.
user_input =  "tell me the bg value mean"
answer=chatbot.apply_rag(user_input)

## chatbot_Test file_new

In [None]:
# Instantiate the Chatbot class with the path to your CSV file
# NOTE(review): this section is headed "chatbot_Test file_new" but still imports
# from chatbot_Test -- confirm the intended module.
from chatbot_Test import Chatbot
csv_file_path = "Knowledge/categorical/xls/Combined_Combined_combined_data (1).csv/csv/summary_statistics.csv"
chatbot = Chatbot(csv_file_path)

In [None]:
from chatbot_Test_new import Chatbot

# Example usage:
if __name__ == "__main__":
    chatbot = Chatbot("Knowledge/categorical/xls/Combined_Combined_combined_data (1).csv/csv/summary_statistics.csv")

    # First request: summary generation.
    query = "What are the introduction of it"
    correlation_prompt = chatbot.generate_summary(query)

    # Second request: `query` is rebound before being passed to generate_questions.
    query = "What are the max value of the Bg"
    result = chatbot.generate_questions(query)


# chatbot_Test file_new1

In [1]:
# Chatbot from chatbot_Test_new1 is constructed without arguments; the calls in
# the following cells pass the CSV path per request instead.
from chatbot_Test_new1 import Chatbot
chatbot = Chatbot()

# Earlier example calls, currently disabled:
# query = "What are the introduction of it"
# correlation_prompt = chatbot.generate_summary(query)

# query = "What are the max value of the Bg"
# result = chatbot.generate_questions("Knowledge/categorical/xls/Combined_Combined_combined_data (1).csv/csv/summary_statistics.csv",query)


2024-04-19 17:48:40.109573: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-19 17:48:40.153825: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Request an introduction/summary for the summary-statistics CSV
# (arguments: file path, then the query).
query = "Write me the introduction of the file"
result = chatbot.generate_summary("Knowledge/categorical/xls/Combined_Combined_combined_data (1).csv/csv/summary_statistics.csv",query)

load INSTRUCTOR_Transformer


  return self.fget.__get__(instance, owner)()


max_seq_length  512
 Sure! Based on the provided data, here are some key statistics and insights that can be derived:

Introduction:
This dataset contains information on various physiological variables related to diabetes management, including blood glucose (BG) levels, carbohydrate grams (CGM), insulin dosage, and risk factors for complications. The dataset consists of 7 individuals, with varying ages and gender distributions.

Minimum and Maximum Values:
The minimum value observed in the dataset is BG = 6.601302710067708 for individual ID = 1, while the maximum value is BG = 115.95510802801054 for individual ID = 6. Similarly, the minimum CGM value is 39.0 for individual ID = 1, while the maximum value is 297.5225733305358 for individual ID = 6.

Mean Values:
The mean BG level across all individuals is BG = 111.95510802801054, while the mean CGM level is 115.45124494715583. The mean insulin dose per day is 0.015444977772516803, and the mean LBGI value is 2.975225733305358. Finally, t

In [2]:
# Request the most important features from the correlated-features CSV
# (arguments: file path, then the query).
# NOTE(review): this cell carries the same execution count (In [2]) as the
# previous cell, so the recorded outputs come from different runs.
query = "Write me the top most important features of this file"
result = chatbot.generate_corr_imp_features("Knowledge/time series/xls/Combined_Diabeties.csv/csv/all_correlated_features_with_Risk.csv",query)

load INSTRUCTOR_Transformer


  return self.fget.__get__(instance, owner)()


max_seq_length  512
 As an expert data scientist, I can tell you that the topmost important features of the given file are:

1. Risk: 1.0 - This feature has a value of 1.0, indicating that the file belongs to the "Risk" category.
2. LBGI - This feature has a value of 0.9575936074358368, indicating that the file is related to the "LBGI" topic.

These two features are the most important ones in the file, as they provide valuable information about the categories and topics associated with the file.
