# Installing Packages

In [None]:
!pip install langchain
!pip install unstructured
!pip install "unstructured[csv]"
!pip install pandas
!pip install cohere
!pip install langchain-cohere
!pip install weaviate-client==4.*
!pip install faiss-cpu
!pip install langchain cohere

Collecting langchain
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.10 (from langchain)
  Downloading langchain_core-0.3.10-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.135-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.10->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp310

# Set API Keys

In [None]:
import os

# Read the connection settings from environment variables so real secrets never have to be
# written into (and committed with) the notebook. The original placeholder strings are kept
# as fallbacks for when a variable is not set.
WEAVIATE_CLUSTER = os.environ.get("WEAVIATE_CLUSTER", "weaviate_url")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY", "weaviate_api")
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "cohere_api")

# Install the latest version of protobuf and the Unstructured package so the data is read correctly

In [None]:
# Uninstall the existing protobuf package
!pip uninstall google.protobuf -y

# Install the latest version of protobuf
!pip install google.protobuf

# If you want to reinstall the other libraries as well
!pip install weaviate-client langchain unstructured "unstructured[csv]" pandas cohere langchain-cohere faiss-cpu

[0m[31mERROR: Could not find a version that satisfies the requirement google.protobuf (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for google.protobuf[0m[31m
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-health-checking<2.0.0,>=1.57.0->weaviate-client)
  Using cached protobuf-5.28.2-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
INFO: pip is looking at multiple versions of sagemaker to determine which version is compatible with other requirements. This could take a while.
Collecting sagemaker<3.0.0,>=2.232.1 (from cohere)
  Downloading sagemaker-2.232.1-py3-none-any.whl.metadata (16 kB)
Collecting grpcio-tools<2.0.0,>=1.57.0 (from weaviate-client)
  Using cached grpcio_tools-1.67.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
  Downloading grpcio_tools-1.66.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
  Downloading grpcio_tools-1.66.1-cp310-cp310-manylinux_2_17_x86_64.many

**Import necessary libraries**

In [None]:
import weaviate
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.vectorstores import Weaviate
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import Cohere
from langchain.schema import Document
from langchain_cohere.embeddings import CohereEmbeddings
from langchain.chains import RetrievalQA
import cohere
from langchain.vectorstores import VectorStore
from langchain.llms import Cohere as LangchainCohere

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


# Load stock dataset from CSV

In [None]:
# Read the stock CSV (replace the path with your own CSV file).
csv_data = pd.read_csv('stock_data.csv')

# Turn every CSV row into one LangChain Document. Each selected column is rendered as
# "<Column>: <value>" and the pieces are joined with ", "; no metadata is attached.
documents = [
    Document(
        page_content=", ".join(
            f"{column}: {record[column]}"
            for column in ('Date', 'Open', 'High', 'Low', 'Close', 'Volume')
        ),
        metadata={},
    )
    for _label, record in csv_data.iterrows()
]

# Text splitting

In [None]:
# Splitter configured with chunk_size=1000 and chunk_overlap=20.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=20,
    chunk_size=1000,
)

# Apply the splitter to the documents built from the CSV rows.
docs = text_splitter.split_documents(documents)

# Initialize Cohere embeddings

In [None]:
# Cohere embedding client: authenticated with COHERE_API_KEY and requesting the "large" model.
embeddings = CohereEmbeddings(
    model="large",
    user_agent="QA/1.0",
    cohere_api_key=COHERE_API_KEY,
)

# Connect to Weaviate Cluster

In [None]:
# API-key credentials for the Weaviate cluster.
auth_config = weaviate.auth.AuthApiKey(api_key=WEAVIATE_API_KEY)

# Connect with the v3-style weaviate.Client entry point (the library reports it as deprecated).
client = weaviate.Client(
    startup_period=10,
    auth_client_secret=auth_config,
    url=WEAVIATE_CLUSTER,
)

Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  client = weaviate.Client(


# Clear existing schema (if necessary)

In [None]:
# Remove every class currently defined in the cluster's schema so the schema below starts clean.
# NOTE(review): presumably this also discards the objects stored under those classes — confirm
# before running against a cluster that holds data you want to keep.
client.schema.delete_all()

# Define schema for storing documents

In [None]:
# Schema definition with a single class, "StockData", that has one text property named "content".
# NOTE(review): the class is configured for the "text2vec-openai" vectorizer, while the rest of
# this notebook only supplies Cohere credentials — confirm which vectorizer module is intended.
schema = {
    "classes": [
        {
            "class": "StockData",
            "description": "Stock market data for QA",
            # Class-level vectorizer module and its settings.
            "vectorizer": "text2vec-openai",
            "moduleConfig": {"text2vec-openai": {"model": "ada", "type": "text"}},
            "properties": [
                {
                    "dataType": ["text"],
                    "description": "The content of the stock data entry",
                    # Property-level vectorizer settings: the property is not skipped, and its
                    # name is not vectorized.
                    "moduleConfig": {
                        "text2vec-openai": {
                            "skip": False,
                            "vectorizePropertyName": False,
                        }
                    },
                    "name": "content",
                },
            ],
        },
    ]
}


# Create the schema

In [None]:
# Register the schema dictionary defined above (the "StockData" class) with the cluster.
client.schema.create(schema)

# Create vector store using Cohere embeddings

In [None]:
# LangChain wrapper around the "StockData" class; document text lives in the "content" property
# and no additional attributes are requested.
# NOTE(review): no embedding object is passed here, so the Cohere `embeddings` created earlier
# are not used by this store — confirm that is intended.
vectorstore = Weaviate(
    client,
    index_name="StockData",
    text_key="content",
    attributes=[],
)

# Initialize Cohere embeddings and define the QA chain

In [None]:
from langchain.embeddings.cohere import CohereEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import Cohere

# COHERE_API_KEY is defined once in the "Set API Keys" cell and is deliberately NOT reassigned
# here: the original cell overwrote it with a different placeholder string, which silently
# replaced whatever key had been configured.
embeddings = CohereEmbeddings(cohere_api_key=COHERE_API_KEY, user_agent="QA/1.0")

# Define the QA chain using Cohere: a "stuff" chain driven by a temperature-0 Cohere LLM.
chain = load_qa_chain(
    Cohere(cohere_api_key=COHERE_API_KEY, temperature=0),
    chain_type="stuff"
)


  embeddings = CohereEmbeddings(cohere_api_key=COHERE_API_KEY, user_agent="QA/1.0")
  Cohere(cohere_api_key=COHERE_API_KEY, temperature=0),
stuff: https://python.langchain.com/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/docs/how_to/#qa-with-rag
  chain = load_qa_chain(


# Correct import for Cohere embeddings

In [None]:
from langchain_cohere.embeddings import CohereEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Weaviate
import weaviate

In [None]:
import os

import weaviate

# Connection settings come from environment variables so that real secrets are not hard-coded
# in the notebook. The original placeholder strings remain as fallbacks when a variable is unset.
WEAVIATE_CLUSTER = os.environ.get("WEAVIATE_CLUSTER", "weaviate_url")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY", 'weaviate_api')
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", 'cohere_api')

# Initialize Weaviate client with Cohere API key for text vectorization
# (the key is sent in the X-Cohere-Api-Key request header).
client = weaviate.Client(
    url=WEAVIATE_CLUSTER,
    auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY),
    additional_headers={"X-Cohere-Api-Key": COHERE_API_KEY},
)

# Check the connection (last expression, so the result is displayed as the cell output)
client.is_ready()


Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  client = weaviate.Client(


True

# Initialize Weaviate client with Cohere API key for text vectorization

In [None]:
# Re-create the Weaviate client; the Cohere API key is passed along in the
# X-Cohere-Api-Key request header for text vectorization.
client = weaviate.Client(
    additional_headers={"X-Cohere-Api-Key": COHERE_API_KEY},
    auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY),
    url=WEAVIATE_CLUSTER,
)


Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  client = weaviate.Client(


# Uploading the data in batches

In [None]:
import pandas as pd
import weaviate

# Create a Weaviate client. NOTE: weaviate.Client is the deprecated v3-style API, not the v4 client.
client = weaviate.Client(url=WEAVIATE_CLUSTER, auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY))

# Read the CSV file
df = pd.read_csv('stock_data.csv')

# Clean the DataFrame: drop rows with NaN or infinite values (infinities are first turned into
# missing values so that dropna() removes them as well).
df = df.replace([float('inf'), float('-inf')], pd.NA).dropna()

# Prepare your data for upload
batch_size = 100
uploaded = 0  # rows queued successfully; progress is based on this, not on index labels

# Pass batch_size to the batcher so objects really are sent in groups of `batch_size`;
# anything still queued is flushed when the `with` block exits.
with client.batch(batch_size=batch_size) as batch:
    for i, row in df.iterrows():
        try:
            # Create the data object with required fields
            data_object = {
                "date": row['Date'],
                "closing_price": row['Close'],
                "symbol": "AAPL",
            }
            batch.add_data_object(data_object, "YourClassName")
        except Exception as e:
            print(f"Error uploading batch: {e}")
            continue

        uploaded += 1
        # The DataFrame index has gaps after dropna(), so the batch number is derived from the
        # count of queued rows instead of the index label `i`.
        if uploaded % batch_size == 0:
            print(f"Uploaded batch: {uploaded // batch_size}")

# Report the trailing partial batch, if any rows were left over after the last full batch
if uploaded % batch_size != 0:
    print(f"Uploaded final batch: {(uploaded // batch_size) + 1}")


Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  client = weaviate.Client(url=WEAVIATE_CLUSTER, auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY))


Uploaded batch: 1
Uploaded batch: 2
Uploaded batch: 3
Uploaded batch: 4
Uploaded batch: 5
Uploaded batch: 6
Uploaded batch: 7
Uploaded batch: 8
Uploaded batch: 9
Uploaded batch: 10
Uploaded batch: 11
Uploaded batch: 12
Uploaded final batch: 13


# Check the first few rows & Info

In [None]:
# Preview the first rows of the cleaned frame.
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so wrapping it in
# print() would append a stray "None" line after the summary.
df.info()

                        Date    Adj Close  Adj Close.1  Adj Close.2  \
1  2019-01-02 00:00:00+00:00  37.75008392  76.95649719  52.16439438   
2  2019-01-03 00:00:00+00:00  33.98989868  75.01399994  50.67854309   
3  2019-01-04 00:00:00+00:00  35.44090271  78.76950073  53.40434647   
4  2019-01-07 00:00:00+00:00  35.36201096  81.47550201  53.28863144   
5  2019-01-08 00:00:00+00:00  36.03612518  82.82900238  53.68216324   

   Adj Close.3  Adj Close.4        Close      Close.1      Close.2  \
1  95.50130463  20.67466736  39.47999954  76.95649719  52.29249954   
2  91.98801422  20.02400017  35.54750061  75.01399994   50.8030014   
3  96.26630402  21.17933273  37.06499863  78.76950073  53.53549957   
4  96.38910675   22.3306675  36.98249817  81.47550201  53.41949844   
5  97.08796692  22.35666656      37.6875  82.82900238  53.81399918   

       Close.3  ...         Open       Open.1       Open.2       Open.3  \
1  101.1200027  ...  38.72249985  73.26000214  50.82849884  99.55000305   
2 

# Create a Weaviate client (this still uses the deprecated v3-style `weaviate.Client` API, not the v4 client)

In [None]:
# Re-create the Weaviate client with API-key authentication.
# NOTE: weaviate.Client is the v3-style entry point that the library reports as deprecated.
client = weaviate.Client(url=WEAVIATE_CLUSTER, auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY))

Python client v3 `weaviate.Client(...)` connections and methods are deprecated and will
            be removed by 2024-11-30.

            Upgrade your code to use Python client v4 `weaviate.WeaviateClient` connections and methods.
                - For Python Client v4 usage, see: https://weaviate.io/developers/weaviate/client-libraries/python
                - For code migration, see: https://weaviate.io/developers/weaviate/client-libraries/python/v3_v4_migration

            If you have to use v3 code, install the v3 client and pin the v3 dependency in your requirements file: `weaviate-client>=3.26.7;<4.0.0`
  client = weaviate.Client(url=WEAVIATE_CLUSTER, auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY))


# Load the CSV file & the columns to check their names

In [None]:
# Reload the raw CSV and list the column labels pandas assigned to it.
df = pd.read_csv('stock_data.csv')
print(list(df.columns))

['Date', 'Adj Close', 'Adj Close.1', 'Adj Close.2', 'Adj Close.3', 'Adj Close.4', 'Close', 'Close.1', 'Close.2', 'Close.3', 'Close.4', 'High', 'High.1', 'High.2', 'High.3', 'High.4', 'Low', 'Low.1', 'Low.2', 'Low.3', 'Low.4', 'Open', 'Open.1', 'Open.2', 'Open.3', 'Open.4', 'Volume', 'Volume.1', 'Volume.2', 'Volume.3', 'Volume.4']


# Set the first row as the header and remove it from the DataFrame

In [None]:
def _merge_header(field_label, company):
    """Combine a column label with the company cell beneath it into one lowercase name.

    Args:
        field_label: Column label assigned by pandas, e.g. "Close" or "Adj Close.2".
        company: Value of the first data row for that column (a company name), or NaN.

    Returns:
        "<company>_<field>" in lowercase with underscores (e.g. "amazon_adj_close"), or just
        the field name when the company cell is missing (e.g. "date").
    """
    label = str(field_label)
    # pandas de-duplicates repeated header names by appending ".1", ".2", ...; drop that suffix.
    base, dot, suffix = label.rpartition('.')
    if dot and suffix.isdigit():
        label = base
    field = label.strip().lower().replace(' ', '_')
    if pd.isna(company):
        return field
    return f"{str(company).strip().lower()}_{field}"


# The CSV carries two header rows: pandas used the first one (the price field) as column labels,
# and the first data row holds the company name for each column. Using only the company row as
# the header would produce duplicate labels (one per price field) and a NaN label for the date
# column, so both rows are merged into unique names such as "apple_close".
df.columns = [_merge_header(label, company) for label, company in zip(df.columns, df.iloc[0])]

# Drop the company row now that it is part of the header, and renumber the rows from 0.
df = df.iloc[1:].reset_index(drop=True)

# Check the columns again
print(df.columns.tolist())


[nan, 'Apple', 'Amazon', 'Googlele', 'Microsoft', 'Tesla', 'Apple', 'Amazon', 'Googlele', 'Microsoft', 'Tesla', 'Apple', 'Amazon', 'Googlele', 'Microsoft', 'Tesla', 'Apple', 'Amazon', 'Google', 'Microsoft', 'Tesla', 'Apple', 'Amazon', 'Google', 'Microsoft', 'Tesla', 'Apple', 'Amazon', 'Google', 'Microsoft', 'Tesla']


# Rename columns if necessary (check for any leading/trailing spaces)

In [None]:
# Trim surrounding whitespace from every column label.
df.columns = df.columns.str.strip()

# Map raw CSV labels to short lowercase names; labels that are not present are left untouched.
df = df.rename(
    columns={
        'Date': 'date',
        'Adj Close': 'apple',
        'Adj Close.1': 'amazon',
        'Adj Close.2': 'google',
        'Adj Close.3': 'microsoft',
        'Adj Close.4': 'tesla',
    }
)

print(df.head())


0                        NaN        Apple       Amazon     Googlele  \
0  2019-01-02 00:00:00+00:00  37.75008392  76.95649719  52.16439438   
1  2019-01-03 00:00:00+00:00  33.98989868  75.01399994  50.67854309   
2  2019-01-04 00:00:00+00:00  35.44090271  78.76950073  53.40434647   
3  2019-01-07 00:00:00+00:00  35.36201096  81.47550201  53.28863144   
4  2019-01-08 00:00:00+00:00  36.03612518  82.82900238  53.68216324   

0    Microsoft        Tesla        Apple       Amazon     Googlele  \
0  95.50130463  20.67466736  39.47999954  76.95649719  52.29249954   
1  91.98801422  20.02400017  35.54750061  75.01399994   50.8030014   
2  96.26630402  21.17933273  37.06499863  78.76950073  53.53549957   
3  96.38910675   22.3306675  36.98249817  81.47550201  53.41949844   
4  97.08796692  22.35666656      37.6875  82.82900238  53.81399918   

0    Microsoft  ...        Apple       Amazon       Google    Microsoft  \
0  101.1200027  ...  38.72249985  73.26000214  50.82849884  99.55000305   
1 

# Normalize column names by converting them to lowercase

In [None]:
# Lowercase every column label first, then trim surrounding whitespace.
lowered_labels = df.columns.str.lower()
df.columns = lowered_labels.str.strip()

# Show the resulting labels.
print("Updated Columns:", df.columns.tolist())


Updated Columns: [nan, 'apple', 'amazon', 'googlele', 'microsoft', 'tesla', 'apple', 'amazon', 'googlele', 'microsoft', 'tesla', 'apple', 'amazon', 'googlele', 'microsoft', 'tesla', 'apple', 'amazon', 'google', 'microsoft', 'tesla', 'apple', 'amazon', 'google', 'microsoft', 'tesla', 'apple', 'amazon', 'google', 'microsoft', 'tesla']


# Rename specific columns for clarity

In [None]:
# Old label -> new label. Labels that do not occur in the frame are simply ignored by rename().
readable_names = {
    'date': 'date',
    'adj close': 'apple',
    'adj close.1': 'amazon',
    'adj close.2': 'google',
    'adj close.3': 'microsoft',
    'adj close.4': 'tesla',
    'close': 'apple_close',
    'close.1': 'amazon_close',
    'close.2': 'google_close',
    'close.3': 'microsoft_close',
    'close.4': 'tesla_close',
}
df = df.rename(columns=readable_names)

# Print the labels after renaming so the result can be verified.
print("Renamed Columns:", df.columns.tolist())


Renamed Columns: [nan, 'apple', 'amazon', 'googlele', 'microsoft', 'tesla', 'apple', 'amazon', 'googlele', 'microsoft', 'tesla', 'apple', 'amazon', 'googlele', 'microsoft', 'tesla', 'apple', 'amazon', 'google', 'microsoft', 'tesla', 'apple', 'amazon', 'google', 'microsoft', 'tesla', 'apple', 'amazon', 'google', 'microsoft', 'tesla']


# Check the updated DataFrame

In [None]:
# Print the first rows of the frame to eyeball the result of the renaming steps.
print(df.head())

# Check the data types: print the dtype pandas holds for each column.
print("Data types:\n", df.dtypes)

0                        NaN        apple       amazon     googlele  \
0  2019-01-02 00:00:00+00:00  37.75008392  76.95649719  52.16439438   
1  2019-01-03 00:00:00+00:00  33.98989868  75.01399994  50.67854309   
2  2019-01-04 00:00:00+00:00  35.44090271  78.76950073  53.40434647   
3  2019-01-07 00:00:00+00:00  35.36201096  81.47550201  53.28863144   
4  2019-01-08 00:00:00+00:00  36.03612518  82.82900238  53.68216324   

0    microsoft        tesla        apple       amazon     googlele  \
0  95.50130463  20.67466736  39.47999954  76.95649719  52.29249954   
1  91.98801422  20.02400017  35.54750061  75.01399994   50.8030014   
2  96.26630402  21.17933273  37.06499863  78.76950073  53.53549957   
3  96.38910675   22.3306675  36.98249817  81.47550201  53.41949844   
4  97.08796692  22.35666656      37.6875  82.82900238  53.81399918   

0    microsoft  ...        apple       amazon       google    microsoft  \
0  101.1200027  ...  38.72249985  73.26000214  50.82849884  99.55000305   
1 

# Load your stock data into a DataFrame

In [None]:
import pandas as pd
import cohere

# Initialize the Cohere API client with the key configured earlier in the notebook instead of a
# second hard-coded copy, so the key only has to be set in one place.
cohere_client = cohere.Client(COHERE_API_KEY)

stock_data = pd.read_csv('stock_data.csv')
# Parse the 'Date' column into datetimes once, up front.
stock_data['Date'] = pd.to_datetime(stock_data['Date'])
# Keep a plain calendar-date version of each timestamp so rows can be matched against a date.
stock_data['Date_date'] = stock_data['Date'].dt.date

# Retrieving an answer to a general question from Cohere

In [None]:
def retrieve_stock_data(query):
    """Ask the Cohere generate endpoint a free-form question and return its reply.

    Args:
        query: The user's question; it is interpolated into a fixed prompt.

    Returns:
        The text of the first generation, with surrounding whitespace removed.

    Note:
        Only the question itself is sent; no rows from the dataset are added to the prompt.
    """
    prompt_text = f"Given the stock data, answer the following question: {query}"
    completion = cohere_client.generate(
        model='command-xlarge-nightly',
        prompt=prompt_text,
        max_tokens=100,
        temperature=0.5,
    )
    first_generation = completion.generations[0]
    return first_generation.text.strip()

# Getting stock info for a given date

In [None]:
def get_stock_info(date, data=None):
    """Look up the close and open prices of every tracked company for one day.

    The original version ignored ``date`` and read a global ``row`` that this function never
    defined, so it reported ``None`` for every price. This version selects the matching row
    itself.

    Args:
        date: The ``datetime.date`` to look up; compared against the ``Date_date`` column.
        data: Optional DataFrame to search. Defaults to the notebook-level ``stock_data``.
            It must contain a ``Date_date`` column.

    Returns:
        A dict mapping labels such as "Apple Close" to the value found in the first row whose
        ``Date_date`` equals ``date`` (``None`` for a column that is not present), or ``None``
        when no row matches.
    """
    if data is None:
        data = stock_data

    matches = data[data['Date_date'] == date]
    if matches.empty:
        return None

    # Descriptive name -> label pandas assigns when reading the raw CSV, where the five companies
    # repeat under every price field in the order Apple, Amazon, Google, Microsoft, Tesla
    # ("Close", "Close.1", ..., "Close.4").
    raw_labels = {
        'apple_close': 'Close',
        'amazon_close': 'Close.1',
        'google_close': 'Close.2',
        'microsoft_close': 'Close.3',
        'tesla_close': 'Close.4',
        'apple_open': 'Open',
        'amazon_open': 'Open.1',
        'google_open': 'Open.2',
        'microsoft_open': 'Open.3',
        'tesla_open': 'Open.4',
    }

    info = {}
    for column, raw_label in raw_labels.items():
        label = column.replace('_', ' ').title()
        # Prefer an already-renamed column; fall back to the raw CSV label; else report None.
        if column in matches.columns:
            info[label] = matches[column].values[0]
        elif raw_label in matches.columns:
            info[label] = matches[raw_label].values[0]
        else:
            info[label] = None
    return info

# Q & A Section

In [None]:
def ask_question():
    """Run an interactive question loop until the user types 'exit'.

    A question that contains exactly one ISO date (YYYY-MM-DD) is answered from the dataset
    through ``get_stock_info``. Every other question (no date, several dates, or an invalid
    date) is forwarded to ``retrieve_stock_data``.

    The original looked for the substring "on" and split the question on it. That also fired on
    words such as "Amazon" and produced garbage date strings, so the date is now extracted with
    a regular expression instead.
    """
    import re  # local import: only this function needs it

    iso_date = re.compile(r'\b\d{4}-\d{2}-\d{2}\b')

    while True:
        question = input("Ask your question (type 'exit' to stop): ")
        if question.strip().lower() == 'exit':
            break

        date = None
        found_dates = iso_date.findall(question)
        if len(found_dates) == 1:
            try:
                date = pd.to_datetime(found_dates[0], format='%Y-%m-%d').date()
            except ValueError:
                # Looks like a date but is not a valid one (e.g. month 13): treat as general.
                date = None

        if date is None:
            # Treat the question as a general query
            answer = retrieve_stock_data(question)
            print(f"Answer: {answer}")
            continue

        prices = get_stock_info(date)
        # Only report values that were actually found, so the answer never shows "$None".
        available = {k: v for k, v in (prices or {}).items() if v is not None}

        if available:
            response = ", ".join([f"{k}: ${v}" for k, v in available.items()])
            print(f"Answer: Stock prices on {date}: {response}")
        else:
            print(f"Answer: No data found for {date}.")

# Start the interactive question loop; it keeps prompting until the user types 'exit'.
ask_question()

Ask your question (type 'exit' to stop): What was the closing price of Apple on 2023-10-27?
Answer: Stock prices on 2023-10-27: Apple Close: $None, Amazon Close: $None, Google Close: $None, Microsoft Close: $None, Tesla Close: $None, Apple Open: $None, Amazon Open: $None, Google Open: $None, Microsoft Open: $None, Tesla Open: $None
Ask your question (type 'exit' to stop): What was the lowest price of Microsoft on 2023-12-22?
Answer: Stock prices on 2023-12-22: Apple Close: $None, Amazon Close: $None, Google Close: $None, Microsoft Close: $None, Tesla Close: $None, Apple Open: $None, Amazon Open: $None, Google Open: $None, Microsoft Open: $None, Tesla Open: $None
Ask your question (type 'exit' to stop): How did the price of Google change between 2023-10-20 and 2023-11-03?
Answer: Between 2023-10-20 and 2023-11-03, Google's stock price experienced a notable decline. On October 20th, the closing price of Google's parent company, Alphabet Inc. (GOOGL), was around $113.40 per share. However