In [None]:
import os
import openai
import pandas as pd
from openai import OpenAI

In [None]:



# Set your OpenAI API key in the OPENAI_API_KEY environment variable.
# Never paste a literal key into the notebook: anyone who can read the file
# (or its version history) can use it. The key that used to be hardcoded here
# must be treated as leaked and revoked.

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),  # this is also the default, it can be omitted
)

# %%
def generate_embedding(text):
    """Return the OpenAI embedding vector for ``text``.

    The text is sent as a one-element list to the ``text-embedding-ada-002``
    model through the module-level ``client``; the embedding of that single
    element is returned.
    """
    batch = [text]  # the request is made with a list input
    result = client.embeddings.create(model="text-embedding-ada-002", input=batch)
    # Only one input was sent, so the first data entry is the one we want.
    first_entry = result.data[0]
    return first_entry.embedding

def load_text_files_and_embed(directory, embed_fn=None):
    """Load the ``.txt`` files of a directory and compute one embedding per file.

    :param directory: Folder to scan (not recursively) for files ending in ``.txt``.
    :param embed_fn: Optional callable taking the file text and returning its
        embedding. Defaults to ``generate_embedding``.
    :return: A list of dicts, one per embedded file, with the keys ``id``
        (the file name), ``values`` (the embedding) and ``metadata``
        (``{'text': <file text with newlines replaced by spaces>}``).
        Files are processed in sorted name order.
    """
    if embed_fn is None:
        embed_fn = generate_embedding

    embeddings = []
    # os.listdir returns entries in arbitrary order; sorting keeps the output
    # (and anything built from it) reproducible between runs.
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            # A sub-directory that merely happens to be named "*.txt".
            continue
        with open(filepath, 'r', encoding='utf-8') as file:
            text = file.read().replace("\n", " ")
        if not text.strip():
            # There is nothing to embed, and the embeddings endpoint rejects
            # empty input, so skip the file instead of aborting the whole run.
            print(f"Skipping empty file: {filepath}")
            continue
        embedding = embed_fn(text)
        embeddings.append({'id': filename, 'values': embedding, 'metadata': {'text': text}})
    return embeddings

# %%
# Directory holding the .txt documents to index. Set the RAG_DATA_DIR
# environment variable to point somewhere else; the fallback is the original
# machine-specific location.
folder_path = os.environ.get('RAG_DATA_DIR') or 'D:/hackbangalore/rag_data'
data = load_text_files_and_embed(folder_path)

# Convert list to DataFrame
# Every record gets a 'sparse_values' entry set to None, so the resulting
# DataFrame has a sparse_values column alongside id / values / metadata.
for record in data:
    record['sparse_values'] = None
df = pd.DataFrame(data)
print(df.head())

# %%
import os
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
# The key is read from the PINECONE_API_KEY environment variable so that no
# secret lives in the notebook. The key that used to be hardcoded here must be
# treated as leaked and rotated.

# configure client
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

# %%
from pinecone import ServerlessSpec

# Deployment target for the serverless index. Each value comes from the
# environment when set to a non-empty string, otherwise from the literal
# fallback (`or` also covers a variable that is set but empty).
cloud = os.getenv('PINECONE_CLOUD') or 'aws'
region = os.getenv('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(region=region, cloud=cloud)

# %%
# Name of the Pinecone index that the following cells create (if it does not
# exist yet) and connect to.
index_name = 'hackbangalore-rag-trial-01'

# %%
import time

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of text-embedding-ada-002
        metric='cosine',
        spec=spec
    )
    # Index creation is asynchronous: a fixed one-second sleep does not
    # guarantee the index is usable. Poll its status until it reports ready,
    # giving up after a generous deadline instead of spinning forever.
    ready_deadline = time.monotonic() + 300
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() > ready_deadline:
            raise TimeoutError(f"Pinecone index '{index_name}' was not ready after 300 seconds")
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

# %%
# convert df to a python list, example:
# sample_doc =[
#               {
#                 "id": "item_0",
#                 "values": [
#                     0.07446312652229216,
#                     0.8866284618893006,
#                     0.5244262265711986
#                 ],
#                 "metadata": {
#                     "category": "sports",
#                     "colors": [
#                         "blue",
#                         "red",
#                         "green"
#                     ],
#                     "time_stamp": 0  }
#                }   ]

sample_doc = df.to_dict(orient='records')

# Printing the records themselves would dump every full embedding vector and
# document text into the cell output; a count and a few ids are enough to
# confirm what is about to be written.
print(f"Upserting {len(sample_doc)} records, first ids: {[doc['id'] for doc in sample_doc[:5]]}")

# Send the records in fixed-size batches so a large corpus does not end up in
# one oversized upsert request.
UPSERT_BATCH_SIZE = 100
for batch_start in range(0, len(sample_doc), UPSERT_BATCH_SIZE):
    index.upsert(sample_doc[batch_start:batch_start + UPSERT_BATCH_SIZE])

# %%
import openai

# get api key from platform.openai.com and export it as OPENAI_API_KEY;
# it is read from the environment so no secret is stored in the notebook
openai.api_key = os.environ.get('OPENAI_API_KEY')

embed_model = "text-embedding-ada-002"

# %%
query = "In the sleepy town of Glimmerdale, Benjamin, an amateur clockmaker"

# Embed the query with the same helper that embedded the documents, so query
# and document vectors are guaranteed to come from the same model and request
# shape (and `res` is no longer used for two different kinds of object).
xq = generate_embedding(query)

# retrieve from Pinecone
# get relevant contexts (including the questions)
res = index.query(vector=xq, top_k=5, include_metadata=True)

# %%
# Display the raw query result currently held in `res`.
res

# %%
# Collect the stored text of every match returned by the vector search.
contexts = []
for hit in res['matches']:
    contexts.append(hit['metadata']['text'])

# Retrieved passages are separated by a short rule; a longer rule then sets
# the user's query apart from the context above it.
context_section = "\n\n---\n\n".join(contexts)
augmented_query = context_section + "\n\n-----\n\n" + query

# %%
# Show the final prompt text: the joined contexts followed by the query.
print(augmented_query)

# %%
# System prompt: restricts the model to the context supplied in the user
# message and tells it to say "I don't know" otherwise.
primer = """You are Q&A bot. A highly intelligent system that answers
user questions based on the information provided by the user above
each question. If the information can not be found in the information
provided by the user you truthfully say "I don't know".
"""

# Conversation sent to the chat model: the primer first, then the
# context-augmented question.
chat_messages = [
    {"role": "system", "content": primer},
    {"role": "user", "content": augmented_query},
]

res = client.chat.completions.create(model="gpt-4", messages=chat_messages)

# %%
from IPython.display import Markdown

# Render the first choice's message content as Markdown.
answer_markdown = res.choices[0].message.content
display(Markdown(answer_markdown))

# %%

-----------


In [None]:
import parso

In [None]:
from flask import Flask, request, jsonify
import os
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

app = Flask(__name__)

# Credentials are taken from the environment instead of being written into
# the source. Fail fast with a clear message when one is missing rather than
# at the first request. The keys that used to be hardcoded here must be
# treated as leaked and rotated.
_REQUIRED_ENV_VARS = ('OPENAI_API_KEY', 'PINECONE_API_KEY')
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.environ.get(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env_vars)
    )

# Initialize OpenAI LLM
llm = OpenAI(openai_api_key=os.environ['OPENAI_API_KEY'])

# Pinecone index name
index_name = "hackbangalore-rag-trial-01"

# Initialize Pinecone vector store
# NOTE(review): no Pinecone key is passed explicitly, so the store presumably
# picks up PINECONE_API_KEY from the environment — confirm against the
# langchain_pinecone version in use.
vectorstore = PineconeVectorStore(index_name=index_name,
                                  embedding=OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY']))


def _parse_keyword_response(response):
    """Turn a raw LLM reply into a list of keywords, or None if it cannot be parsed."""
    # Some clients wrap the completion in a dict; unwrap it before any string
    # handling so this case is actually reachable.
    if isinstance(response, dict):
        response = response.get('text')
    if not isinstance(response, str):
        return None

    # Drop line breaks, then surrounding whitespace, so a reply that starts
    # with blank lines or spaces is still recognised.
    cleaned = response.replace('\n', '').replace('\r', '').strip()
    prefix = 'Keywords:'
    if not cleaned.lower().startswith(prefix.lower()):
        return None

    listing = cleaned[len(prefix):]
    # Discard empty entries produced by stray or trailing commas.
    return [keyword.strip() for keyword in listing.split(',') if keyword.strip()]


def extract_keywords_using_chatgpt(text, llm_client=None):
    """
    Sends a request to ChatGPT to extract keywords from the provided text.

    :param text: The text from which keywords need to be extracted.
    :param llm_client: Optional object exposing ``invoke(prompt)``; defaults to
        the module-level ``llm``.
    :return: A list of keyword strings. An empty list is returned when the
        reply has no ``Keywords:`` prefix, has an unexpected type, or when
        the request raises.
    """
    try:
        model = llm if llm_client is None else llm_client
        # Invoke the LLM to get the response, which is expected to be directly the keyword string
        response = model.invoke(
            "Extract the main keywords from the following text and provide them as a comma-separated list. do not give more than 10 keywords Please prefix the keywords with 'Keywords:'. the text to extract keyword is, " + text
        )
        print("Response from ChatGPT:", response)  # Debug: Print the full response to verify its structure

        keywords = _parse_keyword_response(response)
        if keywords is None:
            print("Unexpected response structure or no keywords prefix found")
            return []
        return keywords
    except Exception as e:
        # Best effort by design: any failure degrades to "no keywords".
        print(f"Error processing response: {e}")
        return []


@app.route('/search_projects', methods=['POST'])
def search_projects():
    """Return up to three stored documents similar to a project description.

    Expects a JSON object body with a non-empty string field ``description``.

    Responses:
        200 -- JSON list of ``{"similarity_match": <document text>}`` entries.
        400 -- the body is not a JSON object or ``description`` is missing/empty.
        404 -- the search returned no documents.
        500 -- keyword extraction or the vector search failed.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so a client mistake is reported as 400 rather than as 500.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    description = data.get('description', '')
    if not isinstance(description, str) or not description.strip():
        return jsonify({'error': "'description' must be a non-empty string"}), 400

    try:
        # Extract keywords relevant to the description
        keywords = extract_keywords_using_chatgpt(description)

        # Perform a similarity search using the vector store with extracted keywords.
        # When no keywords could be extracted, search on the description itself
        # rather than on an empty query string.
        search_query = ' '.join(keywords) if keywords else description
        print(f"Search query: {search_query}")
        search_result = vectorstore.similarity_search(search_query)

        # Return the top 3 results (slicing already copes with shorter lists)
        top_results = search_result[:3]

        if top_results:
            response_data = [{"similarity_match": result.page_content} for result in top_results]
            return jsonify(response_data), 200
        else:
            return jsonify({'message': 'No results found'}), 404
    except Exception:
        # Keep the details in the server log; upstream error text can contain
        # internal information that should not be echoed to the client.
        app.logger.exception("search_projects failed")
        return jsonify({'error': 'Internal server error'}), 500


if __name__ == '__main__':
    # Debug mode turns on the interactive Werkzeug debugger, which lets anyone
    # who can reach the server execute code on it. Keep it off unless it is
    # explicitly requested for local development with FLASK_DEBUG=1.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')

