In [None]:
from src import VectorDB
from weaviate.classes.config import Configure
import json
import os
import time
import weaviate
import weaviate.classes as wvc
from weaviate import WeaviateClient
from weaviate.classes.config import Configure
from dotenv import find_dotenv, load_dotenv
import os
from typing import List, Dict, Set, AnyStr
import pandas as pd
from langchain.vectorstores.weaviate import Weaviate  
from langchain.llms import Cohere  
from langchain.chains import ChatVectorDBChain  
import cohere


In [None]:
# Locate a .env file and load its variables into the process environment
# so the os.getenv calls in the next cell can read them.
load_dotenv(find_dotenv())

In [None]:
# Connection settings and API keys, read from the environment.
# Each value is None when the corresponding variable is not set.
_env = os.environ
wcd_url = _env.get("WCD_URL")
wcd_api_key = _env.get("WCD_API_KEY")
cohere_key = _env.get("COHERE_KEY")

In [None]:
class VectorDB:
    """Convenience wrapper pairing a Weaviate client with a Cohere chat client.

    The Weaviate client is supplied by the caller; the Cohere client is built
    from the module-level ``cohere_key``.
    """

    def __init__(self, client: "WeaviateClient"):
        """Store the given Weaviate client and create a Cohere V2 client."""
        print("Connecting to Weaviate")
        self.client = client
        self.cohere = cohere.ClientV2(cohere_key)

    def create_collection(self, collection_name, **config):
        """Create ``collection_name`` with ``config`` unless it already exists."""
        if not self.client.collections.exists(collection_name):
            print("Creating collection")
            self.client.collections.create(
                collection_name,
                **config,
            )
        else:
            print("Collection already exists")

    def insert_data(self, collection_name, data: Dict[AnyStr, Dict[AnyStr, AnyStr]], key_field="custom_id"):
        """Batch-insert ``data`` into the collection.

        Args:
            collection_name: Target collection.
            data: Mapping of key -> properties dict. Each key is stored on the
                inserted object under ``key_field``.
            key_field: Property name that receives the mapping key.

        Raises:
            Exception: If the batch reports any failed objects.
        """
        collection = self.client.collections.get(collection_name)
        with collection.batch.dynamic() as batch:
            for k, src_obj in data.items():
                # Merge into a new dict so the caller's dict is not mutated.
                weaviate_obj = src_obj | {key_field: k}
                batch.add_object(
                    properties=weaviate_obj,
                )

        if len(collection.batch.failed_objects):
            print(collection.batch.failed_objects)
            raise Exception("Failed to insert data")

    def search_data(self, collection_name, query: str, limit: int = 10):
        """Return the raw ``near_text`` query response for ``query``."""
        collection = self.client.collections.get(collection_name)
        response = collection.query.near_text(query=query, limit=limit)
        return response

    def generate(self, collection_name, query, limit=10, *, single_prompt=None, grouped_task=None):
        """Return the raw ``generate.near_text`` response for ``query``.

        ``single_prompt`` and ``grouped_task`` are forwarded unchanged; the
        grouped task is restricted to the title, description and course_id
        properties.
        """
        collection = self.client.collections.get(collection_name)
        response = collection.generate.near_text(
            query=query,
            limit=limit,
            single_prompt=single_prompt,
            grouped_task=grouped_task,
            grouped_properties=['title', 'description',
                                # 'skills', 'difficulty',
                                'course_id']
        )
        return response

    def _roadmap_with_prompt(self, collection_name, query, knowledge, limit=10):
        """Search for courses, ask Cohere for a roadmap, return ``(response, prompt)``.

        The prompt is returned alongside the response so that ``chat`` can seed
        its conversation history with exactly what the model was asked.
        """
        json_example = """
{
  "courses": [
    "SGDf_rbfmFSHlxI-Czzlz",
    "tG5v3O4lNIFc2uCnacPak",
    "HdWq9ue0JdwmwqSIN2OD_"
  ]
}
"""
        data = self.search_data(collection_name, query, limit=limit)
        courses = [
            {
                'id': obj.properties['course_id'],
                'title': obj.properties['title'],
                # Flatten newlines so each description stays on one line in the prompt.
                'description': obj.properties['description'].replace('\n', ' ').strip(),
            }
            for obj in data.objects
        ]
        task = f"""You are tasked with creating a roadmap for {query}.
This is the user description of what they know you should take it into account If the user already know something then the respond shpould not include it.
USER KNOWLEDGE: {knowledge}

This is a list of the avilable courses: {courses}.
Your task is to return a proper json object containg the ids of the relevant courses example: {json_example}
Make sure the json object is valid json and to respond only with course ids."""
        response = self.cohere.chat(
            model="command-r-plus-08-2024",
            messages=[
                {
                    "role": "user",
                    "content": task
                }
            ],
            temperature=0,
            response_format={"type": "json_object"},
        )
        return response, task

    def get_roadmap(self, collection_name, query, knowledge, limit=10):
        """Ask Cohere which of the retrieved courses form a roadmap for ``query``.

        Args:
            collection_name: Collection to search for candidate courses.
            query: Topic the roadmap is for; also used as the search query.
            knowledge: Free-text description of what the user already knows.
            limit: Maximum number of candidate courses to retrieve.

        Returns:
            The Cohere chat response (a single object, not a tuple).
        """
        response, _ = self._roadmap_with_prompt(
            collection_name, query, knowledge, limit)
        return response

    def chat(self, collection_name, query, knowledge, limit, weaviate_url="http://localhost:8080"):
        """Generator driving a roadmap conversation.

        The first value yielded is the Cohere roadmap response. Each value sent
        back in with ``.send(question)`` is answered through a LangChain
        ``ChatVectorDBChain`` and the chain result is yielded.

        Args:
            collection_name: Collection used for retrieval.
            query: Topic of the initial roadmap request.
            knowledge: Free-text description of what the user already knows.
            limit: Maximum number of candidate courses for the initial roadmap.
            weaviate_url: URL for the client handed to the LangChain vector store.
        """
        # NOTE(review): this opens a second client on every call and never
        # closes it; the caller-supplied self.client is not reused here.
        client = weaviate.Client(weaviate_url)

        vectorstore = Weaviate(client, collection_name, "description")
        # Named `llm` so the module-level `cohere` import is not shadowed.
        llm = Cohere(temperature=0,
                     cohere_api_key=cohere_key)

        qa = ChatVectorDBChain.from_llm(llm, vectorstore)
        response, prompt = self._roadmap_with_prompt(
            collection_name, query, knowledge, limit)
        query = yield response
        # The prompt already embeds the candidate course list, so the first
        # history entry is (what was asked, what the model answered).
        chat_history = [(prompt, response.message.content[0].text)]

        while True:
            # Pass a copy so the result handed to the caller is not mutated
            # by the append below.
            response = qa({"question": query, "chat_history": list(chat_history)})
            # Record the exchange before `query` is overwritten by the next
            # question, so each question stays paired with its own answer.
            chat_history.append((query, response["answer"]))
            query = yield response

    def get_by_uuid(self, collection_name, uuid):
        """Fetch a single object from the collection by its UUID."""
        collection = self.client.collections.get(collection_name)
        return collection.query.fetch_object_by_id(uuid)

    def delete_collection(self, collection_name):
        """Delete ``collection_name`` if it exists; otherwise only report it."""
        if self.client.collections.exists(collection_name):
            self.client.collections.delete(collection_name)
            print("Collection deleted")
        else:
            print("Collection does not exist")

    def delete_all(self):
        """Delete every collection on the connected instance."""
        self.client.collections.delete_all()

    def close(self):
        """Close the underlying Weaviate client."""
        self.client.close()

In [None]:
# Connect to a local Weaviate instance (the cloud connection is kept below,
# commented out, as an alternative).
# NOTE(review): the X-Cohere-Api-Key header is commented out here while the
# collection config later selects a Cohere generative module — confirm the key
# reaches Weaviate some other way if db.generate is used.
# cloud_client = weaviate.connect_to_wcs(
#     cluster_url=wcd_url,
#     auth_credentials=wvc.init.Auth.api_key(wcd_api_key),
#     headers={"X-Cohere-Api-Key": cohere_key}
# )
local_client = weaviate.connect_to_local(
    # skip_init_checks=True,
    # headers={"X-Cohere-Api-Key": cohere_key}
)

In [None]:
# Uses the VectorDB class defined in this notebook, which shadows the
# `from src import VectorDB` import in the first cell.
db = VectorDB(local_client)

In [None]:
# Name of the Weaviate collection every later cell operates on.
# The commented lines are alternative collection names.
# collection_name = "Coursera"
# collection_name = "Coursera2"
collection_name = "Courses"
# collection_name = "Courses2"
# collection_name = "Test"

In [None]:
# Load the course table; the first spreadsheet column becomes the index.
# df = pd.read_csv('./Coursera_id.csv', index_col=0)
df = pd.read_excel('./data_cleaning.xlsx', index_col=0)
df.head()

In [None]:
# Map the spreadsheet's display headers to the short property names used
# by the rest of the notebook. The rename is applied in place.
column_renames = {
    'Course Name': 'title',
    'University': 'university',
    'Difficulty Level': 'difficulty',
    'Course Rating': 'rating',
    'Course URL': 'url',
    'Course Description': 'description',
    'Skills': 'skills',
}
df.rename(columns=column_renames, inplace=True)
df.head()

In [None]:
import random
import string

# Alphabet for generated ids: letters, digits, underscore and hyphen (64 symbols).
_ID_ALPHABET = string.ascii_letters + string.digits + '_-'


def generate_unique_id(length=21):
    """Return a random id of ``length`` characters drawn from ``_ID_ALPHABET``.

    Uses the ``random`` module, so ids are not cryptographically secure.
    """
    return ''.join(random.choices(_ID_ALPHABET, k=length))


def generate_unique_ids(n, length=21):
    """Return a list of ``n`` distinct random ids, each ``length`` characters long.

    The order of the returned list is arbitrary (it comes from a set).

    Raises:
        ValueError: If ``n`` exceeds the number of distinct ids of that length,
            which would otherwise make the loop below spin forever.
    """
    if n > len(_ID_ALPHABET) ** length:
        raise ValueError(
            f"cannot generate {n} unique ids of length {length}")
    unique_ids = set()
    while len(unique_ids) < n:
        # set.add ignores duplicates, so no explicit membership test is needed.
        unique_ids.add(generate_unique_id(length))
    return list(unique_ids)

# Assuming df is your DataFrame and you want to generate IDs for all rows
# NOTE: no random seed is set in this cell, so re-running it assigns a
# different set of ids to the 'id' column each time.
num_rows = len(df)
df['id'] = generate_unique_ids(num_rows)


In [None]:
# Preview the frame with the new 'id' column.
df.head()

In [None]:
# Drop rows whose description repeats an earlier row, keeping the first
# occurrence. Done in place, so re-running the cell is a no-op.
df.drop_duplicates(['description'], inplace=True, keep='first')
df.head()

In [None]:
# Persist the renamed, id-tagged, de-duplicated frame (index included).
df.to_csv('./Coursera_id.csv')

In [None]:
# Collection settings passed to db.create_collection as keyword arguments:
# one named vector built from the title, description and skills properties,
# plus a Cohere generative module at temperature 0.
courses_vector = Configure.NamedVectors.text2vec_transformers(
    name="courses_vector",
    source_properties=["title", "description", 'skills']
)
config = {
    "vectorizer_config": [courses_vector],
    "generative_config": Configure.Generative.cohere(temperature=0),
}

In [None]:
# Reset step: ask the wrapper to remove the collection before it is recreated below.
db.delete_collection(collection_name)

In [None]:
# Create the collection with the vectorizer/generative config defined above.
db.create_collection(collection_name, **config)

In [None]:
# Show every collection currently defined on the instance.
db.client.collections.list_all()

In [None]:
# Convert the frame to {index value: {column: value}} — the shape
# VectorDB.insert_data iterates over.
# courses_dict = df.head().to_dict(orient='index')
# courses_dict
courses_dict = df.to_dict(orient='index')

In [None]:
# Insert every row; each row's index value is stored under the "course_id" property.
db.insert_data(collection_name, courses_dict, "course_id")

In [None]:
# Run a sample near-text search, collect id/title/description for each hit
# and print one aligned line per hit (id, title, links).
response = db.search_data(collection_name, "html css and js", limit=30)
documents = []
for obj in response.objects:
    props = obj.properties
    course_id = props['course_id']
    title = props['title']
    description = props['description'].replace('\n', ' ').strip()
    links = props['links']
    documents.append(
        {'id': course_id, 'data': {'title': title, 'description': description}}
    )
    # Multiplying a string by a negative count yields '', so long values
    # simply get no padding.
    id_pad = ' ' * (30 - len(course_id))
    title_pad = ' ' * (30 - len(title))
    print(course_id, id_pad, title, title_pad, links)
    # print(obj.properties['course_id'], ' ' * (30 - len(obj.properties['title'])), obj.properties['title'], ' ' * (30 - len(obj.properties['title'])), obj.properties['description'], ' ' * (800 - len(obj.properties['description'])), obj.properties['links'])

In [None]:
# NOTE(review): VectorDB.get_roadmap in this notebook returns a single response
# object, so this two-name unpacking looks stale — confirm and align it with the
# later `response = db.get_roadmap(...)` cell.
response, prompt = db.get_roadmap(collection_name, "frontend with react", "I have no prior knowlgde in react. But I already know HTML, CSS, and JavaScript", limit=30)

In [None]:
# Collect selected properties of every object in `response` as plain dicts.
# NOTE(review): requires `response` to expose `.objects` (as a Weaviate query
# result does) — confirm which call is expected to have produced it.
objs = [
    {
        'title': obj.properties['title'],
        'description': obj.properties['description'],
        'skills': obj.properties['skills'],
        'difficulty': obj.properties['difficulty'],
        'course_id': obj.properties['course_id'],
    }
    for obj in response.objects
]
objs

In [None]:
# Show the prompt followed by the collected course dicts.
print(f'{prompt}\n{objs}')

In [None]:
# db.chat is a generator function: this only creates the generator, no code
# inside it runs until it is advanced in the next cell.
chat = db.chat(collection_name, "frontend with react", "I know basic html and js I've worked on them multible timesbut I need a some more info on CSS", limit=30)

In [None]:
# Advance the generator to its first yield, which hands back the initial roadmap response.
response = next(chat)

In [None]:
# import json

# Send a follow-up question into the chat generator and save the yielded
# result to '_.json'. json.dumps runs before the file is opened, so a
# non-serializable result fails without creating/truncating the file.
response = chat.send("I see there is a course with name Writing CSS and id DfrCkbD-HEHwLymv10zb5. why did you not recomand this?")
json_data = json.dumps(response)
with open('_.json', 'w') as file:
    file.write(json_data)


In [None]:
# Request a roadmap with an empty "user knowledge" description.
response = db.get_roadmap(collection_name, "cyber securty", "", limit=30)

In [None]:
# Display the raw response object.
response

In [None]:
# Parse the model's JSON reply and print id, title and description for each
# recommended course, looking rows up in df by index label.
data_dict = json.loads(response.message.content[0].text)
courses = data_dict['courses']
for course_id in courses:
    try:
        title = df.loc[course_id]['title']
    except KeyError:
        # The model returned an id that is not in the frame; report in red and move on.
        print(f'\033[91mKey{course_id} not found\033[0m')
        continue
    description = df.loc[course_id]['description'].replace('\n', ' ').strip()
    id_pad = ' ' * (30 - len(course_id))
    title_pad = ' ' * (30 - len(title))
    print(course_id, id_pad, title, title_pad, description)

In [None]:
# Print the text of the first content item of the reply (the JSON parsed above).
print(response.message.content[0].text)

In [None]:
# NOTE(review): `.generated` and `.objects` are read here, but the preceding
# cells bind `response` to the result of db.get_roadmap, from which this
# notebook only ever reads `.message.content[0].text` — this cell looks like
# it expects a db.generate result instead; confirm.
# print(dir(response.objects[0]))
# print(response.objects[0].collection)
# print(response.objects[0].generated)
# print(response.objects[0].metadata)
# print(response.objects[0].properties)
# print(response.objects[0].references)
# print(response.objects[0].uuid)
# print(response.objects[0].vector)
print(response.generated)


# """I know HTML, CSS, JS, I worked on three projects using those languages."""


In [None]:
# Hard-coded sample of a roadmap reply (a list of course ids).
# The next cell rebinds data_dict from the live response.
data_dict = {
  "courses": [
    "tG5v3O4lNIFc2uCnacPak",
    "PCirR2QiFYO89Fm-Ev3o1",
    "wQSjQqwKHfn5RGPk34BWI",
    "hU5OCnEe3tG206xuGsVFd",
    "aqMaEY8gkKMikiqleV5EP",
    "vpko5Kyf6BZ5MHpxXOKaf",
    "8-lO-v6jCYYoklEJXULxN",
    "z1-eP4sV75GBEIdM4NvL9",
    "ruoFa3M4bUE3Dg6GXSiUI",
    "tObmzWpjsJtK4GWhx6pwB",
    "KMA7NkxFbPoUDtFnGBFnj",
    "yWG2VUkaF5IJVVut6AiSy",
    "ZhSuu2VArnzPDp6dPQQSC",
    "VlNNwIEDWqQXtqkHWJYzC",
    "fekyMpEnaGqjh1Cu4Nyc4",
    "zNFYAJaSq0YZXL5Rpx1NX",
    "z5AdThp9ByulmM9uekgm-",
    "R12sArWVpbIs_PHxBqVaR",
    "hkxw9jPGYphmjhTjw8766",
    "ODcfFEorkfJNupoQygM53",
    "qmTVMJDsEhNIkiwE_UTYu"
  ]
}

In [None]:

# import json

# json_string = '''
# {
#     "courses": [
#         "PCirR2QiFYO89Fm-Ev3o1",
#         "D1IXOBUrrXf5bXhVu9cmI",
#         "aqMaEY8gkKMikiqleV5EP",
#         "P82WFaTPgQEPNp5IIuZ1Y",
#         "fekyMpEnaGqjh1Cu4Nyc4",
#         "rImbMHLLfJwjf3l25vBkc",
#         "jHKCLfLml8oZyj4829gx0",
#         "5iJOE1QxMvf8BQ_8ssiI8",
#         "0NJDgfe6eMa7qPUOI6Eya",
#         "dEsTje8kfHwWjCI3zcgLC"
#     ]
# }
# '''



# Parse the reply text as JSON, replacing any earlier data_dict, and show it.
data_dict = json.loads(response.message.content[0].text)
print(data_dict)


In [None]:
# NOTE(review): `prompt` is only bound by the earlier two-name unpacking of
# db.get_roadmap — confirm it is still defined on a fresh run.
prompt

In [None]:
# List of recommended course ids from the parsed reply.
courses = data_dict['courses']

In [None]:
# Print id, title and description for every recommended course id,
# looking rows up in df by index label.
for course_id in courses:
    try:
        title = df.loc[course_id]['title']
    except KeyError:
        # Id not present in the frame; report in red and skip it.
        print(f'\033[91mKey{course_id} not found\033[0m')
        continue
    description = df.loc[course_id]['description'].replace('\n', ' ').strip()
    id_pad = ' ' * (30 - len(course_id))
    title_pad = ' ' * (30 - len(title))
    print(course_id, id_pad, title, title_pad, description)

# 1- DO you know CSS? 1, 2, 3, 4, 5 -> 5
# 2- DO you know HTML? 1, 2, 3, 4, 5 -> 5
# 3- DO you know JS? 1, 2, 3, 4, 5 -> 5


In [None]:
# Print each object's title and its generated text, separated by a rule.
# NOTE(review): needs a `response` exposing `.objects` with per-object
# `.generated` — presumably a db.generate(...) result; no cell above makes
# that call, so confirm where `response` should come from.
for obj in response.objects:
    print(obj.properties['title'])
    print(obj.generated)
    print('-'*100)
    # break

In [None]:
from langchain.vectorstores.weaviate import Weaviate  
from langchain.llms import OpenAI  
from langchain.chains import ChatVectorDBChain  
import weaviate  
  
  
# Stand-alone ChatVectorDBChain demo over the "PodClip" collection.
# The vector store needs a Weaviate client as its first argument, built here
# the same way VectorDB.chat builds it.
demo_client = weaviate.Client("http://localhost:8080")
vectorstore = Weaviate(demo_client, "PodClip", "content")

# Read the key from the environment, like the other credentials in this
# notebook, instead of embedding it in the source.
MyOpenAI = OpenAI(temperature=0.2,
    openai_api_key=os.getenv("OPENAI_API_KEY"))

qa = ChatVectorDBChain.from_llm(MyOpenAI, vectorstore)

chat_history = []

print("Welcome to the Weaviate ChatVectorDBChain Demo!")
print("Please enter a question or dialogue to get started!")

while True:
    query = input("")
    # An empty line ends the session so the cell can finish.
    if not query.strip():
        break
    result = qa({"question": query, "chat_history": chat_history})
    print(result["answer"])
    # Extend the history rather than replacing it, so earlier turns stay in context.
    chat_history.append((query, result["answer"]))
