In [1]:
!pip install openai faiss-cpu numpy groq langchain langchain-community langchain-openai langchain-core langchain-groq streamlit jq



In [1]:
import os
import streamlit as st
import json
import numpy as np
import requests
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import JSONLoader
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_core.prompts import ChatPromptTemplate

In [2]:
from openai import OpenAI
os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY"
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import pickle
FAISS_INDEX_PATH = "faiss_index"
PROCESSED_FILES_PATH = "processed_files.pkl"

# Step 1: Load all JSON files from the folder
def load_json_files(folder_path):
    documents = []
    for file_name in os.listdir(folder_path):
        if file_name.endswith(".json"):
            file_path = os.path.join(folder_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
                content = json.dumps(data, indent=2)
                doc = Document(page_content=content, metadata={"id": os.path.splitext(file_name)[0]})
                documents.append(doc)
    return documents

# Folder containing the JSON files
folder_path = "final_user_profiles"
documents = load_json_files(folder_path)

In [4]:
def load_vector_store():
    if os.path.exists(FAISS_INDEX_PATH):
        # Enable dangerous deserialization
        return FAISS.load_local(FAISS_INDEX_PATH, OpenAIEmbeddings(model="text-embedding-3-large"), allow_dangerous_deserialization=True)
    else:
        return None

In [5]:
def load_processed_files():
    if os.path.exists(PROCESSED_FILES_PATH):
        try:
            with open(PROCESSED_FILES_PATH, "rb") as f:
                processed_files = pickle.load(f)
                if isinstance(processed_files, set):
                    return processed_files
                else:
                    print("Warning: Processed files data is not a set. Resetting to an empty set.")
                    return set()
        except (EOFError, pickle.UnpicklingError):
            print("Warning: Processed files pickle file is empty or corrupted. Resetting to an empty set.")
            return set()
    else:
        return set()

In [6]:
def save_processed_files(processed_files):
    with open(PROCESSED_FILES_PATH, "wb") as f:
        pickle.dump(processed_files, f)

In [7]:
def update_vector_store(folder_path):
    documents = load_json_files(folder_path)
    processed_files = load_processed_files()
    vector_store = load_vector_store()
    
    if vector_store is None:
        # Initialize a new vector store if it doesn't exist
        vector_store = FAISS.from_documents(documents, OpenAIEmbeddings(model="text-embedding-3-large"))
    
    # Identify new files
    new_documents = [doc for doc in documents if doc.metadata["id"] not in processed_files]
    
    if new_documents:
        print(f"Embedding {len(new_documents)} new files...")
        vector_store.add_documents(new_documents)
        
        # Update processed files list
        processed_files.update(doc.metadata["id"] for doc in new_documents)
        save_processed_files(processed_files)
        
        # Save the updated vector store
        vector_store.save_local("faiss_index")
    else:
        print("No new files to embed.")

    return vector_store

In [8]:
def find_similar_profiles(profile_id, k=5, folder_path="final_user_profiles"):
    vector_store = update_vector_store(folder_path)
    processed_files = load_processed_files()
    
    if profile_id not in processed_files:
        raise ValueError(f"Profile with ID '{profile_id}' not found in the embedded data.")
    
    # Retrieve the input document
    documents = load_json_files(folder_path)
    query_doc = next((doc for doc in documents if doc.metadata["id"] == profile_id), None)
    if not query_doc:
        raise ValueError(f"Profile with ID '{profile_id}' not found in JSON files.")
    
    # Retrieve similar profiles
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": k + 1})
    similar_docs = retriever.get_relevant_documents(query_doc.page_content)
    
    # Filter out the input profile itself
    similar_ids = [doc.metadata["id"] for doc in similar_docs if doc.metadata["id"] != profile_id][:k]
    return similar_ids

In [12]:
folder_path = "final_user_profiles"  # Update with your folder path
input_profile_id = "ankurc07"  # Replace with your profile ID
top_k = 15  # Number of similar profiles to retrieve

similar_profiles = find_similar_profiles(input_profile_id, k=top_k, folder_path=folder_path)
print(f"Top {top_k} similar profiles for ID '{input_profile_id}': {similar_profiles}")

Embedding 1 new files...
Top 15 similar profiles for ID 'ankurc07': ['aaditya-sanjay-b-a62630a0', 'aashita-jindal', 'diksha-mittal', 'aaryan-jaiswal', 'aastha-r', 'dhawal-barchha-2a9b031b6', 'dipramit-pal', 'dishansraao', 'aamir-wahid-723ab8a5', 'aaryan-sood-b4148216b', 'dipanki-mukherjee-pabreja-8a0198204', 'aastha-agarwal-iitiim', 'dibyangshu-sahoo-26038272', 'dhruv-singhal-709795118']
