In [25]:
import os
from dotenv import load_dotenv
from sqlalchemy import make_url
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore
import pandas as pd
import uuid
from typing import Dict, List
from sqlalchemy import make_url
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.vector_stores.postgres import PGVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from tqdm import tqdm
from llama_index.core.node_parser import SimpleNodeParser

In [26]:
# load environment variables
load_dotenv()

# may need to change this to a different name
openai_api_key = os.getenv("OPENAI_API_KEY")
connection_string = os.getenv("DB_CONNECTION") 

url = make_url(connection_string)

In [27]:
def generate_unique_id(name: str, school: str) -> str:
    """Generate a unique ID based on name and school"""
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, f"{name}-{school}"))

def create_embedding_text(row: pd.Series) -> str:
    """Create a concatenated text string for embedding"""
    # Start with required fields using column names
    text_parts = [
        f"Major: {row['Major(s) and Minor(s)']}", 
        f"Degree: {row['Degree(s) Earned']}", 
        f"Job Title: {row['Current Job Title']}", 
        f"Industry: {row['Industry']}"
    ]
    
    # Add comments without column name if they exist
    if not pd.isna(row['Comment']):
        text_parts.append(str(row['Comment']))
        
    return "; ".join(text_parts)

def prepare_metadata(row: pd.Series) -> Dict:
    """Create metadata dictionary for each record"""
    metadata = {
        "grad_year": int(row["Graduation Year"]),
        "degree": row["Degree(s) Earned"],
        "major": row["Major(s) and Minor(s)"],
        "job_title": row["Current Job Title"],
        "industry": row["Industry"],
        "school": row["School"],
        "comments": None if pd.isna(row['Comment']) else row['Comment'],
        "name": row["Name/Identifier"],
        "source_identifier": f"{row['Name/Identifier']}-{row['School']}"
    }
    return metadata

def process_alumni_data(df: pd.DataFrame) -> List[Document]:
    """Process alumni data and return LlamaIndex documents"""
    documents = []
    
    for _, row in tqdm(df.iterrows(), total=len(df)):
        original_id = generate_unique_id(row["Name/Identifier"], row["School"])
        
        metadata = prepare_metadata(row)
        metadata["original_id"] = original_id
        
        doc = Document(
            text=create_embedding_text(row),
            metadata=metadata
        )
        documents.append(doc)
    
    return documents


In [28]:
# Process the data
df_raw_1 = pd.read_excel("..\\data\\raw_alumini_data_billy.xlsx")
documents_raw_1 = process_alumni_data(df_raw_1)

100%|██████████| 51/51 [00:00<00:00, 10198.79it/s]


In [29]:
parser = SimpleNodeParser()
nodes = parser.get_nodes_from_documents(documents_raw_1)
print(f'{len(nodes)} nodes created')

51 nodes created


In [30]:

# Create embedding model
embedding_model = OpenAIEmbedding(model="text-embedding-3-large")

# Set up database connection and vector store
url = make_url(connection_string)
vector_store = PGVectorStore.from_params(
    database='ai_advising_db',
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="alumni_records",
    embed_dim=3072,  # openai embedding dimension
    hybrid_search=True,  # Enable hybrid search
    text_search_config="english",
)

# Create storage context and index
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [31]:
documents_raw_1[0]

Document(id_='da18da8d-79b7-4e3e-bd21-83b4c4235ac9', embedding=None, metadata={'grad_year': 2012, 'degree': 'M.S', 'major': 'Financial Engineering', 'job_title': 'Engineering Leader', 'industry': 'Entertainment Providers', 'school': 'CGU', 'comments': None, 'name': 'Yunzhi Z.', 'source_identifier': 'Yunzhi Z.-CGU', 'original_id': 'bbec7767-7721-5682-99e6-463bfdaf93b4'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='Major: Financial Engineering; Degree: M.S; Job Title: Engineering Leader; Industry: Entertainment Providers', mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [32]:
nodes[0]

TextNode(id_='1670351c-854f-4e13-90da-f4d702919173', embedding=None, metadata={'grad_year': 2012, 'degree': 'M.S', 'major': 'Financial Engineering', 'job_title': 'Engineering Leader', 'industry': 'Entertainment Providers', 'school': 'CGU', 'comments': None, 'name': 'Yunzhi Z.', 'source_identifier': 'Yunzhi Z.-CGU', 'original_id': 'bbec7767-7721-5682-99e6-463bfdaf93b4'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='da18da8d-79b7-4e3e-bd21-83b4c4235ac9', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'grad_year': 2012, 'degree': 'M.S', 'major': 'Financial Engineering', 'job_title': 'Engineering Leader', 'industry': 'Entertainment Providers', 'school': 'CGU', 'comments': None, 'name': 'Yunzhi Z.', 'source_identifier': 'Yunzhi Z.-CGU', 'original_id': 'bbec7767-7721-5682-99e6-463bfdaf93b4'}, hash='162b5e64f1babd24d9dbb85d5c6c0f066b16be1ab9590518fd26ee3ef1ba5217')}, text='Major: Financial Engineering;

In [33]:
# Create and store index
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    embed_model=embedding_model,
    show_progress=True
)

Generating embeddings: 100%|██████████| 51/51 [00:01<00:00, 48.30it/s]
