In [None]:
# Azure ML Notebook: Load .docx Documents, Generate Embeddings & Store in PostgreSQL

# --- Cell 1: Install Required Libraries ---
# This installs the packages needed to read Word documents, connect to PostgreSQL, and call Azure OpenAI embeddings.
!pip install python-docx openai psycopg2-binary numpy



In [None]:
# --- Cell 2: Import Required Libraries ---
# Import necessary modules for handling documents, APIs, databases, and arrays.
from docx import Document
import openai
import psycopg2
import numpy as np
import os



In [None]:
# --- Cell 3: Helper Function to Extract Text from .docx Files ---
# This function reads and extracts all paragraphs from a .docx file into plain text.
def extract_text_from_docx(file_path):
    """Return the text of every paragraph in a .docx file, newline-separated.

    Args:
        file_path: Location of the Word document; passed straight to ``Document``.

    Returns:
        One string containing each paragraph's ``text``, joined with ``"\\n"``
        in the order ``paragraphs`` yields them. Empty when there are no
        paragraphs.
    """
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)



In [None]:
# --- Cell 4: Helper Function to Generate Embeddings ---
# This function takes text and sends it to Azure OpenAI to generate a vector embedding.
def get_embedding(text, engine="text-embedding-ada-002"):
    """Request a vector embedding for ``text`` from Azure OpenAI.

    Args:
        text: The string to embed.
        engine: Name passed as ``engine`` to ``openai.Embedding.create``. With
            ``openai.api_type = "azure"`` this is the name of your embedding
            deployment, which you choose when creating it, so override it if
            your deployment is not called ``"text-embedding-ada-002"``.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.
    """
    response = openai.Embedding.create(
        input=text,
        engine=engine,
    )
    # A single input was sent, so only the first result is read.
    return response['data'][0]['embedding']



In [None]:
# --- Cell 5: Set Up Azure OpenAI API Configuration ---
# Credentials are read from environment variables so real secrets never have to be saved in the notebook.
# Set AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT before starting the kernel; the placeholder
# fallbacks below are not valid credentials and only show the expected format.
openai.api_type = "azure"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY", "<YOUR_AZURE_OPENAI_KEY>")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT", "https://<YOUR-RESOURCE-NAME>.openai.azure.com/")
# NOTE(review): this is a preview API version; confirm your resource still accepts it,
# or override it with AZURE_OPENAI_API_VERSION.
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION", "2023-03-15-preview")



In [None]:
# --- Cell 6: Load Word Documents ---
# Add your uploaded .docx filenames below. This code will load and extract content.
files = [
    "Fleet maintenance best practices across industries.docx",
    "Top 10 tips on Fleet Preventive Maintenance.docx",
    "Fleet vehicle maintenance A comprehensive guide.docx",
    "Fleet_Maintenance_Tips.docx"
]

# Pair every filename with the text extracted from it, in the order listed above.
documents = dict(zip(files, map(extract_text_from_docx, files)))



In [None]:
# --- Cell 7: Generate Embeddings for Each Document ---
# Converts the text from each document to a 1536-dimension vector using Azure OpenAI.
# One get_embedding call per document, keyed by the same filename as `documents`.
embeddings = dict(zip(documents, map(get_embedding, documents.values())))



In [None]:
# --- Cell 8: Connect to PostgreSQL Flexible Server ---
# Connection settings come from the standard libpq environment variables (PGDATABASE, PGUSER,
# PGPASSWORD, PGHOST, PGPORT) so the password never has to be saved in the notebook; the
# placeholder fallbacks only show what each value should look like.
conn = psycopg2.connect(
    dbname=os.getenv("PGDATABASE", "<YOUR_DB_NAME>"),
    user=os.getenv("PGUSER", "<YOUR_USERNAME>"),
    password=os.getenv("PGPASSWORD", "<YOUR_PASSWORD>"),
    host=os.getenv("PGHOST", "<YOUR_POSTGRESQL_HOST>"),
    port=os.getenv("PGPORT", "5432"),  # Default port unless changed
    # Refuse unencrypted connections unless PGSSLMODE explicitly says otherwise.
    sslmode=os.getenv("PGSSLMODE", "require"),
)
cur = conn.cursor()



In [None]:
# --- Cell 9: Create Table for Storing Documents and Embeddings ---
# Creates the table with a vector column using the pgvector extension (which must be enabled on the server).
# Both statements use IF NOT EXISTS, so running this cell again is harmless.
schema_sql = """
CREATE EXTENSION IF NOT EXISTS vector;

CREATE TABLE IF NOT EXISTS maintenance_documents (
    id SERIAL PRIMARY KEY,
    filename TEXT NOT NULL,
    content TEXT NOT NULL,
    embeddings vector(1536)
);
"""
cur.execute(schema_sql)
conn.commit()



# --- Cell 10: Insert Document Content and Embeddings into the Table ---

In [None]:

# Store each document and its embedding in the PostgreSQL table as a single transaction.
# NOTE: the table as created in Cell 9 has no uniqueness constraint on filename, so
# running this cell again appends duplicate rows.
insert_sql = "INSERT INTO maintenance_documents (filename, content, embeddings) VALUES (%s, %s, %s)"
try:
    for filename, text in documents.items():
        # Round-trip through NumPy to get a plain list of Python floats for psycopg2 to adapt.
        embedding_vector = np.array(embeddings[filename]).tolist()
        cur.execute(insert_sql, (filename, text, embedding_vector))
    conn.commit()
except Exception:
    # Discard the partial batch so a failed load leaves nothing pending.
    conn.rollback()
    raise
finally:
    # Release the cursor and connection whether or not the load succeeded.
    cur.close()
    conn.close()

print("✅ All documents uploaded and embeddings stored successfully in PostgreSQL!")
