In this notebook, we prepare ALMA to recommend helpful video clips by uploading timestamped YouTube segments into a Pinecone vector database. Each video chunk is semantically indexed using OpenAI embeddings, so ALMA can later retrieve and suggest the most relevant clips during conversations. This forms the foundation of ALMA’s ability to offer users meaningful, bite-sized video content, which it should suggest only when a clip truly supports the topic being discussed.

In [None]:
import os
import json
import glob
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document 
from pinecone import Pinecone, ServerlessSpec

# === Load environment ===
# Copy variables from a local .env file (if present) into the process
# environment, then read the two API keys this cell needs.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Fail fast with an actionable message instead of letting the client
# constructors below fail later with a less direct error.
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("PINECONE_API_KEY", PINECONE_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Set them in the environment or in a .env file."
    )

# === Init Embeddings ===
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# === Init Pinecone & Create Index if Needed ===
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "alma-video-index"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): must equal the length of the vectors produced by
        # `embeddings`; 1536 presumably matches the default OpenAI embedding
        # model — confirm if a different model is configured.
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    print(f"✅ Created index: {index_name}")
else:
    print(f"ℹ️ Index '{index_name}' already exists.")

# === Time Converter ===
def to_seconds(ts):
    """Convert a timestamp into a whole number of seconds.

    Args:
        ts: Either a string in "HH:MM:SS", "MM:SS" or "SS" form, or a plain
            int/float number of seconds. Surrounding whitespace is ignored
            and a fractional last field (e.g. "00:01:02.5") is truncated
            to whole seconds.

    Returns:
        int: The offset in seconds (hours * 3600 + minutes * 60 + seconds).

    Raises:
        ValueError: If ``ts`` is not a string or number, has more than three
            colon-separated fields, or contains a non-numeric field.
    """
    try:
        # Already a number of seconds: just truncate to an integer.
        if isinstance(ts, (int, float)):
            return int(ts)
        if not isinstance(ts, str):
            raise ValueError(f"Unrecognized time format: {ts}")

        fields = ts.strip().split(":")
        if len(fields) > 3:
            raise ValueError(f"Unrecognized time format: {ts}")

        *leading, last = fields
        values = [int(field) for field in leading]
        try:
            values.append(int(last))
        except ValueError:
            # Only the seconds field may carry a fractional part.
            values.append(int(float(last)))
    except OverflowError:
        # int(float("inf")) raises OverflowError; report it like any other
        # malformed timestamp so callers only need to handle ValueError.
        raise ValueError(f"Unrecognized time format: {ts}") from None

    # Left-pad with zeros so "SS" and "MM:SS" are handled like "HH:MM:SS".
    h, m, s = [0] * (3 - len(values)) + values
    return h * 3600 + m * 60 + s

# === Load All JSON Video Segments ===
def load_video_chunks_from_json(folder_path):
    """Build one ``Document`` per non-empty video segment found in a folder.

    Every ``*.json`` file directly inside ``folder_path`` is expected to hold
    a JSON list of segment objects. A segment needs a non-blank ``"text"``
    plus ``"video_id"``, ``"video_title"`` and ``"start_time"``; ``"topic"``,
    ``"end_time"`` and ``"tags"`` are optional. Loading is best-effort:
    unreadable files, files whose top level is not a list, and malformed
    segments are reported on stdout and skipped.

    Args:
        folder_path: Directory containing the segment JSON files.

    Returns:
        list: ``Document`` objects whose ``page_content`` is the stripped
        segment text and whose metadata includes a ``video_url`` that links
        to the segment's start time.
    """
    all_docs = []
    # glob order is platform-dependent; sort so runs are reproducible.
    json_files = sorted(glob.glob(os.path.join(folder_path, "*.json")))

    print(f"🗂 Found {len(json_files)} JSON files")

    for file_path in json_files:
        # open() is inside the try so an unreadable file is skipped as well.
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"❌ Failed to load {file_path}: {e}")
            continue

        # Iterating a dict would yield its keys, not segments.
        if not isinstance(data, list):
            print(
                f"❌ Failed to load {file_path}: expected a JSON list of "
                f"segments, got {type(data).__name__}"
            )
            continue

        print(f"📄 Processing '{os.path.basename(file_path)}' with {len(data)} segments...")

        for entry in data:
            if not isinstance(entry, dict):
                print(
                    "⚠️ Skipping segment due to error: expected a JSON object, "
                    f"got {type(entry).__name__}"
                )
                continue

            # A missing, null or non-string "text" counts as empty.
            raw_text = entry.get("text")
            text = raw_text.strip() if isinstance(raw_text, str) else ""
            if not text:
                continue  # skip empty

            try:
                video_id = entry["video_id"]
                title = entry["video_title"]
                start_time = entry["start_time"]
                seconds = to_seconds(start_time)

                metadata = {
                    "video_title": title,
                    "video_id": video_id,
                    "topic": entry.get("topic", ""),
                    "start_time": start_time,
                    "end_time": entry.get("end_time", ""),
                    "tags": entry.get("tags", []),
                    # Deep link that opens the video at the segment start.
                    "video_url": f"https://www.youtube.com/watch?v={video_id}&t={seconds}s"
                }

                doc = Document(page_content=text, metadata=metadata)
                all_docs.append(doc)

            # Deliberately broad: one malformed segment must never abort
            # the whole batch.
            except Exception as e:
                print(f"⚠️ Skipping segment due to error: {e}")

    return all_docs

# === Set Your Folder Path Here ===
# The optional ALMA_CHUNKS_DIR environment variable overrides the default
# location below, so the notebook can run on another machine without edits.
folder_path = os.getenv("ALMA_CHUNKS_DIR") or r"C:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\TaggedChunks"
video_docs = load_video_chunks_from_json(folder_path)

if not video_docs:
    # Most likely a wrong folder path; say so instead of reporting a
    # successful upload of nothing.
    print(f"⚠️ No segments were loaded from '{folder_path}' — check the folder path.")

print(f"📦 Loaded {len(video_docs)} segments. Uploading...")

# === Upload to Pinecone ===
# Derive a stable ID per segment so that re-running this cell overwrites the
# same vectors instead of adding duplicates under fresh random IDs.
# NOTE(review): assumes (video_id, start_time) is unique per segment and
# ASCII-only, as Pinecone requires for record IDs — confirm against the data.
segment_ids = [
    f"{doc.metadata['video_id']}-{doc.metadata['start_time']}"
    for doc in video_docs
]

vectorstore = PineconeVectorStore.from_documents(
    documents=video_docs,
    embedding=embeddings,
    index_name=index_name,
    ids=segment_ids,
)

print(f"✅ Uploaded {len(video_docs)} video segments to '{index_name}'")


ℹ️ Index 'alma-video-index' already exists.
🗂 Found 6 JSON files
📄 Processing '4_Small_Habits_mMHNvy9pFj0_tagged.json' with 7 segments...
📄 Processing 'Foods_Control_Our_Moods_Q4qWzbP0q7I_tagged.json' with 11 segments...
📄 Processing 'Master_Your_Sleep_lIo9FcrljDk_tagged.json' with 8 segments...
📄 Processing 'Powerful_Happiness_Hacks_14-DJFPm1_4_tagged.json' with 15 segments...
📄 Processing 'Young_Forever_gO_x3gnXBzg_tagged.json' with 9 segments...
📄 Processing 'Your_Diet_is_Changing_Your_Brain_NbymuYEEqlE_tagged.json' with 8 segments...
📦 Loaded 58 segments. Uploading...
✅ Uploaded 58 video segments to 'alma-video-index'
