## **Extraction of the transcripts**



##### Imports.

In [1]:
import ast
import json
import os
import re
from uuid import uuid4

import openai
import pinecone
import tiktoken
from dotenv import load_dotenv, find_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.vectorstores import Pinecone as LangchainPinecone
from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from youtube_transcript_api import YouTubeTranscriptApi






##### In this first part we are going to collect our data from the YouTube videos. We get the transcripts and name them with the video title.

In [2]:
# Source videos: each entry pairs a YouTube watch URL ("url") with the short
# title ("Name") that is used when naming the saved transcript file.
video_data = [
    {
        "url": "https://www.youtube.com/watch?v=lIo9FcrljDk",
        "Name": "Master Your Sleep",
    },
    {
        "url": "https://www.youtube.com/watch?v=NbymuYEEqlE",
        "Name": "Your Diet is Changing Your Brain",
    },
    {
        "url": "https://www.youtube.com/watch?v=Q4qWzbP0q7I",
        "Name": "Foods Control Our Moods",
    },
    {
        "url": "https://www.youtube.com/watch?v=mMHNvy9pFj0",
        "Name": "4 Small Habits",
    },
    {
        "url": "https://www.youtube.com/watch?v=gO_x3gnXBzg",
        "Name": "Young Forever",
    },
    {
        "url": "https://www.youtube.com/watch?v=14-DJFPm1_4",
        "Name": "Powerful Happiness Hacks",
    },
]

# extract video ID from YouTube URL
def extract_video_id(url):
    """Return the 11-character YouTube video ID contained in ``url``.

    Recognised forms are the watch-page query parameter (``...?v=<id>``) and
    the path-based forms ``youtu.be/<id>``, ``/embed/<id>`` and
    ``/shorts/<id>``.

    Args:
        url: The video URL. Anything that is not a string yields ``None``
            instead of raising.

    Returns:
        The video ID as a string, or ``None`` when no ID can be found.
    """
    if not isinstance(url, str):
        return None

    # The query-parameter pattern is tried first so that URLs which already
    # resolved keep resolving to the same ID.
    patterns = (
        r"v=([a-zA-Z0-9_-]{11})",
        r"(?:youtu\.be/|/embed/|/shorts/)([a-zA-Z0-9_-]{11})",
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

# clean transcript text
def clean_text(transcript):
    """Flatten transcript entries into a single line of text.

    The 'text' value of every entry is joined with single spaces, embedded
    newlines are turned into spaces, and leading/trailing whitespace is
    trimmed from the result.
    """
    pieces = (item['text'] for item in transcript)
    joined = " ".join(pieces)
    return joined.replace("\n", " ").strip()

# Folder (relative to the working directory) that receives one plain-text
# transcript per video; it is created on first use.
output_folder = "transcripts"
os.makedirs(output_folder, exist_ok=True)

# Fetch, clean and store every transcript. A failure for one video is
# reported and the loop carries on with the next entry.
for video in video_data:
    video_id = extract_video_id(video['url'])
    try:
        transcript = YouTubeTranscriptApi.get_transcript(
            video_id,
            languages=['en'],
        )
        cleaned = clean_text(transcript)

        # File name layout: <Name with underscores>_<video id>.txt
        filename = f"{video['Name'].replace(' ', '_')}_{video_id}.txt"
        file_path = os.path.join(output_folder, filename)

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(cleaned)
    except Exception as e:
        print(f" Error with {video['url']}: {e}")
    else:
        # Only reached when fetching and writing both succeeded.
        print(f" Transcript saved: {file_path}")

 Transcript saved: transcripts\Master_Your_Sleep_lIo9FcrljDk.txt
 Transcript saved: transcripts\Your_Diet_is_Changing_Your_Brain_NbymuYEEqlE.txt
 Transcript saved: transcripts\Foods_Control_Our_Moods_Q4qWzbP0q7I.txt
 Transcript saved: transcripts\4_Small_Habits_mMHNvy9pFj0.txt
 Transcript saved: transcripts\Young_Forever_gO_x3gnXBzg.txt
 Transcript saved: transcripts\Powerful_Happiness_Hacks_14-DJFPm1_4.txt


##### Now that we have the transcripts, we need to **chunk** the text to get more friendly embeddings for our vector store.
We use the timestamps of the videos so we get meaningful chunks, each one already tied to a specific topic.
This is more accurate, and much easier to label, search, and structure.

First video: Master your sleep

In [3]:
# Helper Functions ===

def extract_video_id(url):
    """Pull the 11-character video ID that follows ``v=`` in a YouTube URL.

    Returns the ID string, or None when the URL has no such parameter.
    """
    found = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
    if not found:
        return None
    return found.group(1)

def hms_to_seconds(hms: str) -> float:
    """Convert a ``MM:SS`` or ``HH:MM:SS`` timestamp into total seconds.

    Args:
        hms: Colon-separated timestamp whose fields are integers,
            e.g. ``"01:11"`` or ``"00:01:11"``.

    Returns:
        The number of seconds represented by the timestamp.

    Raises:
        ValueError: If a field is not an integer or the timestamp does not
            have exactly two or three fields.
    """
    parts = [int(field) for field in hms.split(':')]
    if len(parts) == 2:  # MM:SS format
        m, s = parts
        return m * 60 + s
    if len(parts) == 3:  # HH:MM:SS format
        h, m, s = parts
        return h * 3600 + m * 60 + s
    raise ValueError(f"Invalid time format: {hms}")

# Video Info and Timestamp Segments

video = {
    "url": "https://www.youtube.com/watch?v=lIo9FcrljDk",
    "Name": "Master Your Sleep"
}
video_id = extract_video_id(video["url"])
video_title = video["Name"]

# Hand-written topic boundaries; each entry becomes at most one chunk.
timestamp_segments = [
    {"start": "00:00:00", "end": "00:01:11", "topic": "Introduction to Sleep & Wakefulness"},
    {"start": "00:01:11", "end": "00:03:30", "topic": "The Science of Sleep: Adenosine Explained"},
    {"start": "00:03:30", "end": "00:05:08", "topic": "Circadian Rhythms: The Body’s Internal Clock"},
    {"start": "00:05:08", "end": "00:10:16", "topic": "The Role of Cortisol & Melatonin"},
    {"start": "00:10:16", "end": "00:14:12", "topic": "Maximizing Morning Light Exposure"},
    {"start": "00:14:12", "end": "00:16:08", "topic": "Other Factors Influencing Circadian Rhythms"},
    {"start": "00:16:08", "end": "00:24:31", "topic": "The Impact of Light on Sleep Quality"},
    {"start": "00:24:31", "end": "00:28:00", "topic": "Napping & Non-Sleep Deep Rest"},
]

# Store numeric boundaries (in seconds) next to the textual timestamps.
for segment in timestamp_segments:
    segment["start_sec"] = hms_to_seconds(segment["start"])
    segment["end_sec"] = hms_to_seconds(segment["end"])

# === Download Transcript ===
# On failure the error is printed and an empty transcript is used, so no
# chunks are produced below.
try:
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
except Exception as e:
    print(f" Error fetching transcript: {e}")
    transcript = []

# === Chunking ===
# A transcript line belongs to a segment when its "start" value falls inside
# the half-open interval [start_sec, end_sec).
chunked_output = []

for segment in timestamp_segments:
    chunk_lines = [
        line["text"] for line in transcript
        if segment["start_sec"] <= line["start"] < segment["end_sec"]
    ]
    full_text = " ".join(chunk_lines).strip()

    # Segments that matched no transcript text are left out of the output.
    if full_text:
        chunked_output.append({
            "video_id": video_id,
            "video_title": video_title,
            "topic": segment["topic"],
            "start_time": segment["start"],
            "end_time": segment["end"],
            "text": full_text
        })

# === Save JSON ===

output_path = os.path.abspath(os.path.join("..", "Chunks", f"{video_title.replace(' ', '_')}_{video_id}.json"))

# open() does not create missing directories, so make sure the Chunks folder
# exists before writing (matters on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(chunked_output, f, indent=2, ensure_ascii=False)

print(f" Segmented chunks saved to: {output_path}")


 Segmented chunks saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\Chunks\Master_Your_Sleep_lIo9FcrljDk.json


Second video: Your Diet is Changing Your Brain

In [4]:
# Helper Functions ===

def extract_video_id(url):
    """Pull the 11-character video ID that follows ``v=`` in a YouTube URL.

    Returns the ID string, or None when the URL has no such parameter.
    """
    found = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
    if not found:
        return None
    return found.group(1)

def hms_to_seconds(hms: str) -> float:
    """Turn a ``MM:SS`` or ``HH:MM:SS`` timestamp into a number of seconds.

    Raises ValueError when a field is not an integer or when the timestamp
    does not consist of exactly two or three colon-separated fields.
    """
    fields = [int(piece) for piece in hms.split(':')]
    if len(fields) not in (2, 3):
        raise ValueError(f"Invalid time format: {hms}")

    # Fold left to right: every further field is worth 1/60 of the previous.
    total = 0
    for value in fields:
        total = total * 60 + value
    return total

# Video Info and Timestamp Segments

video = {
    "url": "https://www.youtube.com/watch?v=NbymuYEEqlE",
    "Name": "Your Diet is Changing Your Brain"
}
video_id = extract_video_id(video["url"])
video_title = video["Name"]

# Hand-written topic boundaries; each entry becomes at most one chunk.
timestamp_segments_2 = [
    {"start": "00:00", "end": "00:30", "topic": "Ultra Processed Foods & Health"},
    {"start": "00:30", "end": "01:19", "topic": "Mental Health Statistics"},
    {"start": "01:19", "end": "01:52", "topic": "Examples of UPFs"},
    {"start": "01:52", "end": "03:59", "topic": "Public Health Change"},
    {"start": "03:59", "end": "04:47", "topic": "Challenges in Changing Public Behavior"},
    {"start": "04:47", "end": "07:52", "topic": "Industry Influence & Misinformation"},
    {"start": "07:52", "end": "09:22", "topic": "The Need for Systemic Change"},
    {"start": "09:22", "end": "10:30", "topic": "Potential Solutions"},
]

# Store numeric boundaries (in seconds) next to the textual timestamps.
for segment in timestamp_segments_2:
    segment["start_sec"] = hms_to_seconds(segment["start"])
    segment["end_sec"] = hms_to_seconds(segment["end"])

# === Download Transcript ===
# On failure the error is printed and an empty transcript is used, so no
# chunks are produced below.
try:
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
except Exception as e:
    print(f" Error fetching transcript: {e}")
    transcript = []

# === Chunking ===
# A transcript line belongs to a segment when its "start" value falls inside
# the half-open interval [start_sec, end_sec).
chunked_output = []

for segment in timestamp_segments_2:
    chunk_lines = [
        line["text"] for line in transcript
        if segment["start_sec"] <= line["start"] < segment["end_sec"]
    ]
    full_text = " ".join(chunk_lines).strip()

    # Segments that matched no transcript text are left out of the output.
    if full_text:
        chunked_output.append({
            "video_id": video_id,
            "video_title": video_title,
            "topic": segment["topic"],
            "start_time": segment["start"],
            "end_time": segment["end"],
            "text": full_text
        })

# === Save JSON ===

output_path = os.path.abspath(os.path.join("..", "Chunks", f"{video_title.replace(' ', '_')}_{video_id}.json"))

# open() does not create missing directories, so make sure the Chunks folder
# exists before writing (matters on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(chunked_output, f, indent=2, ensure_ascii=False)

print(f" Segmented chunks saved to: {output_path}")

 Segmented chunks saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\Chunks\Your_Diet_is_Changing_Your_Brain_NbymuYEEqlE.json


Third video: Foods Control Our Moods

In [5]:
# Helper Functions ===

def extract_video_id(url):
    """Pull the 11-character video ID that follows ``v=`` in a YouTube URL.

    Returns the ID string, or None when the URL has no such parameter.
    """
    found = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
    if not found:
        return None
    return found.group(1)

def hms_to_seconds(hms: str) -> float:
    """Convert a ``MM:SS`` or ``HH:MM:SS`` timestamp into total seconds.

    Args:
        hms: Colon-separated timestamp whose fields are integers,
            e.g. ``"02:30"`` or ``"00:02:30"``.

    Returns:
        The number of seconds represented by the timestamp.

    Raises:
        ValueError: If a field is not an integer or the timestamp does not
            have exactly two or three fields.
    """
    parts = [int(field) for field in hms.split(':')]
    if len(parts) == 2:  # MM:SS format
        m, s = parts
        return m * 60 + s
    if len(parts) == 3:  # HH:MM:SS format
        h, m, s = parts
        return h * 3600 + m * 60 + s
    raise ValueError(f"Invalid time format: {hms}")

# Video Info and Timestamp Segments

video = {
    "url": "https://www.youtube.com/watch?v=Q4qWzbP0q7I",
    "Name": "Foods Control Our Moods"
}
video_id = extract_video_id(video["url"])
video_title = video["Name"]


# Timestamps and topics for "Foods Control Our Moods"; each entry becomes at
# most one chunk.
timestamp_segments_3 = [
    {"start": "00:00:00", "end": "00:02:30", "topic": "Huberman Lab Essentials; Emotions, Food & Nutrition"},
    {"start": "00:02:30", "end": "00:03:38", "topic": "Attraction & Aversion"},
    {"start": "00:03:38", "end": "00:06:31", "topic": "Vagus Nerve, Sugar"},
    {"start": "00:06:31", "end": "00:08:54", "topic": "Gut Feelings, Hidden Sugars, Amino Acids"},
    {"start": "00:08:54", "end": "00:12:57", "topic": "Dopamine, Craving, L-tyrosine"},
    {"start": "00:12:57", "end": "00:16:12", "topic": "Serotonin, Carbohydrates"},
    {"start": "00:16:12", "end": "00:19:12", "topic": "Omega-3s, Depression, SSRIs"},
    {"start": "00:19:12", "end": "00:22:35", "topic": "Gut-Brain Axis, Gut Microbiome"},
    {"start": "00:22:35", "end": "00:25:39", "topic": "Probiotics, Brain Fog, Fermented Foods, Saccharine Caution"},
    {"start": "00:25:39", "end": "00:28:59", "topic": "Ketogenic Diet, Gut Microbiome, Diet Variability"},
    {"start": "00:28:59", "end": "00:32:00", "topic": "Belief Effects & Key Takeaways"},
]

# Store numeric boundaries (in seconds) next to the textual timestamps.
for segment in timestamp_segments_3:
    segment["start_sec"] = hms_to_seconds(segment["start"])
    segment["end_sec"] = hms_to_seconds(segment["end"])

# === Download Transcript ===
# On failure the error is printed and an empty transcript is used, so no
# chunks are produced below.
try:
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
except Exception as e:
    print(f" Error fetching transcript: {e}")
    transcript = []

# === Chunking ===
# A transcript line belongs to a segment when its "start" value falls inside
# the half-open interval [start_sec, end_sec).
chunked_output = []

for segment in timestamp_segments_3:
    chunk_lines = [
        line["text"] for line in transcript
        if segment["start_sec"] <= line["start"] < segment["end_sec"]
    ]
    full_text = " ".join(chunk_lines).strip()

    # Segments that matched no transcript text are left out of the output.
    if full_text:
        chunked_output.append({
            "video_id": video_id,
            "video_title": video_title,
            "topic": segment["topic"],
            "start_time": segment["start"],
            "end_time": segment["end"],
            "text": full_text
        })

# === Save JSON ===

output_path = os.path.abspath(os.path.join("..", "Chunks", f"{video_title.replace(' ', '_')}_{video_id}.json"))

# open() does not create missing directories, so make sure the Chunks folder
# exists before writing (matters on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(chunked_output, f, indent=2, ensure_ascii=False)

print(f" Segmented chunks saved to: {output_path}")


 Segmented chunks saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\Chunks\Foods_Control_Our_Moods_Q4qWzbP0q7I.json


Fourth video: 4 Small Habits 

In [6]:
# Helper Functions ===

def extract_video_id(url):
    """Pull the 11-character video ID that follows ``v=`` in a YouTube URL.

    Returns the ID string, or None when the URL has no such parameter.
    """
    found = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
    if not found:
        return None
    return found.group(1)

def hms_to_seconds(hms: str) -> float:
    """Turn a ``MM:SS`` or ``HH:MM:SS`` timestamp into a number of seconds.

    Raises ValueError when a field is not an integer or when the timestamp
    does not consist of exactly two or three colon-separated fields.
    """
    fields = [int(piece) for piece in hms.split(':')]
    if len(fields) not in (2, 3):
        raise ValueError(f"Invalid time format: {hms}")

    # Fold left to right: every further field is worth 1/60 of the previous.
    total = 0
    for value in fields:
        total = total * 60 + value
    return total
    
# Video Info and Timestamp Segments

video = {
    "url": "https://www.youtube.com/watch?v=mMHNvy9pFj0",
    "Name": "4 Small Habits"
}
video_id = extract_video_id(video["url"])
video_title = video["Name"]

# Timestamp segments for "4 Small Habits"; each entry becomes at most one
# chunk. Both MM:SS and H:MM:SS spellings appear in this list.
timestamp_segments_4 = [
    {"start": "00:00", "end": "03:49", "topic": "Introduction"},
    {"start": "03:49", "end": "14:54", "topic": "Dr. Chatterjee’s Four Pillars of Health"},
    {"start": "14:54", "end": "42:09", "topic": "The First Pillar: Food"},
    {"start": "42:09", "end": "56:53", "topic": "The Second Pillar: Movement"},
    {"start": "56:53", "end": "1:05:55", "topic": "The Third Pillar: Sleep"},
    {"start": "1:05:55", "end": "01:19:16", "topic": "The Fourth Pillar: Relaxation"},
    {"start": "01:19:16", "end": "01:30:00", "topic": "Managing Stress as a Caregiver"},
]

# Store numeric boundaries (in seconds) next to the textual timestamps.
for segment in timestamp_segments_4:
    segment["start_sec"] = hms_to_seconds(segment["start"])
    segment["end_sec"] = hms_to_seconds(segment["end"])

# === Download Transcript ===
# On failure the error is printed and an empty transcript is used, so no
# chunks are produced below.
try:
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
except Exception as e:
    print(f" Error fetching transcript: {e}")
    transcript = []

# === Chunking ===
# A transcript line belongs to a segment when its "start" value falls inside
# the half-open interval [start_sec, end_sec).
chunked_output = []

for segment in timestamp_segments_4:
    chunk_lines = [
        line["text"] for line in transcript
        if segment["start_sec"] <= line["start"] < segment["end_sec"]
    ]
    full_text = " ".join(chunk_lines).strip()

    # Segments that matched no transcript text are left out of the output.
    if full_text:
        chunked_output.append({
            "video_id": video_id,
            "video_title": video_title,
            "topic": segment["topic"],
            "start_time": segment["start"],
            "end_time": segment["end"],
            "text": full_text
        })

# === Save JSON ===

output_path = os.path.abspath(os.path.join("..", "Chunks", f"{video_title.replace(' ', '_')}_{video_id}.json"))

# open() does not create missing directories, so make sure the Chunks folder
# exists before writing (matters on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(chunked_output, f, indent=2, ensure_ascii=False)

print(f" Segmented chunks saved to: {output_path}")

 Segmented chunks saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\Chunks\4_Small_Habits_mMHNvy9pFj0.json



Fifth video: Young Forever


In [7]:
# Helper Functions ===

def extract_video_id(url):
    """Pull the 11-character video ID that follows ``v=`` in a YouTube URL.

    Returns the ID string, or None when the URL has no such parameter.
    """
    found = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
    if not found:
        return None
    return found.group(1)

def hms_to_seconds(hms: str) -> float:
    """Turn a ``MM:SS`` or ``HH:MM:SS`` timestamp into a number of seconds.

    Raises ValueError when a field is not an integer or when the timestamp
    does not consist of exactly two or three colon-separated fields.
    """
    fields = [int(piece) for piece in hms.split(':')]
    if len(fields) not in (2, 3):
        raise ValueError(f"Invalid time format: {hms}")

    # Fold left to right: every further field is worth 1/60 of the previous.
    total = 0
    for value in fields:
        total = total * 60 + value
    return total
    
# Video Info and Timestamp Segments

video = {
    "url": "https://www.youtube.com/watch?v=gO_x3gnXBzg",
    "Name": "Young Forever"
}
video_id = extract_video_id(video["url"])
video_title = video["Name"]

# Timestamp segments for "Young Forever"; each entry becomes at most one
# chunk. Both MM:SS and H:MM:SS spellings appear in this list.
timestamp_segments_5 = [
    {"start": "00:00", "end": "05:21", "topic": "Welcome"},
    {"start": "05:21", "end": "13:34", "topic": "Changing the Conversation About Aging"},
    {"start": "13:34", "end": "24:05", "topic": "Dr. Vonda’s Journey From Cancer Nurse to Orthopedic Surgeon"},
    {"start": "24:05", "end": "34:41", "topic": "The Incredible Power of Mobility on Your Health"},
    {"start": "34:41", "end": "39:54", "topic": "How You Age Is In Your Control"},
    {"start": "39:54", "end": "48:35", "topic": "Investing in Your Future Mobility"},
    {"start": "48:35", "end": "1:03:59", "topic": "How to Start Your Fitness Journey: The FACE Acronym for Midlife Exercise"},
    {"start": "1:03:59", "end": "1:08:00", "topic": "Debunking Myths About Joint Health"},
    {"start": "1:08:00", "end": "1:20:00", "topic": "Addressing Arthritis Holistically"}
]

# Store numeric boundaries (in seconds) next to the textual timestamps.
for segment in timestamp_segments_5:
    segment["start_sec"] = hms_to_seconds(segment["start"])
    segment["end_sec"] = hms_to_seconds(segment["end"])

# === Download Transcript ===
# On failure the error is printed and an empty transcript is used, so no
# chunks are produced below.
try:
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
except Exception as e:
    print(f" Error fetching transcript: {e}")
    transcript = []

# === Chunking ===
# A transcript line belongs to a segment when its "start" value falls inside
# the half-open interval [start_sec, end_sec).
chunked_output = []

for segment in timestamp_segments_5:
    chunk_lines = [
        line["text"] for line in transcript
        if segment["start_sec"] <= line["start"] < segment["end_sec"]
    ]
    full_text = " ".join(chunk_lines).strip()

    # Segments that matched no transcript text are left out of the output.
    if full_text:
        chunked_output.append({
            "video_id": video_id,
            "video_title": video_title,
            "topic": segment["topic"],
            "start_time": segment["start"],
            "end_time": segment["end"],
            "text": full_text
        })

# === Save JSON ===

output_path = os.path.abspath(os.path.join("..", "Chunks", f"{video_title.replace(' ', '_')}_{video_id}.json"))

# open() does not create missing directories, so make sure the Chunks folder
# exists before writing (matters on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(chunked_output, f, indent=2, ensure_ascii=False)

print(f" Segmented chunks saved to: {output_path}")


 Segmented chunks saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\Chunks\Young_Forever_gO_x3gnXBzg.json


Sixth video: Powerful Happiness Hacks

In [8]:
# Helper Functions ===

def extract_video_id(url):
    """Pull the 11-character video ID that follows ``v=`` in a YouTube URL.

    Returns the ID string, or None when the URL has no such parameter.
    """
    found = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
    if not found:
        return None
    return found.group(1)

def hms_to_seconds(hms: str) -> float:
    """Turn a ``MM:SS`` or ``HH:MM:SS`` timestamp into a number of seconds.

    Raises ValueError when a field is not an integer or when the timestamp
    does not consist of exactly two or three colon-separated fields.
    """
    fields = [int(piece) for piece in hms.split(':')]
    if len(fields) not in (2, 3):
        raise ValueError(f"Invalid time format: {hms}")

    # Fold left to right: every further field is worth 1/60 of the previous.
    total = 0
    for value in fields:
        total = total * 60 + value
    return total
    
# Video Info and Timestamp Segments

video = {
    "url": "https://www.youtube.com/watch?v=14-DJFPm1_4",
    "Name": "Powerful Happiness Hacks"
}
video_id = extract_video_id(video["url"])
video_title = video["Name"]

# Timestamp segments for "Powerful Happiness Hacks"; each entry becomes at
# most one chunk. Both MM:SS and H:MM:SS spellings appear in this list.
timestamp_segments_6 = [
    {"start": "00:00", "end": "06:55", "topic": "Introduction"},
    {"start": "06:55", "end": "14:21", "topic": "Yale’s Course on Happiness"},
    {"start": "14:21", "end": "18:29", "topic": "Common Misconceptions About Happiness"},
    {"start": "18:29", "end": "24:00", "topic": "Survival vs Thriving – Rewiring Happiness"},
    {"start": "24:00", "end": "25:46", "topic": "Why Changing Circumstances Isn’t Enough"},
    {"start": "25:46", "end": "34:01", "topic": "Money & Happiness"},
    {"start": "34:01", "end": "40:08", "topic": "Spending Free Time to Feel Better"},
    {"start": "40:08", "end": "44:06", "topic": "Slowing Down & Kindness"},
    {"start": "44:06", "end": "49:01", "topic": "Happiness for Introverts & Extroverts"},
    {"start": "49:01", "end": "54:37", "topic": "Helping Others & Joy"},
    {"start": "54:37", "end": "1:08:48", "topic": "Finding Joy in Hard Moments"},
    {"start": "1:08:48", "end": "1:10:49", "topic": "Everyday Habits for Instant Happiness"},
    {"start": "1:10:49", "end": "1:16:44", "topic": "Mastering Self-Compassion"},
    {"start": "1:16:44", "end": "1:20:08", "topic": "Happiness & a Better World"},
    {"start": "1:20:08", "end": "1:30:00", "topic": "Happiness Homework & Final Thoughts"}
]

# Store numeric boundaries (in seconds) next to the textual timestamps.
for segment in timestamp_segments_6:
    segment["start_sec"] = hms_to_seconds(segment["start"])
    segment["end_sec"] = hms_to_seconds(segment["end"])

# === Download Transcript ===
# On failure the error is printed and an empty transcript is used, so no
# chunks are produced below.
try:
    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
except Exception as e:
    print(f" Error fetching transcript: {e}")
    transcript = []

# === Chunking ===
# A transcript line belongs to a segment when its "start" value falls inside
# the half-open interval [start_sec, end_sec).
chunked_output = []

for segment in timestamp_segments_6:
    chunk_lines = [
        line["text"] for line in transcript
        if segment["start_sec"] <= line["start"] < segment["end_sec"]
    ]
    full_text = " ".join(chunk_lines).strip()

    # Segments that matched no transcript text are left out of the output.
    if full_text:
        chunked_output.append({
            "video_id": video_id,
            "video_title": video_title,
            "topic": segment["topic"],
            "start_time": segment["start"],
            "end_time": segment["end"],
            "text": full_text
        })

# === Save JSON ===

output_path = os.path.abspath(os.path.join("..", "Chunks", f"{video_title.replace(' ', '_')}_{video_id}.json"))

# open() does not create missing directories, so make sure the Chunks folder
# exists before writing (matters on a fresh checkout).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(chunked_output, f, indent=2, ensure_ascii=False)

print(f" Segmented chunks saved to: {output_path}")


 Segmented chunks saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\Chunks\Powerful_Happiness_Hacks_14-DJFPm1_4.json


##### We enrich the chunks with OpenAI
Send each chunk to GPT to assign more semantic tags.

In [14]:
# Pull settings from the nearest .env file, then read the OpenAI key.
load_dotenv(find_dotenv())
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

# Client used by tag_chunk for every chat-completion request.
client = OpenAI(api_key=OPENAI_API_KEY)

# Closed vocabulary of tags that is passed to the model in the prompt.
TAXONOMY = [
    "sleep",
    "nutrition",
    "gut health",
    "mental health",
    "longevity",
    "inflammation",
    "movement",
    "relaxation",
    "aging",
    "happiness",
    "supplements",
    "diet",
    "stress",
    "preventive care",
    "food",
    "exercise",
    "health",
    "mindfulness",
    "emotional health",
    "Circadian Rhythms",
    "Ultra Processed Foods",
    "cancer",
]

# Input and output folders are siblings of the current working directory's
# parent: <parent>/Chunks is read, <parent>/TaggedChunks is written.
base_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
CHUNKS_FOLDER = os.path.join(base_dir, "Chunks")
OUTPUT_FOLDER = os.path.join(base_dir, "TaggedChunks")
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Function to get tags from GPT using new API
def tag_chunk(text, taxonomy):
    """Ask the chat model for the 1-3 taxonomy tags that best describe ``text``.

    Args:
        text: The chunk text to classify.
        taxonomy: The allowed tags; interpolated into the prompt as-is.

    Returns:
        A list of tag strings parsed from the model reply. An empty list is
        returned (and the error printed) when the request fails or the reply
        is not a plain list literal.
    """
    prompt = f"""
You are a health and wellness expert with deep knowledge in nutrition, mental and physical health, and healthy aging. 
You specialize in helping people improve their sleep, nutrition, and overall well-being.
From the following text, extract only 1–3 most relevant topics from this list:

{taxonomy}

Text:
\"\"\"
{text}
\"\"\"

Respond only with a Python list of keywords, like: ["sleep", "nutrition"]
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a professional health tagger for AI content."},
                {"role": "user", "content": prompt}
            ]
        )
        raw_reply = response.choices[0].message.content

        # The reply is untrusted text: literal_eval only accepts Python
        # literals, so it can never execute code the way eval() would.
        parsed = ast.literal_eval(raw_reply.strip())
        if not isinstance(parsed, (list, tuple)):
            raise ValueError(f"expected a list of tags, got: {raw_reply!r}")
        return [str(tag) for tag in parsed]
    except Exception as e:
        print(" Error tagging chunk:", e)
        return []

# Tag every chunk file found in CHUNKS_FOLDER and write the result, with a
# "_tagged" suffix, into OUTPUT_FOLDER.
for filename in os.listdir(CHUNKS_FOLDER):
    if not filename.endswith(".json"):
        continue  # anything that is not a chunk JSON file is ignored

    print(f" Processing {filename}")
    path = os.path.join(CHUNKS_FOLDER, filename)
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Attach the model-assigned tags to each chunk in place.
    for chunk in data:
        tags = tag_chunk(chunk["text"], TAXONOMY)
        chunk["tags"] = tags

    # filename[:-5] drops the trailing ".json" before the suffix is added.
    output_path = os.path.join(OUTPUT_FOLDER, f"{filename[:-5]}_tagged.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

    print(f" Tagged and saved to: {output_path}")



 Processing 4_Small_Habits_mMHNvy9pFj0.json
 Tagged and saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\TaggedChunks\4_Small_Habits_mMHNvy9pFj0_tagged.json
 Processing Foods_Control_Our_Moods_Q4qWzbP0q7I.json
 Tagged and saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\TaggedChunks\Foods_Control_Our_Moods_Q4qWzbP0q7I_tagged.json
 Processing Master_Your_Sleep_lIo9FcrljDk.json
 Tagged and saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\TaggedChunks\Master_Your_Sleep_lIo9FcrljDk_tagged.json
 Processing Powerful_Happiness_Hacks_14-DJFPm1_4.json
 Tagged and saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\TaggedChunks\Powerful_Happiness_Hacks_14-DJFPm1_4_tagged.json
 Processing Young_Forever_gO_x3gnXBzg.json
 Tagged and saved to: c:\Users\alvar\OneDrive\文件\Iron Hack\ALMA-Chatbot\TaggedChunks\Young_Forever_gO_x3gnXBzg_tagged.json
 Processing Your_Diet_is_Changing_Your_Brain_NbymuYEEqlE.json
 Tagged and saved to: c:\Users\alvar\OneDrive\文件\Iron Ha

##### Now we embed these chunks and push them into a vector DB

Embedding + vector storage phase next (using Pinecone)

In [2]:
# Load environment variables
load_dotenv(find_dotenv())
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "alma-index"

# Connect to Pinecone index; the index must already exist.
if index_name not in pc.list_indexes().names():
    raise ValueError(f" Index '{index_name}' not found.")
index = pc.Index(index_name)

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)

# Folder where tagged chunks are saved
TAGGED_FOLDER = os.path.abspath(os.path.join(os.getcwd(), "..", "TaggedChunks"))

# Loop through tagged JSONs and add to Pinecone
for filename in os.listdir(TAGGED_FOLDER):
    if filename.endswith(".json"):
        print(f" Indexing {filename}")
        filepath = os.path.join(TAGGED_FOLDER, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            chunks = json.load(f)

        docs = []
        doc_ids = []
        for position, chunk in enumerate(chunks):
            metadata = {
                "topic": chunk.get("topic"),
                # Tags are stored as one comma-separated string.
                "tags": ", ".join(chunk.get("tags", [])),
                "video_title": chunk.get("video_title"),
                "video_id": chunk.get("video_id"),
                "start_time": chunk.get("start_time"),
                "end_time": chunk.get("end_time")
            }
            docs.append(Document(page_content=chunk["text"], metadata=metadata))
            # Deterministic ID (video id, or file name as fallback, plus the
            # chunk position) so that re-running this cell overwrites the
            # same vectors instead of inserting duplicates under new IDs.
            doc_ids.append(f"{chunk.get('video_id') or filename}-{position:04d}")

        # Nothing to embed for a file that holds no chunks.
        if not docs:
            print(f" No chunks found in {filename}; skipping.")
            continue

        # Use the updated PineconeVectorStore (new SDK-compatible)
        vectorstore = PineconeVectorStore.from_documents(
            docs, embedding=embeddings, index_name=index_name, ids=doc_ids
        )

print("- All documents successfully indexed to Pinecone!")


  embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)


 Indexing 4_Small_Habits_mMHNvy9pFj0_tagged.json
 Indexing Foods_Control_Our_Moods_Q4qWzbP0q7I_tagged.json
 Indexing Master_Your_Sleep_lIo9FcrljDk_tagged.json
 Indexing Powerful_Happiness_Hacks_14-DJFPm1_4_tagged.json
 Indexing Young_Forever_gO_x3gnXBzg_tagged.json
 Indexing Your_Diet_is_Changing_Your_Brain_NbymuYEEqlE_tagged.json
- All documents successfully indexed to Pinecone!
