In [None]:
from google.colab import drive
# Mount Google Drive so the dataset paths under /content/drive/MyDrive resolve.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
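# Kept for manual use only: uncommenting these two lines kills the current
# kernel process with signal 9 (e.g. to force a runtime restart after installs).
# import os
# os.kill(os.getpid(), 9)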

In [None]:
pip install pymongo

In [None]:
!pip install -q transformers qdrant-client pillow open-clip-torch

import open_clip
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance


import os
from getpass import getpass

# Connection settings for the Qdrant Cloud cluster.
QDRANT_URL = "https://0ed1052e-7f23-462b-ab8a-70aaa05c675f.us-west-2-0.aws.cloud.qdrant.io"

# Never keep the API key in the notebook: take it from the environment and
# fall back to an interactive (non-echoing) prompt.
# NOTE(review): the key that was previously hardcoded here has been exposed to
# everyone with access to this notebook and should be rotated.
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") or getpass("Qdrant API key: ")
COLLECTION_NAME = "Video_RAG"


# Client used by the rest of this cell to manage the collection.
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)


# Load the OpenAI-pretrained CLIP ViT-B/32 weights together with the matching
# image preprocessing pipeline and text tokenizer.
CLIP_ARCH = "ViT-B-32"
model, _, preprocess = open_clip.create_model_and_transforms(CLIP_ARCH, pretrained="openai")
tokenizer = open_clip.get_tokenizer(CLIP_ARCH)

# Move to the GPU and switch to inference mode.
model = model.cuda()
model = model.eval()

# Width of the embedding space, read from the text projection matrix
# (512 for this checkpoint); used as the vector size of the collection.
DIM = model.text_projection.shape[1]


# Start from an empty collection. WARNING: this is destructive — every point
# already stored under COLLECTION_NAME is deleted each time this cell runs.
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)


# The collection has just been removed (or never existed), so a plain
# create_collection is sufficient; the deprecated recreate_collection would
# only repeat the delete step above.
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        # Two named vectors per point, both in the CLIP embedding space.
        "image": VectorParams(size=DIM, distance=Distance.COSINE),
        "text": VectorParams(size=DIM, distance=Distance.COSINE),
    },
)



In [None]:
import json
import os
import torch
import open_clip
import numpy as np
from tqdm import tqdm
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, PointStruct, Batch
from qdrant_client.http.models import Filter, FieldCondition, MatchValue


from getpass import getpass

# Connection settings for the Qdrant Cloud cluster.
QDRANT_URL = "https://0ed1052e-7f23-462b-ab8a-70aaa05c675f.us-west-2-0.aws.cloud.qdrant.io"

# Never keep the API key in the notebook: take it from the environment and
# fall back to an interactive (non-echoing) prompt.
# NOTE(review): the key that was previously hardcoded here has been exposed to
# everyone with access to this notebook and should be rotated.
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY") or getpass("Qdrant API key: ")
COLLECTION_NAME = "Video_RAG"


# Client used by the upload loop below.
qdrant = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY
)


# Load the OpenAI-pretrained CLIP ViT-B/32 weights together with the matching
# image preprocessing pipeline and text tokenizer.
CLIP_ARCH = "ViT-B-32"
model, _, preprocess = open_clip.create_model_and_transforms(CLIP_ARCH, pretrained="openai")
tokenizer = open_clip.get_tokenizer(CLIP_ARCH)

# Move to the GPU and switch to inference mode.
model = model.cuda()
model = model.eval()


# Create the collection only when it is missing so that points uploaded by an
# earlier run are preserved. create_collection replaces the deprecated
# recreate_collection, whose delete step is never wanted in this branch.
if not qdrant.collection_exists(COLLECTION_NAME):
    qdrant.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={
            # 512 = embedding width of the CLIP ViT-B-32 model loaded above.
            "text": VectorParams(size=512, distance=Distance.COSINE),
        },
    )


# Input: the summarized segments. This is the same path the summarization cell
# further down in this notebook writes as its OUTPUT_PATH, so that cell must
# have been run at least once before this one.
JSONL_PATH = "/content/drive/MyDrive/whole_merged_subtitles_summarized.jsonl"
IMAGE_ROOT = "/content/drive/MyDrive/img_data"

# Number of points sent to Qdrant per upsert call.
UPLOAD_BATCH_SIZE = 100


points = []
with open(JSONL_PATH, "r", encoding="utf-8") as f:
    for i, line in enumerate(tqdm(f, desc="Encoding & uploading")):
        # Step 1: parse and encode one record. Parsing is inside the try so a
        # blank or malformed line is reported and skipped instead of aborting
        # the whole upload.
        try:
            obj = json.loads(line)
            # NOTE(review): CLIP text encoders have a fixed context length;
            # long summaries are presumably truncated by the tokenizer —
            # confirm this is acceptable for retrieval quality.
            text_tokens = tokenizer([obj["text"]]).cuda()

            with torch.no_grad():
                # The input is a batch of one, so squeeze() drops the batch axis.
                text_vec = model.encode_text(text_tokens).squeeze().cpu().numpy()

            point = PointStruct(
                id=i,  # point id = index of the line in the JSONL file
                vector={"text": text_vec.tolist()},
                payload={
                    "video": obj["video"],
                    "start": obj["start"],
                    "end": obj["end"],
                    "timestamp": obj["timestamp"],
                    "text": obj["text"],
                    "image_path": obj["image_path"]
                }
            )
        except Exception as e:
            print(f" Error at line {i}: {e}")
            continue

        points.append(point)

        # Step 2: upload a full batch. Upload failures are reported as such
        # (not as a per-line error) and the batch is kept, so it is retried
        # together with the next record instead of being lost.
        if len(points) >= UPLOAD_BATCH_SIZE:
            try:
                qdrant.upsert(collection_name=COLLECTION_NAME, points=points)
            except Exception as e:
                print(f" Upload failed at line {i} ({len(points)} points pending, will retry): {e}")
            else:
                points = []


# Upload whatever is left over; a failure here is raised so that an incomplete
# upload cannot go unnoticed.
if points:
    qdrant.upsert(collection_name=COLLECTION_NAME, points=points)

print("finished")

In [None]:
import os
from getpass import getpass

# Only ask for the key when it is not already configured: assigning an empty
# string unconditionally would wipe a key that was set outside the notebook.
# getpass keeps the key out of the notebook source and the cell output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
import os
import json
import time
import base64
from tqdm import tqdm
from openai import OpenAI, RateLimitError, APIError, APIConnectionError, InternalServerError

# Validate the key before building the client, so that both a missing and an
# empty OPENAI_API_KEY produce the same clear error.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    raise ValueError(
        "OPENAI_API_KEY is not set; export it or set os.environ['OPENAI_API_KEY'] "
        "before running this cell."
    )
client = OpenAI(api_key=_openai_api_key)


JSONL_PATH = "/content/drive/MyDrive/whole_merged_subtitles.jsonl"
IMAGE_ROOT = "/content/drive/MyDrive/img_data"
OUTPUT_PATH = "/content/drive/MyDrive/whole_merged_subtitles_summarized.jsonl"

# Retry policy for the OpenAI calls.
MAX_ATTEMPTS = 3
RATE_LIMIT_WAIT_S = 15
API_ERROR_WAIT_S = 10


def _image_data_uri(path):
    """Return the file at `path` as a base64 `data:` URI.

    The MIME type is guessed from the file extension and falls back to
    image/jpeg when the extension is unknown or is not an image type.
    """
    import mimetypes

    mime, _encoding = mimetypes.guess_type(path)
    if not mime or not mime.startswith("image/"):
        mime = "image/jpeg"
    with open(path, "rb") as f:
        image_base64 = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime};base64,{image_base64}"


def _summarize(entry, image_data_uri):
    """Ask the model for a one-paragraph summary of one segment.

    Rate-limit and API errors are retried up to MAX_ATTEMPTS times in total;
    any other exception ends the attempts immediately.

    Returns the stripped summary text, or None when no summary was obtained
    (the reason is printed).
    """
    prompt_text = (
        f"This is a screenshot from an AI course video.\n"
        f"Transcript:\n{entry['text']}\n\n"
        f"Please summarize this segment by picture and transcript in one short English paragraph."
    )

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that summarizes educational video segments."},
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt_text},
                            {"type": "image_url", "image_url": {"url": image_data_uri}}
                        ]
                    }
                ],
                max_tokens=200,
                temperature=0.2,
            )
            return response.choices[0].message.content.strip()

        # RateLimitError must be handled before the broader APIError clause.
        except RateLimitError:
            print(f"Rate limit (429). Waiting... {entry['timestamp']}")
            wait_s = RATE_LIMIT_WAIT_S

        except (APIError, APIConnectionError, InternalServerError) as e:
            print(f"API error: {e} | {entry['timestamp']}, retrying...")
            wait_s = API_ERROR_WAIT_S

        except Exception as e:
            print(f" Fatal error at {entry['timestamp']}: {e}")
            return None

        # No point in sleeping when there is no attempt left.
        if attempt < MAX_ATTEMPTS:
            time.sleep(wait_s)

    # Make exhausted retries visible instead of silently dropping the segment.
    print(f" Giving up on {entry['timestamp']} after {MAX_ATTEMPTS} attempts")
    return None


written = 0
skipped = 0

# NOTE: the output file is opened in 'w' mode, so every run starts from an
# empty file and re-summarizes all segments.
with open(JSONL_PATH, 'r', encoding='utf-8') as infile, \
     open(OUTPUT_PATH, 'w', encoding='utf-8') as outfile:

    for line in tqdm(infile, desc="Processing segments"):
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue

        entry = json.loads(line)
        # Only the file name of the recorded path is used; images are looked
        # up under IMAGE_ROOT.
        image_path = os.path.join(IMAGE_ROOT, os.path.basename(entry["image_path"]))

        if not os.path.exists(image_path):
            print(f" {image_path}，jumped")
            skipped += 1
            continue

        summary = _summarize(entry, _image_data_uri(image_path))
        if summary is None:
            skipped += 1
            continue

        # The transcript in "text" is replaced by the generated summary.
        entry["text"] = summary
        outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
        # Flush after every record so finished summaries are not left in the
        # write buffer if the runtime is interrupted.
        outfile.flush()
        written += 1

print(OUTPUT_PATH)
print(f"{written} segments written, {skipped} skipped")