In [2]:
from flask import Flask, render_template, request, jsonify
import threading, uuid
from datetime import datetime
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.chains.summarize import load_summarize_chain
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_openai import OpenAI
from langchain.chains import RetrievalQA
import re
import random, time
from youtube_transcript_api import YouTubeTranscriptApi
from langchain_community.document_loaders import TextLoader
from langchain.docstore.document import Document
from langchain_openai import ChatOpenAI
import os

In [3]:
def get_video_id_from_url(url):
    """Extract the YouTube video ID from a watch URL or a youtu.be short link.

    Args:
        url: Video URL, e.g. ``https://www.youtube.com/watch?v=<id>`` or
            ``https://youtu.be/<id>``.

    Returns:
        The video ID string, or ``None`` when ``url`` is empty, is not a
        string, or contains neither a ``v=`` parameter nor a ``be/`` segment.
    """
    if not isinstance(url, str) or not url:
        return None
    # Long form: the ``v`` query parameter. The negative lookbehind keeps
    # parameters that merely end in "v" (e.g. ``av=``) from matching.
    match = re.search(r'(?<!\w)v=([^&#]+)', url)
    if match is None:
        # Short form: the ID is the path segment after "be/". Stop at "?" and
        # "/" too, so share links such as ``youtu.be/<id>?t=10`` do not leak
        # their query string into the ID.
        match = re.search(r'be/([^?&#/]+)', url)
    return match.group(1) if match else None

def save_transcript_to_file(video_url, output_file, session_id):
    """Fetch a video's transcript, write its text to a file and index timestamps.

    The caption texts are written to ``output_file`` separated by spaces, and
    ``transcripts_with_timestamps[session_id]`` is replaced with a mapping of
    each caption text to the ``start`` value reported by the transcript API.

    Args:
        video_url: YouTube URL understood by ``get_video_id_from_url``.
        output_file: Path of the UTF-8 text file to create or overwrite.
        session_id: Key under which the timestamp mapping is stored.

    Returns:
        ``True`` when the transcript was saved; ``False`` when the URL has no
        video ID or the transcript could not be fetched (a message is printed
        and nothing is written in that case).
    """
    video_id = get_video_id_from_url(video_url)
    if video_id is None:
        print("Invalid YouTube URL")
        return False

    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

    # Create the target directory on demand so a fresh checkout without the
    # output folder does not fail with FileNotFoundError.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    local_timestamps = transcripts_with_timestamps[session_id] = {}
    with open(output_file, "w", encoding="utf-8") as f:
        for entry in transcript:
            f.write(entry["text"] + " ")
            # NOTE: captions with identical text share one key, so the last
            # occurrence's start time wins.
            local_timestamps[entry['text']] = entry['start']
    return True

def get_timestamp(session_id, sentence):
    """Return the start time of the first caption that contains ``sentence``.

    Args:
        session_id: Key into ``transcripts_with_timestamps``.
        sentence: Text fragment to look for (plain substring match).

    Returns:
        The matching caption's start value truncated to an ``int``, or
        ``None`` when the session has no stored transcript, ``sentence`` is
        empty, or no caption contains it.
    """
    if not sentence:
        # An empty string is a substring of every caption and would otherwise
        # always "match" the first one.
        return None
    # An unknown session yields an empty mapping instead of raising KeyError.
    session_timestamps = transcripts_with_timestamps.get(session_id, {})
    for transcript_sentence, timestamp in session_timestamps.items():
        if sentence in transcript_sentence:
            return int(timestamp)
    return None

In [20]:
def save_transcript():
    """Download a video's transcript and build a retrieval-QA chain for it.

    Reads ``video_url`` from the submitted form, stores the transcript under
    ``transcripts/``, embeds its chunks into a Chroma store and registers the
    resulting ``RetrievalQA`` chain in ``indexes`` under a new session ID.
    The video URL (cut at the first ``&``) is recorded in ``video_urls``.

    Returns:
        A JSON response carrying the new ``session_id`` on success, or a
        ``(message, 400)`` tuple when no usable transcript was obtained.
    """
    start_time = time.time()  # Start the timer
    session_id = str(uuid.uuid4())  # Generate a unique ID for this session
    video_url = request.form['video_url']

    # The session ID keeps the file name unique even when two requests arrive
    # within the same second.
    os.makedirs("transcripts", exist_ok=True)
    transcript_file = f"transcripts/{datetime.now().strftime('%Y%m%d%H%M%S')}_{session_id}.txt"
    save_transcript_to_file(video_url, transcript_file, session_id)
    # A missing file means the URL was invalid or the transcript fetch failed.
    if not os.path.isfile(transcript_file):
        return "Could not load a transcript for this video", 400
    print(f"Save transcript took {time.time() - start_time} seconds to execute.")
    start_time = time.time()  # Start the timer

    # The transcript is written as UTF-8, so read it back with that encoding
    # instead of the platform default.
    loader = TextLoader(transcript_file, encoding="utf-8")
    documents = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)
    print(f"Split QA texts took {time.time() - start_time} seconds to execute.")
    print(len(texts))
    if not texts:
        # Nothing to embed; building a vector store from no documents fails.
        return "Transcript is empty", 400
    start_time = time.time()  # Start the timer

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    docsearch = Chroma.from_documents(texts, embeddings)
    qa = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
        chain_type="stuff",
        retriever=docsearch.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={'k': 5, 'fetch_k': 50, 'score_threshold': 0.7},
        ),
        return_source_documents=True,
    )

    # Register the session only once everything it needs exists.
    indexes[session_id] = qa
    video_urls[session_id] = video_url.split("&")[0]
    print(f"QA Embedding took {time.time() - start_time} seconds to execute.")

    # The client has to send this ID back with every query.
    return jsonify({'session_id': session_id})

In [21]:
def query():
    """Answer a question about a previously loaded transcript.

    Expects the form fields ``session_id`` and ``query``.

    Returns:
        A JSON response with ``result`` (the chain's answer) and
        ``clip_links`` (the session's video URL with a ``t=<seconds>s``
        offset for every source chunk whose opening text was found in the
        transcript), or ``("No transcript loaded", 400)`` when the session ID
        has no registered chain.
    """
    start_time = time.time()  # Start the timer

    session_id = request.form['session_id']  # The client must send the session ID with each request
    if session_id not in indexes:
        return "No transcript loaded", 400

    user_query = request.form['query']
    # ``invoke`` replaces calling the chain object directly, which LangChain
    # deprecated in 0.1; the returned dict has the same keys.
    output = indexes[session_id].invoke({"query": user_query})
    result = output["result"]
    print(f"Answering took {time.time() - start_time} seconds to execute.")

    base_url = video_urls[session_id]
    # Short links (youtu.be/<id>) carry no query string, so the time offset
    # must start one with "?" rather than extend it with "&".
    separator = "&" if "?" in base_url else "?"
    clip_links = []
    for doc in output["source_documents"]:
        # Locate the chunk through its first 20 characters.
        timestamp = get_timestamp(session_id, doc.page_content[:20])
        if timestamp is not None:
            clip_links.append(f"{base_url}{separator}t={timestamp}s")

    return jsonify({'result': result, 'clip_links': clip_links})

In [4]:
# Generate a random session ID for this notebook run; later cells use it to
# name the per-session transcript directory.
session_id = str(uuid.uuid4())
session_id

'7bde9362-4aed-40ad-a505-8410f8d3449a'

In [5]:
# Sample videos used by the following cells.
video_URLs = ["https://www.youtube.com/watch?v=dtp6b76pMak","https://www.youtube.com/watch?v=XxOh12Uhg08"]
video_IDs = []
for url in video_URLs:
    # Named ``video_id`` rather than ``id`` so the built-in ``id()`` is not
    # shadowed for every later cell in the kernel.
    video_id = get_video_id_from_url(url)
    video_IDs.append(video_id)
    print(video_id)

dtp6b76pMak
XxOh12Uhg08


In [8]:
# Fetch one transcript per video ID (one API call each). The comprehension
# variable stays local, so the built-in ``id()`` is not shadowed.
transcripts = [YouTubeTranscriptApi.get_transcript(video_id) for video_id in video_IDs]
print(len(transcripts))

2


In [9]:
# Timestamp lookup: {video_id: {caption_text: start_timestamp}}
local_timestamps = {}

# Create the per-session transcript directory; ``exist_ok`` makes re-running
# the cell safe without a separate existence check.
directory = f"transcripts/{session_id}"
os.makedirs(directory, exist_ok=True)

# Write each transcript to its own text file and record caption start times.
for transcript, video_id in zip(transcripts, video_IDs):
    local_timestamps[video_id] = {}
    with open(f"{directory}/{video_id}.txt", "w", encoding="utf-8") as f:
        for entry in transcript:
            f.write(entry["text"] + " ")
            local_timestamps[video_id][entry["text"]] = entry['start']

In [10]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=0)

# Load every per-video transcript file from the session directory.
documents = []
for video_id in video_IDs:
    file_path = f"{directory}/{video_id}.txt"
    # The transcript files are written as UTF-8; read them back the same way
    # instead of relying on the platform default encoding.
    loader = TextLoader(file_path, encoding="utf-8")
    documents.extend(loader.load())

# Split the documents into smaller chunks
texts = text_splitter.split_documents(documents)

In [16]:
# Manual test: build ``documents`` and ``texts`` from two local interview
# transcripts instead of the downloaded YouTube files.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=3000)

documents = []
for file_path in ("interview1.txt", "interview3.txt"):
    loader = TextLoader(file_path)
    documents += loader.load()

# Break the loaded documents into chunks of at most 3000 characters.
texts = text_splitter.split_documents(documents)


In [18]:
# Display the chunks produced by the splitter.
texts

[Document(page_content="Cool. And yeah, just the first question is, I'm curious to know, I know that you are UX researcher yourself and I'm wondering what is your process of doing research in your work? My process is I start a meeting, I record it on Zoom as well as kind of how we're doing today and then I usually go through, I organize all my files by participant, by date, I have video recording and then I also transcribe the audio file so I have it in text format as well to easily be able to review. And then from there I usually add any insights that I see either by re-listening to the recording or by viewing the transcription into a digital affinity map and I generally use Figma for that and that's how I kind of am able to organize my findings and put them into a report that I then deliver to stakeholders. Okay, got it. So you said that you will use another software for transcribing, right? Do you know, could you give us an example of what software that you use for transcribing? Rig

In [23]:
import os
from getpass import getpass

# Keep a key that is already configured in the environment; prompt only when
# none is set, so no secret is stored in the notebook and an existing key is
# never overwritten with an empty string.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Embed the chunks and build a retrieval-QA chain that also returns the
# source documents used for each answer.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma.from_documents(texts, embeddings)
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model="gpt-4-0125-preview", temperature=0),
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    return_source_documents=True,
)



In [29]:
# ``invoke`` replaces calling the chain object directly, which LangChain
# deprecated in 0.1; the returned dict has the same keys.
output = qa.invoke({"query":"give me three insights in this video"})



In [30]:
# Show the answer text returned by the chain.
output['result']


"Based on the provided context, here are three insights from the discussion about the technology and user experience:\n\n1. **Human Element in AI Insights**: The conversation highlights a common concern with AI-driven analysis tools, such as Envision, which automatically generate insights from video content and transcriptions. While these tools can efficiently identify and highlight key points, there's a recognition that they lack the nuanced understanding and interpretive depth that a human researcher can provide. This underscores the ongoing need for human oversight in qualitative research to ensure that insights are not only relevant but deeply connected to the research objectives.\n\n2. **Affinity Mapping for Deeper Analysis**: The mention of an affinity map feature as a desirable tool indicates the importance of organizing and categorizing insights based on themes. This approach allows researchers to delve deeper into the data, uncovering patterns and connections that might not be

In [31]:
# Show the source documents returned alongside the answer
# (the chain is built with return_source_documents=True).
output['source_documents']

[Document(page_content="someone was launching that's kind of similar where it has the video, it pulls out the insights and then you can also go through the script, the text and highlight additional things that you think are interesting because I mean, AI does its best, but it doesn't have the human touch, right? I'll tell you the name of it. It was called, I was just looking in, it's called Envision, but it's spelled, are you familiar with it? I have a card of it. Is it, is it an V.I? I just, I copy the name and chat for you. So maybe it might be helpful to check it out since you're still in discovery. Yeah. And it was nice. It was organized. It had the video, it had the transcription already ready to go, but I think that, that human element for me was still missing. So I still prefer to go through and do it myself, maybe because I don't trust the insights that it's pulling as much as I trust my own brain. But I did like that it would automatically, once you uploaded the video, it woul

'The videos are about using the Vision Pro.'

In [213]:
import json
from pathlib import Path
from pprint import pprint

def save_to_json(transcript,filename):
    """Serialize ``transcript`` to ``filename`` as JSON indented by four spaces.

    The file is created, or overwritten if it already exists.
    """
    with open(filename, mode='w') as out_file:
        json.dump(transcript, fp=out_file, indent=4)

def load_documents(filepath):
    """Load a JSON transcript file as LangChain documents.

    The file is expected to hold a JSON array of records (``jq_schema='.[]'``).
    Each record's ``text`` field becomes the document content, and its
    ``start`` value (plus ``end`` when present) is copied into the metadata.

    Args:
        filepath: Path to the JSON file.

    Returns:
        The list of documents produced by ``JSONLoader.load()``.
    """
    # Imported here because the notebook's import cell brings in only
    # TextLoader from this module; without it the call below is a NameError.
    from langchain_community.document_loaders import JSONLoader

    def metadata_func(record: dict, metadata: dict) -> dict:
        """Copy the timing fields of a record into the document metadata."""
        metadata["start"] = record.get("start")
        if "end" in record:
            metadata["end"] = record.get("end")
        return metadata

    loader = JSONLoader(
        file_path=filepath,
        jq_schema='.[]',
        text_content=True,
        content_key="text",
        metadata_func=metadata_func,
    )

    return loader.load()

# Fetch one video's transcript, persist it as JSON, then reload it as
# documents whose metadata carries each entry's start value.
transcript = YouTubeTranscriptApi.get_transcript("dtp6b76pMak")
save_to_json(transcript,"transcript.json")
documents = load_documents("transcript.json")
        

In [201]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma.from_documents(documents, embeddings)
# Retrieve from the store built on the line above; ``vectordb`` is not
# defined anywhere in this notebook.
qa = RetrievalQA.from_chain_type(llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0), chain_type="stuff", 
                                 retriever=vectorstore.as_retriever(
                                     # search_type="similarity_score_threshold",
                                     # search_kwargs={'k':5, 'fetch_k': 50, 'score_threshold': 0.7}
                                 ), 
                                 return_source_documents=True)



In [228]:
# Display the current value of ``transcript``.
transcript

"(upbeat music) - All right, so you've seen the unboxing. Now it's time for the breakdown. What is using the Apple\nVision Pro actually like? This is easily one of Apple's\ncraziest, most radical, possibly dystopian products of all time. And I have a lot of thoughts here, like I've been using it\nfor about a week now. There are some parts of this thing that are absolutely incredible, and some other parts that feel weird, or borderline unfinished. There are all kinds of new technologies, from a new operating system\nto infrared eye tracking to virtually reconstructed\nversions of you. I feel like there are so\nmany actually new things that you have to understand\nin order to get a sense of what this headset\nactually is and what it does. So I'm gonna break this\ndown into two parts. This video is all about\nusing the Vision Pro. It's everything I've\nlearned from the past week of wearing and getting used to\nthis thing every single day. But I'm also working\non a more wide ranging, poss

In [227]:
# Read the whole transcript text file into ``transcript`` as one string.
transcript = Path("transcript.txt").read_text()



In [236]:
from langchain_experimental.text_splitter import SemanticChunker

t1 = "Data science is the study of data to extract meaningful insights for business. It is a multidisciplinary approach that combines principles and practices from the fields of mathematics, statistics, artificial intelligence, and computer engineering to analyze large amounts of data. This analysis helps data scientists to ask and answer questions like what happened, why it happened, what will happen, and what can be done with the results."

t2 = "Data science is important because it combines tools, methods, and technology to generate meaning from data. Modern organizations are inundated with data; there is a proliferation of devices that can automatically collect and store information. Online systems and payment portals capture more data in the fields of e-commerce, medicine, finance, and every other aspect of human life. We have text, audio, video, and image data available in vast quantities."

t3 = "Descriptive analysis examines data to gain insights into what happened or what is happening in the data environment. It is characterized by data visualizations such as pie charts, bar charts, line graphs, tables, or generated narratives. For example, a flight booking service may record data like the number of tickets booked each day. Descriptive analysis will reveal booking spikes, booking slumps, and high-performing months for this service."


# Split the three sample paragraphs with the semantic chunker; each input
# text is paired with its own start/end metadata dict.
text_splitter = SemanticChunker(embeddings=OpenAIEmbeddings())
docs = text_splitter.create_documents(
    texts=[t1, t2, t3],
    metadatas=[
        {"start": 1, "end": 2},
        {"start": 3, "end": 4},
        {"start": 5, "end": 6},
    ],
)

In [237]:
# Display the chunks created by the semantic chunker, including their metadata.
docs

[Document(page_content='Data science is the study of data to extract meaningful insights for business. It is a multidisciplinary approach that combines principles and practices from the fields of mathematics, statistics, artificial intelligence, and computer engineering to analyze large amounts of data.', metadata={'start': 1, 'end': 2}),
 Document(page_content='This analysis helps data scientists to ask and answer questions like what happened, why it happened, what will happen, and what can be done with the results.', metadata={'start': 1, 'end': 2}),
 Document(page_content='Data science is important because it combines tools, methods, and technology to generate meaning from data. Modern organizations are inundated with data; there is a proliferation of devices that can automatically collect and store information.', metadata={'start': 3, 'end': 4}),
 Document(page_content='Online systems and payment portals capture more data in the fields of e-commerce, medicine, finance, and every ot