In [5]:
from urllib.parse import urlparse, parse_qs
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
# ===================================================================================================
def get_video_id(youtube_url):
    """Return the video ID embedded in a YouTube URL, or None.

    NOTE: ``get_video_id`` is defined a second time further down in this
    file; the later definition is the one in effect at runtime.

    Args:
        youtube_url: URL string to inspect.

    Returns:
        For ``www.youtube.com`` / ``youtube.com`` hosts, the first value of
        the ``v`` query parameter (None when it is absent). For ``youtu.be``
        hosts, the URL path without its leading slashes. None for any other
        host.
    """
    parts = urlparse(youtube_url)
    host = parts.netloc

    # Short links carry the ID in the path instead of the query string.
    if host == "youtu.be":
        return parts.path.lstrip("/")

    if host not in ("www.youtube.com", "youtube.com"):
        return None

    v_values = parse_qs(parts.query).get("v")
    return v_values[0] if v_values else None

def get_youtube_subtitles(video_url, language="en"):
    """Fetch the subtitles of a YouTube video identified by its URL.

    NOTE: ``get_youtube_subtitles`` is defined a second time further down in
    this file (taking a video ID instead of a URL); the later definition is
    the one in effect at runtime.

    Args:
        video_url: URL of the video; the ID is extracted with ``get_video_id``.
        language: Subtitle language code requested from the transcript API.

    Returns:
        On success, a dict with keys ``"text"`` (all subtitle texts joined by
        spaces), ``"with_timestamps"`` (one ``"<start>s: <text>"`` line per
        entry) and ``"video_id"``. On failure, an error message string.
    """
    video_id = get_video_id(video_url)
    if video_id:
        try:
            entries = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
            plain_text = " ".join(entry['text'] for entry in entries)
            stamped_lines = [f"{entry['start']:.2f}s: {entry['text']}" for entry in entries]
            return {
                "text": plain_text,
                "with_timestamps": "\n".join(stamped_lines),
                "video_id": video_id,
            }
        except Exception as err:
            # Any failure is reported to the caller as a message string.
            return f"Lỗi khi lấy phụ đề: {err}"

    return "Không thể trích xuất ID video từ URL."

def preprocess_text(text):
    """Clean raw subtitle text.

    Removes bracketed segments such as ``[Music]`` (shortest match, within a
    single line), collapses every whitespace run into one space and trims
    the ends.

    Args:
        text: Raw subtitle string.

    Returns:
        The cleaned string.
    """
    without_tags = re.sub(r'\[.*?\]', '', text)
    return re.sub(r'\s+', ' ', without_tags).strip()

def create_vector_store(text):
    """Split ``text`` into chunks, embed them and index them with FAISS.

    Args:
        text: The text to index.

    Returns:
        The vector store produced by ``FAISS.from_texts`` over the non-empty,
        stripped chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        separators=["\n\n", "\n", ". ", " ", ""]
    )

    # Keep only chunks that still contain something after trimming.
    chunks = []
    for piece in splitter.split_text(text):
        trimmed = piece.strip()
        if trimmed:
            chunks.append(trimmed)

    # No explicit device is passed here, so device selection is left to the
    # embedding class's defaults.
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        encode_kwargs={'batch_size': 32}
    )
    return FAISS.from_texts(chunks, embedder)

def generate_summary(text, max_length=150):
    """Summarize ``text`` piecewise with a BART summarization checkpoint.

    The text is cut into consecutive slices of 1024 *characters*; each slice
    is summarized on its own and the per-slice summaries are joined with
    spaces. No second summarization pass is run over the joined result.

    Args:
        text: The text to summarize.
        max_length: Accepted for interface compatibility; the body does not
            read it (generation uses fixed limits of 50..512 tokens).

    Returns:
        The space-joined slice summaries (an empty string for empty input).
    """
    model_name = "philschmid/bart-large-cnn-samsum"

    tokenizer = BartTokenizer.from_pretrained(model_name)
    model = BartForConditionalGeneration.from_pretrained(model_name)
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)

    def summarize_piece(piece):
        # Tokenize one slice (truncated to 1024 tokens) and decode the
        # beam-search output back to text.
        encoded = tokenizer(piece, return_tensors="pt", max_length=1024, truncation=True).to(device)
        generated = model.generate(
            encoded["input_ids"],
            max_length=512,
            min_length=50,
            num_beams=4,
            early_stopping=True
        )
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    window = 1024
    partial_summaries = []
    for start in range(0, len(text), window):
        partial_summaries.append(summarize_piece(text[start:start + window]))

    return " ".join(partial_summaries)

def setup_llm():
    """Build the chat LLM: a ``microsoft/phi-2`` text-generation pipeline.

    The model is loaded in ``torch.float16`` with ``device_map="auto"`` and
    wrapped in ``HuggingFacePipeline``.

    Returns:
        The ``HuggingFacePipeline`` instance wrapping the generation pipeline.
    """
    checkpoint = "microsoft/phi-2"

    # Sampling configuration forwarded to the text-generation pipeline.
    generation_settings = {
        "max_new_tokens": 512,
        "temperature": 0.7,
        "do_sample": True,
        "top_p": 0.95,
        "repetition_penalty": 1.2,
    }

    phi_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    phi_model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )

    text_generator = pipeline(
        "text-generation",
        model=phi_model,
        tokenizer=phi_tokenizer,
        **generation_settings
    )
    return HuggingFacePipeline(pipeline=text_generator)

def create_chat_chain(vector_store, llm):
    """Create a retrieval chain over ``vector_store`` answered by ``llm``.

    The chain is built without a memory object (``memory=None``), so the
    caller supplies ``chat_history`` on every invocation.

    Args:
        vector_store: Store exposing ``as_retriever``; 5 results are
            requested per query.
        llm: Language model handed to the chain.

    Returns:
        The object returned by ``ConversationalRetrievalChain.from_llm``.
    """
    top_k_retriever = vector_store.as_retriever(search_kwargs={"k": 5})
    chain_options = {
        "llm": llm,
        "retriever": top_k_retriever,
        "memory": None,
        "return_source_documents": False,
    }
    return ConversationalRetrievalChain.from_llm(**chain_options)
# ===================================================================================================
def get_video_id(youtube_url):
    """Extract the video ID from a YouTube URL.

    Recognized forms:
      * watch URLs on ``youtube.com`` or any of its subdomains
        (``www.``, ``m.``, ``music.`` ...): the ``v`` query parameter;
      * ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>``
        paths on those same hosts;
      * ``youtu.be/<id>`` short links.

    Surrounding whitespace is ignored and the scheme may be omitted
    (``youtube.com/watch?v=...``).

    Args:
        youtube_url: URL string to inspect.

    Returns:
        The video ID string, or None when the input is not a string, is
        empty or malformed, or does not point at a recognized YouTube URL.
    """
    if not isinstance(youtube_url, str):
        return None
    candidate = youtube_url.strip()
    if not candidate:
        return None

    try:
        parsed_url = urlparse(candidate)
        if not parsed_url.netloc:
            # Scheme-less input ("youtube.com/watch?v=...") is parsed as a
            # bare path; re-parse it as a network-location reference.
            parsed_url = urlparse("//" + candidate)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. a broken IPv6 host).
        return None

    # ``hostname`` is lower-cased and excludes any port or credentials.
    host = parsed_url.hostname or ""
    segments = [segment for segment in parsed_url.path.split("/") if segment]

    if host == "youtu.be":
        return segments[0] if segments else None

    if host == "youtube.com" or host.endswith(".youtube.com"):
        from_query = parse_qs(parsed_url.query).get("v", [None])[0]
        if from_query:
            return from_query
        # Path-based forms carry the ID right after a fixed prefix segment.
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

    return None

def list_available_subtitles(video_id):
    """List the subtitle language codes available for a video.

    Args:
        video_id: YouTube video ID.

    Returns:
        A list with the ``language_code`` of every transcript reported by the
        transcript API, or an error message string when subtitles are
        disabled or any other error occurs.
    """
    try:
        codes = []
        for transcript in YouTubeTranscriptApi.list_transcripts(video_id):
            codes.append(transcript.language_code)
    except TranscriptsDisabled:
        return "Phụ đề bị tắt trên video này."
    except Exception as err:
        return f"Lỗi khi lấy danh sách phụ đề: {err}"
    return codes

def get_youtube_subtitles(video_id, language):
    """Fetch the subtitles of a video in the requested language.

    Args:
        video_id: YouTube video ID.
        language: Subtitle language code requested from the transcript API.

    Returns:
        On success, a dict with keys ``"text"`` (all subtitle texts joined by
        spaces), ``"with_timestamps"`` (one ``"<start>s: <text>"`` line per
        entry) and ``"video_id"``. On failure, an error message string.
    """
    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id, languages=[language])
        plain_text = " ".join(entry['text'] for entry in entries)
        stamped_text = "\n".join(
            f"{entry['start']:.2f}s: {entry['text']}" for entry in entries
        )
    except NoTranscriptFound:
        return f"Không tìm thấy phụ đề cho ngôn ngữ '{language}'."
    except Exception as err:
        return f"Lỗi khi lấy phụ đề: {err}"

    # Reached only when the transcript was fetched and formatted successfully.
    return {
        "text": plain_text,
        "with_timestamps": stamped_text,
        "video_id": video_id,
    }

def process_video(video_url):
    """Run the interactive subtitle -> summary -> chat workflow for one video.

    Steps: extract the video ID, list the available subtitle languages, let
    the user pick English or Vietnamese, fetch and print the subtitles, build
    a vector store, print a summary, then answer questions in a loop until
    the user types ``exit``.

    Every failure along the way is printed and ends the function early; in
    particular, an error message returned by ``get_youtube_subtitles`` no
    longer falls through to the processing steps.

    Args:
        video_url: URL of the YouTube video.

    Returns:
        None. All results are printed.
    """
    print("Đang lấy danh sách phụ đề có sẵn...")
    video_id = get_video_id(video_url)
    if not video_id:
        print("Không thể trích xuất ID video từ URL.")
        return

    available_languages = list_available_subtitles(video_id)
    # A string result is an error message, not a list of language codes.
    if isinstance(available_languages, str):
        print(available_languages)
        return

    # Only English and Vietnamese are handled by this workflow.
    valid_languages = [lang for lang in ["en", "vi"] if lang in available_languages]
    if not valid_languages:
        print("Không có phụ đề tiếng Anh hoặc tiếng Việt cho video này.")
        return

    print("Các phụ đề có sẵn:(thiệt ra là có nhiều hơn nhưng chỉ xử lý cho en với vi thôi nha)", valid_languages)
    chosen_language = input("Chọn ngôn ngữ phụ đề (en/vi): ").strip()
    if chosen_language not in valid_languages:
        print("Ngôn ngữ không hợp lệ hoặc không có phụ đề cho lựa chọn đó.")
        return

    print("Đang lấy phụ đề...")
    subtitles_data = get_youtube_subtitles(video_id, chosen_language)
    if isinstance(subtitles_data, str):
        # Error message: report it and stop instead of indexing into a str.
        print(subtitles_data)
        return

    print("Phụ đề:")
    print(subtitles_data["text"])

    processed_text = preprocess_text(subtitles_data["text"])
    print("Đang tạo vector store...")
    vector_store = create_vector_store(processed_text)

    print("Đang tạo tóm tắt nội dung video...")
    # generate_summary's second parameter is max_length, not a language code.
    summary = generate_summary(processed_text)

    print("\n=== TÓM TẮT NỘI DUNG VIDEO ===")
    print(summary)
    print("===============================\n")

    print("Đang khởi tạo mô hình chat...")
    # setup_llm() takes no arguments and create_chat_chain() takes exactly
    # (vector_store, llm); passing the language to either raises TypeError.
    llm = setup_llm()
    chain = create_chat_chain(vector_store, llm)

    print("\nBạn có thể bắt đầu chat về nội dung video. Gõ 'exit' để thoát.")

    while True:
        query = input("\nBạn: ")
        if query.strip().lower() == 'exit':
            break
        if not query.strip():
            # Nothing to ask; prompt again instead of querying the model.
            continue

        formatted_query = f"""
            Based on the YouTube video subtitles, please answer the following question:
            
            Question: {query}
            
            Only respond using the information found in the video subtitles. If the information is not mentioned, state that it is not available in the video.
            """

        # The chain has no memory, so an (empty) history is passed each turn.
        response = chain({"question": formatted_query, "chat_history": []})
        print(f"\nAssistant: {response['answer']}")
    

# Interactive entry point. Guarded so that importing this module does not
# start prompting for input; in a notebook or when run as a script
# ``__name__`` is ``"__main__"`` and the app starts as before.
if __name__ == "__main__":
    print("=== YouTube Video Chat App ===")
    # Pasted links often carry stray leading/trailing whitespace.
    video_url = input("Nhập link video YouTube: ").strip()
    process_video(video_url)


=== YouTube Video Chat App ===


Nhập link video YouTube:  https://www.youtube.com/watch?v=Up6tk1hliIM&t=113s


Đang lấy danh sách phụ đề có sẵn...
Các phụ đề có sẵn: ['en']


Chọn ngôn ngữ phụ đề (en/vi):  en


Đang lấy phụ đề...
Phụ đề:
[Applause] as you already know the Assassins bullet came within a quarter of an inch of taking my life so many people have asked me what happened tell us what happened please and therefore I will tell you exactly what happened and you'll never hear it from me a second time because it's actually too painful to tell it was a warm beautiful day in the Earth early evening in Butler Township in the great Commonwealth of [Music] Pennsylvania music was loudly playing and the campaign was doing really well I went to the stage and the crowd was cheering wildly everybody was happy I began speaking very strongly powerfully and happily because I was discussing the great job my Administration did on immigration at the southern border we were very proud of it behind me and to the right was a large screen that was displaying a chart of border crossings under my leadership the numbers were absolutely amazing in order to see the chart I started to like this turn to my right a