In [None]:
import yt_dlp
import os

# Step 1: Download Audio from YouTube with Cookies
def download_audio(youtube_url, output_path="audio.mp3", cookiefile="cookies.txt"):
    """Download the best audio stream of a YouTube video and convert it to MP3.

    Args:
        youtube_url: URL of the video to download.
        output_path: Destination path of the MP3 file.
        cookiefile: Path of the cookies file passed to yt-dlp.
    """
    # Strip only a trailing ".mp3" so yt-dlp can append the extension itself;
    # str.replace would also rewrite ".mp3" occurring elsewhere in the path.
    if output_path.endswith('.mp3'):
        base_path = output_path[:-len('.mp3')]
    else:
        base_path = output_path

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': base_path,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'cookiefile': cookiefile  # Use cookies to bypass restrictions
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

    # Fix the file name if it ends up with a doubled extension (audio.mp3.mp3)
    doubled_path = output_path + '.mp3'
    if os.path.exists(doubled_path):
        os.replace(doubled_path, output_path)

# Example run: download the audio of a sample video using download_audio's
# default output path. Replace the URL below with your own YouTube link.
youtube_url = "https://youtu.be/sK8SILOM37I"
download_audio(youtube_url)

In [None]:
#!pip install yt_dlp

In [None]:
from IPython.display import Audio

# Path of the MP3 file to play (same name as download_audio's default output path)
audio_file = "audio.mp3"

# The Audio object is the last expression of the cell, so it becomes the cell output
Audio(audio_file)

In [None]:
#!pip install youtube_transcript_api

In [None]:
#!pip install pydub

In [None]:
#!pip install SpeechRecognition

In [None]:
#!pip install pytube

In [None]:
pip install deepmultilingualpunctuation

In [None]:
import re
import urllib.parse
import requests
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import speech_recognition as sr
from pydub import AudioSegment
from deepmultilingualpunctuation import PunctuationModel
import os
# Module-level punctuation model, created once when this cell runs.
# NOTE(review): the functions below assign their own local `model`, which
# shadows this instance inside those functions.
model = PunctuationModel()

def extract_video_id(video_url):
    """
    Returns the video ID found in a YouTube URL, or None when none is found.

    A ``v`` query parameter takes precedence; otherwise youtu.be, /embed/ and
    /shorts/ style links are matched.
    """
    query = urllib.parse.parse_qs(urllib.parse.urlparse(video_url).query)
    v_values = query.get("v")
    if v_values is not None:
        return v_values[0]

    path_match = re.search(r"(youtu\.be/|youtube\.com/embed/|youtube\.com/shorts/)([\w-]+)", video_url)
    return path_match.group(2) if path_match else None

def download_audio(video_url):
    """
    Downloads the audio using yt-dlp with cookies and returns the file path.

    Returns:
        "audio.mp3" on success, otherwise a message starting with
        "Error downloading audio:".
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio.%(ext)s',
        'cookiefile': 'cookies (1).txt',  # Use the exported cookies
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    try:
        # Imported locally so this cell does not rely on an earlier cell
        # having imported yt_dlp; a failed import is reported like any other error.
        import yt_dlp

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(video_url, download=True)
    except Exception as e:
        return f"Error downloading audio: {str(e)}"
    # The output template plus the MP3 post-processor setting give this name.
    return "audio.mp3"

def convert_audio_to_wav(audio_file):
    """
    Re-encodes an MP3 file as "audio.wav" with pydub.

    Returns the WAV file name, or a message starting with
    "Error converting to WAV:" when loading or exporting fails.
    """
    target = "audio.wav"
    try:
        segment = AudioSegment.from_mp3(audio_file)
        segment.export(target, format="wav")
    except Exception as exc:
        return f"Error converting to WAV: {str(exc)}"
    return target

def transcribe_audio(audio_path, chunk_length=30, punctuation_model=None):
    """
    Splits audio into smaller chunks, transcribes each chunk separately,
    and adds punctuation using deepmultilingualpunctuation library.

    Args:
        audio_path: Path of the WAV file to transcribe.
        chunk_length: Chunk length in seconds (must be positive).
        punctuation_model: Optional object exposing ``restore_punctuation(text)``;
            a new PunctuationModel is created when omitted.

    Returns:
        The punctuated transcript ("" when nothing was transcribed), or a message
        starting with "Error with the speech recognition service:".

    Raises:
        ValueError: If chunk_length does not amount to at least one millisecond.
    """
    chunk_ms = int(chunk_length * 1000)
    if chunk_ms <= 0:
        raise ValueError("chunk_length must be a positive number of seconds")

    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_path)
    total_ms = len(audio)
    chunk_file = "chunk.wav"
    transcribed_text = []

    # Work in milliseconds over the whole clip so the trailing fraction of a
    # second is not dropped; a tail shorter than one second is folded into the
    # previous chunk instead of being recognized on its own.
    starts = list(range(0, total_ms, chunk_ms))
    if len(starts) > 1 and total_ms - starts[-1] < 1000:
        starts.pop()

    print("Transcribing audio in chunks...")

    try:
        for index, start_ms in enumerate(starts):
            end_ms = starts[index + 1] if index + 1 < len(starts) else total_ms
            audio[start_ms:end_ms].export(chunk_file, format="wav")  # Save chunk temporarily

            with sr.AudioFile(chunk_file) as source:
                audio_data = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_data)
            except sr.UnknownValueError:
                text = "[Unintelligible]"
            except sr.RequestError as e:
                return f"Error with the speech recognition service: {str(e)}"
            transcribed_text.append(text)
    finally:
        # Always clean up the temporary chunk, including on early return;
        # it may not exist when the audio was empty.
        if os.path.exists(chunk_file):
            os.remove(chunk_file)

    # Combine chunks and add punctuation
    combined_text = " ".join(transcribed_text)
    if not combined_text.strip():
        return ""
    if punctuation_model is None:
        punctuation_model = PunctuationModel()
    return punctuation_model.restore_punctuation(combined_text)

def get_transcript_unlisted(video_url):
    """
    Tries to fetch the transcript using youtube_transcript_api first,
    then falls back to downloading and transcribing audio if necessary.

    Returns:
        The punctuated transcript, "Invalid YouTube URL." when no video ID can
        be extracted, or the error message of the failing fallback step.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        return "Invalid YouTube URL."

    # Load the punctuation model only after the URL has been validated.
    model = PunctuationModel()

    # Try API path with punctuation
    # NOTE(review): get_transcript is the call used by the original code; confirm
    # it still exists in the installed youtube_transcript_api version.
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        raw_text = " ".join(item['text'] for item in transcript)
        return model.restore_punctuation(raw_text)
    except Exception:
        # Exception (not a bare except) so Ctrl-C / kernel interrupts still propagate.
        print("Transcript not available via API, attempting audio transcription...")

    # Download and transcribe audio if no transcript is available
    audio_file = download_audio(video_url)
    if audio_file.startswith("Error"):
        return audio_file

    wav_file = None
    try:
        wav_file = convert_audio_to_wav(audio_file)
        if wav_file.startswith("Error"):
            failure, wav_file = wav_file, None
            return failure
        return transcribe_audio(wav_file)
    finally:
        # Cleanup temporary files on every exit path
        for temp_path in (audio_file, wav_file):
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)

# Example usage
if __name__ == "__main__":
    video_url = input("Enter the YouTube video URL: ").strip()
    transcript = get_transcript_unlisted(video_url)

    # Failures come back as messages with known prefixes. Matching the prefix
    # (rather than the word "Error" anywhere) keeps transcripts that merely
    # contain that word from being treated as failures.
    failure_prefixes = (
        "Error downloading audio:",
        "Error converting to WAV:",
        "Error with the speech recognition service:",
        "Invalid YouTube URL.",
    )

    # Save transcript to a text file
    if not transcript.startswith(failure_prefixes):
        output_file = "transcript.txt"
        with open(output_file, "w", encoding="utf-8") as file:
            file.write(transcript)
        print(f"\nTranscript saved successfully to {output_file}")
    else:
        print("\n", transcript)

In [None]:
import re
import os

def format_transcript_sentences(input_file, output_file=None):
    """
    Rewrites a transcript so that every sentence sits on its own line.

    Newlines in the input are replaced by spaces, the text is split after
    '.', '!' or '?' followed by spaces, and the first character of each
    sentence is upper-cased.

    Args:
        input_file: Path to the original transcript file
        output_file: Path for formatted file (default: adds '_formatted' suffix)

    Returns:
        Path to the formatted file or error message
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as source:
            flattened = source.read().replace('\n', ' ')

        # Split after sentence-ending punctuation, then drop empty pieces
        pieces = (part.strip() for part in re.split(r'(?<=[.!?]) +', flattened))
        lines = [part[0].upper() + part[1:] for part in pieces if part]

        # Derive the output name from the input name when none was given
        if not output_file:
            stem, suffix = os.path.splitext(input_file)
            output_file = f"{stem}_formatted{suffix}"

        with open(output_file, 'w', encoding='utf-8') as target:
            target.write('\n'.join(lines))

        return output_file

    except FileNotFoundError:
        return f"Error: File '{input_file}' not found"
    except Exception as exc:
        return f"Error processing file: {str(exc)}"

# Example usage
if __name__ == "__main__":
    input_path = input("Enter path to transcript file: ").strip()
    result = format_transcript_sentences(input_path)

    # format_transcript_sentences reports failures with these prefixes; matching
    # the prefix avoids misreading an output path that contains "Error".
    if result.startswith(("Error: File '", "Error processing file: ")):
        print(f"\n{result}")
    else:
        print(f"\nFormatted transcript saved to: {result}")
        print("\nFirst 5 lines of formatted text:")
        with open(result, 'r', encoding='utf-8') as f:
            print(''.join(f.readlines()[:5]))

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

def semantic_segmentation(input_file, output_file=None, min_length=3, threshold=0.65, max_length=5):
    """
    Segments text into meaningful chunks with semantic coherence and keywords.

    Consecutive sentences are appended to the current segment while their
    embedding stays similar to the segment's mean embedding and the segment
    has fewer than ``max_length`` sentences. Segments shorter than
    ``min_length`` are merged into the previous segment instead of being lost.

    Args:
        input_file: Path to formatted transcript file (one sentence per line)
        output_file: Output path (default: adds '_segmented' suffix)
        min_length: Minimum sentences per segment
        threshold: Semantic similarity threshold (0-1)
        max_length: Sentence count at which a segment stops growing by similarity

    Returns:
        Path to segmented file or error message (error messages start with "Error")
    """
    # Local import: this cell does not import os itself.
    import os

    try:
        # Load ML models
        encoder = SentenceTransformer('all-MiniLM-L6-v2')

        # Read and split sentences
        with open(input_file, 'r', encoding='utf-8') as f:
            sentences = [line.strip() for line in f if line.strip()]

        if len(sentences) < min_length:
            # Prefixed with "Error" so callers that look for that word detect it.
            return f"Error: Need at least {min_length} sentences for segmentation"

        # Generate sentence embeddings
        embeddings = encoder.encode(sentences)

        segments = []

        def close_segment(segment):
            # Keep a long-enough segment; fold a short one into its predecessor.
            if segments and len(segment) < min_length:
                segments[-1].extend(segment)
            else:
                segments.append(segment)

        current_segment = []
        current_emb = None
        for sent, emb in zip(sentences, embeddings):
            emb = emb.reshape(1, -1)
            if current_segment:
                similarity = cosine_similarity(current_emb, emb)[0][0]
                if similarity >= threshold and len(current_segment) < max_length:
                    # Running mean: weight the old mean by the size BEFORE appending.
                    size = len(current_segment)
                    current_emb = (current_emb * size + emb) / (size + 1)
                    current_segment.append(sent)
                    continue
                close_segment(current_segment)
            current_segment = [sent]
            current_emb = emb

        # Finalize remaining sentences
        if current_segment:
            close_segment(current_segment)

        # Extract keywords for each segment
        results = []
        for seg in segments:
            try:
                vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
                weights = vectorizer.fit_transform([' '.join(seg)]).toarray()[0]
                features = vectorizer.get_feature_names_out()
                # Three highest-weighted terms, best first
                keywords = [str(features[idx]) for idx in np.argsort(weights)[::-1][:3]]
            except ValueError:
                # Raised when the segment has no usable vocabulary (e.g. only stop words)
                keywords = []

            results.append({
                'sentences': seg,
                'keywords': keywords,
                'count': len(seg)
            })

        # Create output filename
        if not output_file:
            base, ext = os.path.splitext(input_file)
            output_file = f"{base}_segmented{ext}"

        # Write segmented output (UTF-8, matching how the input is read)
        with open(output_file, "w", encoding="utf-8") as f:
            for i, seg in enumerate(results, 1):
                f.write(f"Segment {i} ({seg['count']} sentences | Keywords: {', '.join(seg['keywords'])})\n")
                f.write('\n'.join(seg['sentences']) + '\n\n')

        return output_file

    except Exception as e:
        return f"Error during segmentation: {str(e)}"

# Example usage
if __name__ == "__main__":
    input_path = input("Enter path to formatted transcript file: ").strip()
    result = semantic_segmentation(input_path)

    # Every failure message of semantic_segmentation starts with one of these
    # prefixes, including the "too few sentences" message that lacks the
    # word "Error" in its original form.
    failure_prefixes = ("Error during segmentation:", "Error: Need at least", "Need at least")
    if result.startswith(failure_prefixes):
        print(f"\n{result}")
    else:
        print(f"\nSegmented transcript saved to: {result}")

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import os

def semantic_segmentation(input_file, output_file=None, min_length=3, threshold=0.65, max_length=5):
    """
    Segments text into meaningful chunks with semantic coherence and keywords.

    Consecutive sentences are appended to the current segment while their
    embedding stays similar to the segment's mean embedding and the segment
    has fewer than ``max_length`` sentences. Segments shorter than
    ``min_length`` are merged into the previous segment when there is one.

    Args:
        input_file: Path to formatted transcript file (one sentence per line)
        output_file: Output path (default: adds '_segmented' suffix)
        min_length: Minimum sentences per segment
        threshold: Semantic similarity threshold (0-1)
        max_length: Sentence count at which a segment stops growing by similarity

    Returns:
        Path to segmented file or error message (error messages start with "Error")
    """
    try:
        # Load ML models
        encoder = SentenceTransformer('all-MiniLM-L6-v2')

        # Read and split sentences (undecodable bytes are skipped)
        with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
            sentences = [line.strip() for line in f if line.strip()]

        if len(sentences) < min_length:
            # Prefixed with "Error" so callers that look for that word detect it.
            return f"Error: Need at least {min_length} sentences for segmentation"

        # Generate sentence embeddings
        embeddings = encoder.encode(sentences)

        segments = []

        def close_segment(segment):
            # Keep a long-enough segment; fold a short one into its predecessor.
            if segments and len(segment) < min_length:
                segments[-1].extend(segment)
            else:
                segments.append(segment)

        current_segment = []
        current_emb = None
        for sent, emb in zip(sentences, embeddings):
            emb = emb.reshape(1, -1)
            if current_segment:
                similarity = cosine_similarity(current_emb, emb)[0][0]
                if similarity >= threshold and len(current_segment) < max_length:
                    # Running mean: weight the old mean by the size BEFORE appending.
                    size = len(current_segment)
                    current_emb = (current_emb * size + emb) / (size + 1)
                    current_segment.append(sent)
                    continue
                close_segment(current_segment)
            current_segment = [sent]
            current_emb = emb

        # Finalize the trailing segment after the loop, so a single-sentence
        # input is finalized as well.
        if current_segment:
            close_segment(current_segment)

        # Extract keywords for each segment
        results = []
        for seg in segments:
            try:
                vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
                weights = vectorizer.fit_transform([' '.join(seg)]).toarray()[0]
                features = vectorizer.get_feature_names_out()
                # Three highest-weighted terms, best first
                keywords = [str(features[idx]) for idx in np.argsort(weights)[::-1][:3]]
            except ValueError:
                # Raised when the segment has no usable vocabulary (e.g. only stop words)
                keywords = []

            results.append({
                'sentences': seg,
                'keywords': keywords,
                'count': len(seg)
            })

        # Create output filename
        if not output_file:
            base, ext = os.path.splitext(input_file)
            output_file = f"{base}_segmented{ext}"

        # Write segmented output (UTF-8, matching how the input is read)
        with open(output_file, "w", encoding="utf-8") as f:
            for i, seg in enumerate(results, 1):
                f.write(f"Segment {i} ({seg['count']} sentences | Keywords: {', '.join(seg['keywords'])})\n")
                f.write('\n'.join(seg['sentences']) + '\n\n')

        return output_file

    except Exception as e:
        return f"Error during segmentation: {str(e)}"

# Example usage
if __name__ == "__main__":
    input_path = input("Enter path to formatted transcript file: ").strip()
    result = semantic_segmentation(input_path)

    # Every failure message of semantic_segmentation starts with one of these
    # prefixes, including the "too few sentences" message that lacks the
    # word "Error" in its original form.
    failure_prefixes = ("Error during segmentation:", "Error: Need at least", "Need at least")
    if result.startswith(failure_prefixes):
        print(f"\n{result}")
    else:
        print(f"\nSegmented transcript saved to: {result}")

In [None]:
# Transcription with timestamps: same pipeline as above, plus start/end times for each caption

In [None]:
import re
import urllib.parse
import requests
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import speech_recognition as sr
from pydub import AudioSegment
from deepmultilingualpunctuation import PunctuationModel
import os
# Module-level punctuation model, created once when this cell runs.
# NOTE(review): the functions below assign their own local `model`, which
# shadows this instance inside those functions.
model = PunctuationModel()

def extract_video_id(video_url):
    """
    Returns the video ID found in a YouTube URL, or None when none is found.

    The ``v`` query parameter is checked first, then youtu.be, /embed/ and
    /shorts/ style links.
    """
    params = urllib.parse.parse_qs(urllib.parse.urlparse(video_url).query)
    try:
        return params["v"][0]
    except KeyError:
        pass

    found = re.search(r"(youtu\.be/|youtube\.com/embed/|youtube\.com/shorts/)([\w-]+)", video_url)
    if found is None:
        return None
    return found.group(2)

def download_audio(video_url):
    """
    Downloads the audio using yt-dlp with cookies and returns the file path.

    Returns:
        "audio.mp3" on success, otherwise a message starting with
        "Error downloading audio:".
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio.%(ext)s',
        'cookiefile': 'cookies (1).txt',  # Use the exported cookies
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    try:
        # Imported locally so this cell does not rely on an earlier cell
        # having imported yt_dlp; a failed import is reported like any other error.
        import yt_dlp

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(video_url, download=True)
    except Exception as e:
        return f"Error downloading audio: {str(e)}"
    # The output template plus the MP3 post-processor setting give this name.
    return "audio.mp3"

def convert_audio_to_wav(audio_file):
    """
    Re-encodes an MP3 file as "audio.wav" with pydub.

    Returns the WAV file name, or a message starting with
    "Error converting to WAV:" when loading or exporting fails.
    """
    target = "audio.wav"
    try:
        segment = AudioSegment.from_mp3(audio_file)
        segment.export(target, format="wav")
    except Exception as exc:
        return f"Error converting to WAV: {str(exc)}"
    return target

def transcribe_audio(audio_path, chunk_length=30, punctuation_model=None):
    """
    Splits audio into smaller chunks, transcribes each chunk separately,
    and adds punctuation using deepmultilingualpunctuation library.

    Args:
        audio_path: Path of the WAV file to transcribe.
        chunk_length: Chunk length in seconds (must be positive).
        punctuation_model: Optional object exposing ``restore_punctuation(text)``;
            a new PunctuationModel is created when omitted.

    Returns:
        The punctuated transcript ("" when nothing was transcribed), or a message
        starting with "Error with the speech recognition service:".

    Raises:
        ValueError: If chunk_length does not amount to at least one millisecond.
    """
    chunk_ms = int(chunk_length * 1000)
    if chunk_ms <= 0:
        raise ValueError("chunk_length must be a positive number of seconds")

    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_path)
    total_ms = len(audio)
    chunk_file = "chunk.wav"
    transcribed_text = []

    # Work in milliseconds over the whole clip so the trailing fraction of a
    # second is not dropped; a tail shorter than one second is folded into the
    # previous chunk instead of being recognized on its own.
    starts = list(range(0, total_ms, chunk_ms))
    if len(starts) > 1 and total_ms - starts[-1] < 1000:
        starts.pop()

    print("Transcribing audio in chunks...")

    try:
        for index, start_ms in enumerate(starts):
            end_ms = starts[index + 1] if index + 1 < len(starts) else total_ms
            audio[start_ms:end_ms].export(chunk_file, format="wav")  # Save chunk temporarily

            with sr.AudioFile(chunk_file) as source:
                audio_data = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_data)
            except sr.UnknownValueError:
                text = "[Unintelligible]"
            except sr.RequestError as e:
                return f"Error with the speech recognition service: {str(e)}"
            transcribed_text.append(text)
    finally:
        # Always clean up the temporary chunk, including on early return;
        # it may not exist when the audio was empty.
        if os.path.exists(chunk_file):
            os.remove(chunk_file)

    # Combine chunks and add punctuation
    combined_text = " ".join(transcribed_text)
    if not combined_text.strip():
        return ""
    if punctuation_model is None:
        punctuation_model = PunctuationModel()
    return punctuation_model.restore_punctuation(combined_text)

def get_transcript_unlisted(video_url):
    """
    Tries to fetch the transcript using youtube_transcript_api first,
    then falls back to downloading and transcribing audio if necessary.

    On the API path every caption is prefixed with "[start-end]" timestamps
    before punctuation is restored; the audio fallback has no timestamps.

    Returns:
        The transcript text, "Invalid YouTube URL." when no video ID can be
        extracted, or the error message of the failing fallback step.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        return "Invalid YouTube URL."

    # Load the punctuation model only after the URL has been validated.
    model = PunctuationModel()

    # Try API path with punctuation and timestamps
    # NOTE(review): get_transcript is the call used by the original code; confirm
    # it still exists in the installed youtube_transcript_api version.
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        formatted_transcript = []
        for item in transcript:
            start_time = convert_time(item['start'])
            end_time = convert_time(item['start'] + item['duration'])
            formatted_transcript.append(f"[{start_time}-{end_time}] {item['text']}")
        # NOTE(review): the timestamp tokens are passed through the punctuation
        # model together with the text; verify they come back unchanged.
        return model.restore_punctuation(" ".join(formatted_transcript))
    except Exception:
        # Exception (not a bare except) so Ctrl-C / kernel interrupts still propagate.
        print("Transcript not available via API, attempting audio transcription...")

    # Download and transcribe audio if no transcript is available
    audio_file = download_audio(video_url)
    if audio_file.startswith("Error"):
        return audio_file

    wav_file = None
    try:
        wav_file = convert_audio_to_wav(audio_file)
        if wav_file.startswith("Error"):
            failure, wav_file = wav_file, None
            return failure
        # For audio fallback, timestamps are not directly available
        return transcribe_audio(wav_file)
    finally:
        # Cleanup temporary files on every exit path
        for temp_path in (audio_file, wav_file):
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)

def convert_time(seconds):
    """Converts seconds to hrs:mins:seconds format (HH:MM:SS.ss).

    The value is rounded to hundredths of a second first and only then split
    into fields, so a value such as 59.999 carries into the minutes field
    instead of being rendered with "60.00" seconds.
    """
    total_centis = int(round(seconds * 100))
    hrs, remainder = divmod(total_centis, 3600 * 100)
    mins, centis = divmod(remainder, 60 * 100)
    return f"{hrs:02d}:{mins:02d}:{centis // 100:02d}.{centis % 100:02d}"

# Example usage (run once; the block was previously duplicated, which asked
# for the URL twice and wrote transcript.txt twice)
if __name__ == "__main__":
    video_url = input("Enter the YouTube video URL: ").strip()
    transcript = get_transcript_unlisted(video_url)

    # Failures come back as messages with known prefixes. Matching the prefix
    # (rather than the word "Error" anywhere) keeps transcripts that merely
    # contain that word from being treated as failures.
    failure_prefixes = (
        "Error downloading audio:",
        "Error converting to WAV:",
        "Error with the speech recognition service:",
        "Invalid YouTube URL.",
    )

    # Save transcript to a text file
    if not transcript.startswith(failure_prefixes):
        output_file = "transcript.txt"
        with open(output_file, "w", encoding="utf-8") as file:
            file.write(transcript)
        print(f"\nTranscript saved successfully to {output_file}")
    else:
        print("\n", transcript)

In [None]:
pip install nltk scikit-learn

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
import numpy as np
# Ensure required NLTK resources are downloaded (each one requested once)
for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(resource)

# Function to read transcript from a .txt file
def read_transcript(file_path):
    """Return the whole content of the UTF-8 text file at file_path."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

# Function to split transcript into individual sentences
def split_into_sentences(transcript):
    """Split the transcript text into sentences with nltk.sent_tokenize."""
    return nltk.sent_tokenize(transcript)

# Function to compute cosine similarity between sentence pairs
def compute_cosine_similarity(sentences):
    """Return the pairwise cosine-similarity matrix of the sentences' TF-IDF vectors."""
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    return cosine_similarity(tfidf_matrix)

# Function to find similar sentence triplets based on cosine similarity
def find_similar_triplets(sentences, similarity_matrix, threshold=0.5):
    """Return every sentence triplet whose three pairwise similarities exceed threshold.

    Triplets are produced in the order of itertools.combinations over the
    sentence indices; each one is a list of three sentences.
    """
    def mutually_similar(a, b, c):
        # All three pairs inside the triplet must be above the threshold
        return (similarity_matrix[a][b] > threshold
                and similarity_matrix[b][c] > threshold
                and similarity_matrix[a][c] > threshold)

    return [
        [sentences[a], sentences[b], sentences[c]]
        for a, b, c in combinations(range(len(sentences)), 3)
        if mutually_similar(a, b, c)
    ]

# Function to write the segmented sentences to a new .txt file
def write_segments_to_file(triplets, output_file):
    """Write each triplet as a numbered "Segment N:" block followed by a blank line."""
    with open(output_file, 'w', encoding='utf-8') as out:
        number = 0
        for triplet in triplets:
            number += 1
            out.write(f"Segment {number}:\n")
            out.writelines(sentence + "\n" for sentence in triplet)
            out.write("\n")

def main(input_file='/content/transcript (5).txt',
         output_file='segmented_transcript.txt',
         threshold=0.5):
    """Read a transcript, find mutually similar sentence triplets and save them.

    Args:
        input_file: Path of the transcript text file.
        output_file: Path of the file the segments are written to.
        threshold: Similarity every pair inside a triplet must exceed.
    """
    # Reading and processing transcript
    transcript = read_transcript(input_file)
    sentences = split_into_sentences(transcript)

    # Fewer than three sentences cannot form a triplet; skip the vectorizer,
    # which raises on an empty vocabulary.
    if len(sentences) < 3:
        triplets = []
    else:
        similarity_matrix = compute_cosine_similarity(sentences)
        # Finding similar sentence triplets
        triplets = find_similar_triplets(sentences, similarity_matrix, threshold=threshold)

    # Writing segments to output file
    write_segments_to_file(triplets, output_file)

    print(f"Segmented transcript saved to {output_file}")

if __name__ == "__main__":
    main()

In [None]:
pip install nltk scikit-learn numpy

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [None]:
# Download the NLTK resources requested by this notebook.
# The sentence splitting below goes through sent_tokenize; the remaining
# resources are fetched here but not referenced by the functions in the next cell.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
def read_transcript(file_path):
    """Return the whole content of the transcript file at file_path.

    The file is decoded as UTF-8, like the other transcript readers and
    writers in this notebook, instead of the platform default encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        transcript = file.read()
    return transcript

def split_into_sentences(transcript):
    """Split the transcript text into sentences with NLTK's sent_tokenize."""
    sentences = sent_tokenize(transcript)
    return sentences

def calculate_cosine_similarity(sentences):
    """Return the pairwise cosine-similarity matrix of the sentences' TF-IDF vectors."""
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    return cosine_similarity(tfidf_matrix)

def group_sentences(sentences, cosine_sim, min_sentences=3, max_sentences=10, similarity_threshold=0.5):
    """Greedily group sentences that are similar to a seed sentence.

    Each not-yet-used sentence becomes a seed; later unused sentences whose
    similarity to the seed exceeds ``similarity_threshold`` join its group
    until ``max_sentences`` is reached. Groups smaller than ``min_sentences``
    are discarded (their sentences stay marked as used).

    Args:
        sentences: List of sentences.
        cosine_sim: Square similarity matrix indexed like ``sentences``.
        min_sentences: Minimum group size to keep.
        max_sentences: Size at which a group stops growing.
        similarity_threshold: Similarity a sentence must exceed to join a group.

    Returns:
        List of groups, each a list of sentences.
    """
    grouped_sentences = []
    used_indices = set()

    for i in range(len(sentences)):
        if i in used_indices:
            continue
        group = [sentences[i]]
        used_indices.add(i)
        for j in range(i + 1, len(sentences)):
            if j in used_indices:
                continue
            if cosine_sim[i][j] > similarity_threshold:
                group.append(sentences[j])
                used_indices.add(j)
                if len(group) >= max_sentences:
                    break
        if len(group) >= min_sentences:
            grouped_sentences.append(group)
    return grouped_sentences

def process_transcript(file_path):
    """Read a transcript file and return its groups of similar sentences.

    Returns an empty list when the transcript contains no sentences, instead
    of letting the TF-IDF vectorizer fail on empty input.
    """
    transcript = read_transcript(file_path)
    sentences = split_into_sentences(transcript)
    if not sentences:
        return []
    cosine_sim = calculate_cosine_similarity(sentences)
    grouped_sentences = group_sentences(sentences, cosine_sim)
    return grouped_sentences

In [None]:
# Transcript to process; the path is hard-coded, adjust it to your own file.
file_path = '/content/transcript (5).txt'
grouped_sentences = process_transcript(file_path)

# Print every group under a 1-based "Segment N:" header, one sentence per line
for i, group in enumerate(grouped_sentences):
    print(f"Segment {i + 1}:")
    for sentence in group:
        print(sentence)
    print("\n")

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Download NLTK data. 'punkt_tab' is fetched as well, as in the earlier cells:
# newer NLTK releases load the sentence tokenizer from it.
nltk.download('punkt')
nltk.download('punkt_tab')

def read_transcript(file_path):
    """Return the UTF-8 text of file_path, or None (after printing the error) on failure."""
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            return source.read()
    except Exception as exc:
        print(f"Error reading file: {exc}")
    return None

def split_into_sentences(transcript):
    """Split the transcript text into sentences with NLTK's sent_tokenize."""
    sentences = sent_tokenize(transcript)
    return sentences

def calculate_cosine_similarity(sentences):
    """Return the pairwise cosine-similarity matrix of the sentences' TF-IDF vectors."""
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    return cosine_similarity(tfidf_matrix)

def group_sentences(sentences, cosine_sim, min_sentences=3, max_sentences=10, similarity_threshold=0.5):
    """Greedily group sentences that are similar to a seed sentence.

    Each not-yet-claimed sentence becomes a seed; later unclaimed sentences
    whose similarity to the seed exceeds similarity_threshold join its group
    until max_sentences is reached. Groups smaller than min_sentences are
    discarded, but their sentences remain claimed.
    """
    total = len(sentences)
    claimed = set()
    groups = []

    for seed in range(total):
        if seed in claimed:
            continue
        claimed.add(seed)
        members = [sentences[seed]]

        for other in range(seed + 1, total):
            # Skip sentences already claimed or not similar enough to the seed
            if other in claimed or not cosine_sim[seed][other] > similarity_threshold:
                continue
            members.append(sentences[other])
            claimed.add(other)
            if len(members) >= max_sentences:
                break

        if len(members) >= min_sentences:
            groups.append(members)

    return groups

def process_transcript(file_path, similarity_threshold=0.6):
    """Read a transcript file and return its groups of similar sentences.

    Args:
        file_path: Path of the transcript text file.
        similarity_threshold: Threshold forwarded to group_sentences.

    Returns:
        List of sentence groups; empty when the file could not be read or
        contains no sentences.
    """
    transcript = read_transcript(file_path)
    if transcript is None:
        return []  # Return an empty list if the file couldn't be read

    sentences = split_into_sentences(transcript)
    if not sentences:
        # Nothing to vectorize; the TF-IDF vectorizer raises on empty input
        return []

    cosine_sim = calculate_cosine_similarity(sentences)
    grouped_sentences = group_sentences(sentences, cosine_sim, similarity_threshold=similarity_threshold)
    return grouped_sentences

# Transcript to process; the path is hard-coded, replace it with your actual file path.
file_path = '/content/transcript (5).txt'
grouped_sentences = process_transcript(file_path)

# Print every group under a 1-based "Segment N:" header; an empty result means
# the file could not be read or no group reached the minimum size.
if grouped_sentences:
    for i, group in enumerate(grouped_sentences):
        print(f"Segment {i + 1}:")
        for sentence in group:
            print(sentence)
        print("\n")
else:
    print("No sentences were processed. Check the file path and file content.")

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Download NLTK data. 'punkt_tab' is fetched as well, as in the earlier cells:
# newer NLTK releases load the sentence tokenizer from it.
nltk.download('punkt')
nltk.download('punkt_tab')

def read_transcript(file_path):
    """Return the UTF-8 text of file_path, or None (after printing the error) on failure."""
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            return source.read()
    except Exception as exc:
        print(f"Error reading file: {exc}")
    return None

def split_into_sentences(transcript):
    """Split the transcript text into sentences with NLTK's sent_tokenize."""
    sentences = sent_tokenize(transcript)
    return sentences

def calculate_cosine_similarity(sentences):
    """Return the pairwise cosine-similarity matrix of the sentences' TF-IDF vectors."""
    tfidf_matrix = TfidfVectorizer().fit_transform(sentences)
    return cosine_similarity(tfidf_matrix)

def group_sentences(sentences, cosine_sim, min_sentences=3, max_sentences=10, similarity_threshold=0.5):
    """Greedily group sentences that are similar to a seed sentence.

    Each not-yet-claimed sentence becomes a seed; later unclaimed sentences
    whose similarity to the seed exceeds similarity_threshold join its group
    until max_sentences is reached. Groups smaller than min_sentences are
    discarded, but their sentences remain claimed.
    """
    total = len(sentences)
    claimed = set()
    groups = []

    for seed in range(total):
        if seed in claimed:
            continue
        claimed.add(seed)
        members = [sentences[seed]]

        for other in range(seed + 1, total):
            # Skip sentences already claimed or not similar enough to the seed
            if other in claimed or not cosine_sim[seed][other] > similarity_threshold:
                continue
            members.append(sentences[other])
            claimed.add(other)
            if len(members) >= max_sentences:
                break

        if len(members) >= min_sentences:
            groups.append(members)

    return groups

def process_transcript(file_path):
    """Read a transcript file and group its sentences by TF-IDF cosine similarity.

    Args:
        file_path: Path of the transcript text file.

    Returns:
        The list of sentence groups from group_sentences() (threshold 0.6), or
        an empty list when the file cannot be read or contains no sentences.
    """
    transcript = read_transcript(file_path)
    if transcript is None:
        return []  # Return an empty list if the file couldn't be read

    sentences = split_into_sentences(transcript)
    if not sentences:
        # An empty corpus makes TfidfVectorizer.fit_transform raise ValueError,
        # so report "nothing to group" the same way as an unreadable file.
        return []

    cosine_sim = calculate_cosine_similarity(sentences)
    grouped_sentences = group_sentences(sentences, cosine_sim, similarity_threshold=0.6)  # Adjusted threshold
    return grouped_sentences

# Path of the transcript to segment -- replace with your actual file path
file_path = '/content/transcript (5).txt'
grouped_sentences = process_transcript(file_path)

# Print every kept group as a numbered segment, one sentence per line.
if grouped_sentences:
    for i, group in enumerate(grouped_sentences):
        print(f"Segment {i + 1}:")
        for sentence in group:
            print(sentence)
        print("\n")
else:
    # process_transcript() returned an empty list.
    print("No sentences were processed. Check the file path and file content.")

In [None]:
# Cosine similarity approach

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download NLTK data
nltk.download('punkt')

def read_transcript(file_path):
    """Return the whole contents of a UTF-8 text file.

    Args:
        file_path: Path of the transcript file to read.

    Returns:
        The file contents as a single string, or None when reading failed
        (the error is printed rather than raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as err:
        print(f"Error reading file: {err}")
    return None

def split_into_sentences(transcript):
    """Split the transcript text into a list of sentences using NLTK's sent_tokenize."""
    sentences = sent_tokenize(transcript)
    return sentences

def calculate_cosine_similarity(sentences):
    """Compute pairwise cosine similarity between the sentences' TF-IDF vectors.

    Returns the matrix produced by cosine_similarity(); entry [i][j] compares
    sentence i with sentence j.
    """
    return cosine_similarity(TfidfVectorizer().fit_transform(sentences))

def save_cosine_similarity_to_csv(cosine_sim, sentences, output_file):
    """Write the similarity matrix to CSV with the sentences as row and column labels.

    Args:
        cosine_sim: Square similarity matrix, index-aligned with `sentences`.
        sentences: Sentence strings used verbatim as labels.
        output_file: Destination CSV path.
    """
    labelled_matrix = pd.DataFrame(data=cosine_sim, index=sentences, columns=sentences)
    labelled_matrix.to_csv(output_file)
    print(f"Cosine similarity matrix saved to {output_file}")

def process_transcript(file_path, output_file):
    """Read a transcript, list its sentences, and export their similarity matrix to CSV.

    Args:
        file_path: Path of the transcript text file.
        output_file: Destination CSV path for the cosine-similarity matrix.

    Returns:
        None. Nothing is written when the file cannot be read or is empty.
    """
    # Read the transcript
    transcript = read_transcript(file_path)
    if transcript is None:
        return

    # Split the transcript into sentences
    sentences = split_into_sentences(transcript)
    if not sentences:
        # An empty corpus makes TfidfVectorizer.fit_transform raise ValueError.
        print("No sentences found in the transcript; nothing to process.")
        return

    print("Sentences extracted:")
    for i, sentence in enumerate(sentences):
        print(f"{i + 1}: {sentence}")

    # Calculate cosine similarity between sentences
    cosine_sim = calculate_cosine_similarity(sentences)

    # Save the cosine similarity matrix to a CSV file
    save_cosine_similarity_to_csv(cosine_sim, sentences, output_file)

# Input and output locations (adjust as needed)
file_path = '/content/transcript (5).txt'  # Transcript to analyse
output_file = '/content/cosine_similarity_matrix.csv'  # Destination CSV path

# Print the sentences and write the sentence-labelled similarity matrix to CSV
process_transcript(file_path, output_file)

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download NLTK data
nltk.download('punkt')

def read_transcript(file_path):
    """Return the whole contents of a UTF-8 text file.

    Args:
        file_path: Path of the transcript file to read.

    Returns:
        The file contents as a single string, or None when reading failed
        (the error is printed rather than raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as err:
        print(f"Error reading file: {err}")
    return None

def split_into_sentences(transcript):
    """Split the transcript text into a list of sentences using NLTK's sent_tokenize."""
    sentences = sent_tokenize(transcript)
    return sentences

def calculate_cosine_similarity(sentences):
    """Compute pairwise cosine similarity between the sentences' TF-IDF vectors.

    Returns the matrix produced by cosine_similarity(); entry [i][j] compares
    sentence i with sentence j.
    """
    return cosine_similarity(TfidfVectorizer().fit_transform(sentences))

def save_cosine_similarity_to_csv(cosine_sim, sentences, output_file):
    """Write the similarity matrix to CSV using short S1, S2, ... labels.

    Args:
        cosine_sim: Square similarity matrix, index-aligned with `sentences`.
        sentences: The sentences; only their count is used, to build the labels.
        output_file: Destination CSV path.
    """
    # One label per sentence, numbered from 1: S1, S2, S3, ...
    labels = [f"S{position}" for position, _ in enumerate(sentences, start=1)]

    pd.DataFrame(data=cosine_sim, index=labels, columns=labels).to_csv(output_file)
    print(f"Cosine similarity matrix saved to {output_file}")

def process_transcript(file_path, output_file):
    """Read a transcript, list its sentences as S1, S2, ..., and export their similarity matrix.

    Args:
        file_path: Path of the transcript text file.
        output_file: Destination CSV path for the cosine-similarity matrix.

    Returns:
        None. Nothing is written when the file cannot be read or is empty.
    """
    # Read the transcript
    transcript = read_transcript(file_path)
    if transcript is None:
        return

    # Split the transcript into sentences
    sentences = split_into_sentences(transcript)
    if not sentences:
        # An empty corpus makes TfidfVectorizer.fit_transform raise ValueError.
        print("No sentences found in the transcript; nothing to process.")
        return

    print("Sentences extracted:")
    for i, sentence in enumerate(sentences):
        print(f"S{i + 1}: {sentence}")

    # Calculate cosine similarity between sentences
    cosine_sim = calculate_cosine_similarity(sentences)

    # Save the cosine similarity matrix to a CSV file
    save_cosine_similarity_to_csv(cosine_sim, sentences, output_file)

# Input and output locations (adjust as needed)
file_path = '/content/transcript (5).txt'  # Transcript to analyse
output_file = '/content/cosine_similarity_matrix1.csv'  # Destination CSV path

# Print the sentences and write the S-labelled similarity matrix to CSV
process_transcript(file_path, output_file)

In [None]:
# Load the similarity matrix CSV and display it as the cell output.
# The label column is read as a regular column because no index_col is given.
df = pd.read_csv('/content/cosine_similarity_matrix1.csv')
df

In [None]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the similarity matrix CSV
file_path = '/content/cosine_similarity_matrix1.csv'
data = pd.read_csv(file_path)

# Use the unnamed first column (the row labels) as the index
data.set_index('Unnamed: 0', inplace=True)

# Pairwise correlation between the columns of the loaded matrix.
# NOTE(review): the CSV name suggests `data` already holds cosine similarities,
# so this is a correlation of similarity columns -- confirm that is intended
# rather than plotting `data` directly.
correlation_matrix = data.corr()

# Print summary statistics of the correlation matrix
print("Correlation Matrix Summary:")
print(correlation_matrix.describe())

# Visualize the correlation matrix using a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download NLTK data (each resource is requested once; the duplicate 'punkt' call was redundant)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
def read_transcript(file_path):
    """Return the whole contents of a UTF-8 text file.

    Args:
        file_path: Path of the transcript file to read.

    Returns:
        The file contents as a single string, or None when reading failed
        (the error is printed rather than raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as err:
        print(f"Error reading file: {err}")
    return None

def split_into_sentences(transcript):
    """Split the transcript text into a list of sentences using NLTK's sent_tokenize."""
    sentences = sent_tokenize(transcript)
    return sentences

def calculate_cosine_similarity(sentences):
    """Compute pairwise cosine similarity between the sentences' TF-IDF vectors.

    Returns the matrix produced by cosine_similarity(); entry [i][j] compares
    sentence i with sentence j.
    """
    return cosine_similarity(TfidfVectorizer().fit_transform(sentences))

def segment_sentences(sentences, cosine_sim, threshold=0.5):
    """Partition the sentences into segments around greedy seed sentences.

    Walking the sentences in order, each one not yet assigned starts a new
    segment and pulls in every later unassigned sentence whose similarity to
    that seed is at least `threshold`.

    Args:
        sentences: Sequence of sentence strings.
        cosine_sim: Square, index-aligned similarity matrix (cosine_sim[i][j]).
        threshold: Minimum similarity (inclusive) for joining a seed's segment.

    Returns:
        A list of segments (lists of sentences); every sentence appears exactly once.
    """
    total = len(sentences)
    assigned = set()
    segments = []

    for seed in range(total):
        if seed in assigned:
            continue
        assigned.add(seed)
        members = [seed]

        # Collect later sentences that are close enough to the seed.
        for other in range(seed + 1, total):
            if other not in assigned and cosine_sim[seed][other] >= threshold:
                assigned.add(other)
                members.append(other)

        segments.append([sentences[k] for k in members])

    return segments

def print_segments(segments):
    """Print each segment as a numbered heading followed by its bulleted sentences."""
    for number, segment in enumerate(segments, 1):
        lines = [f"\nSegment {number}:"]
        lines.extend(f" - {sentence}" for sentence in segment)
        print("\n".join(lines))

def process_transcript(file_path, threshold=0.5):
    """Read a transcript, segment its sentences by similarity, and print the result.

    Args:
        file_path: Path of the transcript text file.
        threshold: Minimum cosine similarity for a sentence to join a segment.

    Returns:
        None. Nothing is segmented when the file cannot be read or is empty.
    """
    # Read the transcript
    transcript = read_transcript(file_path)
    if transcript is None:
        return

    # Split the transcript into sentences
    sentences = split_into_sentences(transcript)
    if not sentences:
        # An empty corpus makes TfidfVectorizer.fit_transform raise ValueError.
        print("No sentences found in the transcript; nothing to process.")
        return

    print("Sentences extracted:")
    for i, sentence in enumerate(sentences):
        print(f"S{i + 1}: {sentence}")

    # Calculate cosine similarity between sentences
    cosine_sim = calculate_cosine_similarity(sentences)

    # Segment sentences based on similarity
    segments = segment_sentences(sentences, cosine_sim, threshold)

    # Print segmented sentences
    print("\nSegmented Sentences:")
    print_segments(segments)

# File path to the transcript
file_path = '/content/transcript (8).txt'  # Replace with your file path

# Process the transcript and print segments with a similarity threshold of 0.1
process_transcript(file_path, threshold=0.1)

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download NLTK data
nltk.download('punkt')

def read_transcript(file_path):
    """Return the whole contents of a UTF-8 text file.

    Args:
        file_path: Path of the transcript file to read.

    Returns:
        The file contents as a single string, or None when reading failed
        (the error is printed rather than raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as err:
        print(f"Error reading file: {err}")
    return None

def split_into_sentences(transcript):
    """Split the transcript text into a list of sentences using NLTK's sent_tokenize."""
    sentences = sent_tokenize(transcript)
    return sentences

def calculate_cosine_similarity(sentences):
    """Compute pairwise cosine similarity between the sentences' TF-IDF vectors.

    Returns the matrix produced by cosine_similarity(); entry [i][j] compares
    sentence i with sentence j.
    """
    return cosine_similarity(TfidfVectorizer().fit_transform(sentences))

def segment_sentences(sentences, cosine_sim, threshold=0.5):
    """Partition the sentences into segments around greedy seed sentences.

    Args:
        sentences: Sequence of sentence strings.
        cosine_sim: Square, index-aligned similarity matrix (cosine_sim[i][j]).
        threshold: Minimum similarity (inclusive) for joining a seed's segment.

    Returns:
        A list of segments (lists of sentences); every sentence appears exactly once.
    """
    claimed = [False] * len(sentences)
    segments = []

    for seed, seed_sentence in enumerate(sentences):
        if claimed[seed]:
            continue
        claimed[seed] = True
        current = [seed_sentence]

        # Later, still-free sentences join when they are close enough to the seed.
        for later in range(seed + 1, len(sentences)):
            joins = (not claimed[later]) and cosine_sim[seed][later] >= threshold
            if joins:
                claimed[later] = True
                current.append(sentences[later])

        segments.append(current)

    return segments

def print_segments(segments):
    """Print each segment as a numbered heading followed by its bulleted sentences."""
    for number, segment in enumerate(segments, 1):
        lines = [f"\nSegment {number}:"]
        lines.extend(f" - {sentence}" for sentence in segment)
        print("\n".join(lines))

# 🆕 New Function to Save Segments to a Text File
def save_segments_to_file(segments, output_file):
    """Write the segments to a UTF-8 text file.

    Each segment becomes a "Segment N:" heading, one " - sentence" line per
    sentence, and a blank separator line. Errors are printed, not raised.

    Args:
        segments: List of segments, each a list of sentence strings.
        output_file: Destination text file path.
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as out:
            for number, segment in enumerate(segments, start=1):
                block = [f"Segment {number}:\n"]
                block += [f" - {sentence}\n" for sentence in segment]
                block.append("\n")
                out.writelines(block)
        print(f"Segmented output saved to {output_file}")
    except Exception as err:
        print(f"Error saving file: {err}")

def process_transcript(file_path, threshold=0.5, output_file='segmented_output.txt'):
    """Read a transcript, segment its sentences by similarity, then print and save the segments.

    Args:
        file_path: Path of the transcript text file.
        threshold: Minimum cosine similarity for a sentence to join a segment.
        output_file: Destination text file for the segmented output.

    Returns:
        None. Nothing is written when the file cannot be read or is empty.
    """
    # Read the transcript
    transcript = read_transcript(file_path)
    if transcript is None:
        return

    # Split the transcript into sentences
    sentences = split_into_sentences(transcript)
    if not sentences:
        # An empty corpus makes TfidfVectorizer.fit_transform raise ValueError.
        print("No sentences found in the transcript; nothing to process.")
        return

    print("Sentences extracted:")
    for i, sentence in enumerate(sentences):
        print(f"S{i + 1}: {sentence}")

    # Calculate cosine similarity between sentences
    cosine_sim = calculate_cosine_similarity(sentences)

    # Segment sentences based on similarity
    segments = segment_sentences(sentences, cosine_sim, threshold)

    # Print segmented sentences
    print("\nSegmented Sentences:")
    print_segments(segments)

    # Save segments to a text file
    save_segments_to_file(segments, output_file)

# Input and output locations
file_path = '/content/transcript (5).txt'  # Replace with your file path
output_file = '/content/segmented_output.txt'  # Output text file path

# Segment the transcript with a similarity threshold of 0.15 and save the result
process_transcript(file_path, threshold=0.15, output_file=output_file)

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

def read_transcript(file_path):
    """Return the whole contents of a UTF-8 text file.

    Args:
        file_path: Path of the transcript file to read.

    Returns:
        The file contents as a single string, or None when reading failed
        (the error is printed rather than raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as err:
        print(f"Error reading file: {err}")
    return None

def split_into_sentences(transcript):
    """Split the transcript text into a list of sentences using NLTK's sent_tokenize."""
    sentences = sent_tokenize(transcript)
    return sentences

def calculate_cosine_similarity(sentences):
    """Compute pairwise cosine similarity between the sentences' TF-IDF vectors.

    Returns the matrix produced by cosine_similarity(); entry [i][j] compares
    sentence i with sentence j.
    """
    return cosine_similarity(TfidfVectorizer().fit_transform(sentences))

def segment_sentences(sentences, cosine_sim, threshold=0.5):
    """Partition the sentences into segments around greedy seed sentences.

    Each unassigned sentence, in order, seeds a segment and absorbs every later
    unassigned sentence whose similarity to it is at least `threshold`.

    Returns:
        A list of segments (lists of sentences); every sentence appears exactly once.
    """
    total = len(sentences)
    assigned = set()
    segments = []
    for seed in range(total):
        if seed in assigned:
            continue
        assigned.add(seed)
        members = [seed]
        for other in range(seed + 1, total):
            if other not in assigned and cosine_sim[seed][other] >= threshold:
                assigned.add(other)
                members.append(other)
        segments.append([sentences[k] for k in members])
    return segments

def remove_stopwords(segment):
    """Return the lower-cased content words of all sentences in a segment.

    Args:
        segment: Iterable of sentence strings.

    Returns:
        A flat list of lower-cased tokens that are alphanumeric and not in
        NLTK's English stop-word list, in order of appearance.
    """
    stop_words = set(stopwords.words('english'))
    return [
        token.lower()
        for sentence in segment
        for token in word_tokenize(sentence)
        if token.isalnum() and token.lower() not in stop_words
    ]

def find_keywords(filtered_words):
    """Pick the pair of words with the highest TF-IDF cosine similarity.

    Every word is vectorised as its own one-word document and all pairs are
    compared; the first pair with the strictly highest similarity wins.

    NOTE(review): with one-word documents two entries presumably only score
    above 0 when they are the same token, so the chosen "pair" tends to be a
    repeated word -- confirm this is the intended notion of a keyword.

    Args:
        filtered_words: List of word strings.

    Returns:
        (word1, word2, similarity), or (None, None, 0) when fewer than two
        words are given, no pair scores above 0, or no word can be vectorised.
    """
    if len(filtered_words) < 2:
        return None, None, 0  # Not enough words for comparison

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(filtered_words)
    except ValueError:
        # The default token pattern ignores single-character tokens; when every
        # word is dropped the vocabulary is empty and fit_transform raises.
        return None, None, 0
    cosine_sim = cosine_similarity(tfidf_matrix)

    max_sim = 0
    keyword_pair = (None, None)
    for i in range(len(filtered_words)):
        for j in range(i + 1, len(filtered_words)):
            if cosine_sim[i][j] > max_sim:
                max_sim = cosine_sim[i][j]
                keyword_pair = (filtered_words[i], filtered_words[j])
    return keyword_pair[0], keyword_pair[1], max_sim

def save_segments_to_csv(segments, output_file):
    """Write one worksheet per segment holding that segment's keyword pair.

    Despite the name, the output goes through pd.ExcelWriter, i.e. it is an
    Excel workbook. A segment without a keyword pair gets a sheet with
    headers only.

    Args:
        segments: List of segments, each a list of sentence strings.
        output_file: Destination workbook path.
    """
    with pd.ExcelWriter(output_file) as writer:
        for idx, segment in enumerate(segments, start=1):
            word1, word2, max_sim = find_keywords(remove_stopwords(segment))
            pair_found = word1 and word2

            # One row when a pair was found, otherwise empty columns.
            sheet_columns = {
                'Word 1': [word1] if word1 else [],
                'Word 2': [word2] if word2 else [],
                'Cosine Similarity': [max_sim] if pair_found else [],
                'Keyword': [f"{word1}, {word2}"] if pair_found else [],
            }

            # Each segment is stored on its own sheet.
            pd.DataFrame(sheet_columns).to_excel(writer, sheet_name=f'Segment {idx}', index=False)

    print(f"Segmented keywords saved to {output_file}")

def process_transcript(file_path, threshold=0.5, output_file='segmented_keywords.xlsx'):
    """Segment a transcript, save per-segment keywords to a workbook, and print them.

    Args:
        file_path: Path of the transcript text file.
        threshold: Minimum cosine similarity for a sentence to join a segment.
        output_file: Destination workbook path for the keyword sheets.

    Returns:
        None. Nothing is written when the file cannot be read or is empty.
    """
    transcript = read_transcript(file_path)
    if transcript is None:
        return

    sentences = split_into_sentences(transcript)
    if not sentences:
        # An empty corpus makes TfidfVectorizer.fit_transform raise ValueError.
        print("No sentences found in the transcript; nothing to process.")
        return

    cosine_sim = calculate_cosine_similarity(sentences)
    segments = segment_sentences(sentences, cosine_sim, threshold)

    save_segments_to_csv(segments, output_file)

    # Keywords are recomputed here for display; save_segments_to_csv() does not return them.
    print("\nChosen Keywords for Each Segment:")
    for idx, segment in enumerate(segments, start=1):
        filtered_words = remove_stopwords(segment)
        word1, word2, max_sim = find_keywords(filtered_words)
        if word1 and word2:
            print(f"Segment {idx}: Keywords = {word1}, {word2} (Similarity: {max_sim:.2f})")
        else:
            print(f"Segment {idx}: Not enough data for keywords")

# Input and output locations
file_path = '/content/transcript (5).txt'  # Replace with your file path
output_file = '/content/segmented_keywords.xlsx'  # Output Excel file path

# Segment with a similarity threshold of 0.1 and save/print the keywords
process_transcript(file_path, threshold=0.1, output_file=output_file)


In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

def read_segmented_file(file_path):
    """Read a text file and split it into segments at blank lines.

    Args:
        file_path: Path of the segmented text file (UTF-8).

    Returns:
        A list of non-empty, whitespace-stripped chunks separated by "\\n\\n"
        in the file, or None when reading failed (the error is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
            stripped_chunks = (chunk.strip() for chunk in raw_text.split("\n\n"))
            segments = [chunk for chunk in stripped_chunks if chunk]
        return segments
    except Exception as err:
        print(f"Error reading file: {err}")
        return None

def remove_stopwords(segment):
    """Return the lower-cased content words of one segment string.

    Tokens are kept when they are alphanumeric and not in NLTK's English
    stop-word list; order of appearance is preserved.
    """
    stop_words = set(stopwords.words('english'))
    kept = []
    for token in word_tokenize(segment):
        if not token.isalnum():
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return kept

def calculate_word_cosine_similarity(filtered_words):
    """Compare all words pairwise by TF-IDF cosine similarity and pick the best pair.

    Every word is vectorised as its own one-word document; the first pair with
    the strictly highest similarity is reported.

    Args:
        filtered_words: List of word strings.

    Returns:
        (word1, word2, similarity, matrix) where matrix is the full pairwise
        similarity matrix, or (None, None, 0, None) when fewer than two words
        are given or no word can be vectorised. word1/word2 are None when no
        pair scores above 0.
    """
    if len(filtered_words) < 2:
        return None, None, 0, None  # Not enough words for comparison

    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform(filtered_words)
    except ValueError:
        # The default token pattern ignores single-character tokens; when every
        # word is dropped the vocabulary is empty and fit_transform raises.
        return None, None, 0, None
    cosine_sim = cosine_similarity(tfidf_matrix)

    max_sim = 0
    keyword_pair = (None, None)
    for i in range(len(filtered_words)):
        for j in range(i + 1, len(filtered_words)):
            if cosine_sim[i][j] > max_sim:
                max_sim = cosine_sim[i][j]
                keyword_pair = (filtered_words[i], filtered_words[j])

    return keyword_pair[0], keyword_pair[1], max_sim, cosine_sim

def save_word_similarity_to_excel(cosine_sim, filtered_words, sheet_name, writer):
    """Write one segment's word-similarity matrix to a worksheet.

    Args:
        cosine_sim: Square similarity matrix for `filtered_words`, or None.
        filtered_words: Words used as row and column labels.
        sheet_name: Name of the worksheet to create.
        writer: Open pd.ExcelWriter to write into.

    When `cosine_sim` is None a single-cell placeholder sheet is written instead.
    """
    if cosine_sim is None:
        placeholder = pd.DataFrame(columns=['Info'])
        placeholder.loc[0] = ["Not enough words for comparison"]
        placeholder.to_excel(writer, sheet_name=sheet_name, index=False)
        return

    word_matrix = pd.DataFrame(cosine_sim, index=filtered_words, columns=filtered_words)
    word_matrix.to_excel(writer, sheet_name=sheet_name)

def save_keywords_summary_to_csv(keywords, output_file):
    """Save the per-segment keyword rows to a CSV file without an index column.

    Args:
        keywords: Rows of [segment, word 1, word 2, similarity, keyword].
        output_file: Destination CSV path.
    """
    header = ['Segment', 'Word 1', 'Word 2', 'Cosine Similarity', 'Keyword']
    pd.DataFrame(keywords, columns=header).to_csv(output_file, index=False)
    print(f"Keywords summary saved to {output_file}")

def process_segments(file_path, output_excel='segment_word_similarity.xlsx', summary_csv='segment_keywords_summary.csv'):
    """Compute word-level similarity and a keyword pair for every segment in a file.

    Writes one worksheet per segment to `output_excel`, a one-row-per-segment
    summary to `summary_csv`, and prints the chosen keywords.

    Args:
        file_path: Path of the segmented text file.
        output_excel: Workbook path for the per-segment similarity matrices.
        summary_csv: CSV path for the keyword summary.

    Returns:
        None. Nothing is written when the file cannot be read or has no segments.
    """
    segments = read_segmented_file(file_path)
    if segments is None:
        return
    if not segments:
        # With no segments the workbook would contain no sheets, which the
        # Excel writer can reject when the file is saved.
        print("No segments found in the input file; nothing to process.")
        return

    keywords_summary = []

    with pd.ExcelWriter(output_excel) as writer:
        for idx, segment in enumerate(segments, start=1):
            # Remove stopwords
            filtered_words = remove_stopwords(segment)

            # Calculate cosine similarity between words
            word1, word2, max_sim, cosine_sim = calculate_word_cosine_similarity(filtered_words)

            # Save word similarity matrix for each segment to Excel
            save_word_similarity_to_excel(cosine_sim, filtered_words, f'Segment {idx}', writer)

            # Prepare summary data
            if word1 and word2:
                keyword = f"{word1}, {word2}"
                keywords_summary.append([f"Segment {idx}", word1, word2, max_sim, keyword])
            else:
                keywords_summary.append([f"Segment {idx}", "N/A", "N/A", 0, "N/A"])

    # Save summary of keywords to CSV
    save_keywords_summary_to_csv(keywords_summary, summary_csv)

    # Print chosen keywords for each segment
    print("\nChosen Keywords for Each Segment:")
    for row in keywords_summary:
        print(f"{row[0]}: Keywords = {row[4]} (Similarity: {row[3]:.2f})")

# Input and output locations
segmented_file_path = '/content/segmented_output.txt'  # Replace with your segmented text file path
output_excel = '/content/segment_word_similarity.xlsx'  # Excel file for word similarity matrices
summary_csv = '/content/segment_keywords_summary.csv'  # CSV file for keywords summary

# Build the per-segment similarity sheets and the keyword summary
process_segments(segmented_file_path, output_excel=output_excel, summary_csv=summary_csv)


In [None]:
import nltk
from nltk.tokenize import word_tokenize
import re
import pandas as pd

# Download NLTK data if not already available
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# Lower-case filler words and phrases that clean_text() filters out of each segment
FILLER_WORDS = {'umm', 'uh', 'oh', 'okay', 'like', 'you know', 'actually', 'basically', 'literally', 'well', 'so', 'just', 'i mean', 'sort of', 'kind of'}

def read_segmented_file(file_path):
    """Read a UTF-8 text file and split it into segments at blank lines.

    Returns the non-empty, whitespace-stripped chunks separated by double
    newlines, or None when reading failed (the error is printed).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            raw_text = handle.read()
            # Chunks are separated by double newlines; drop the empty ones.
            stripped_chunks = (chunk.strip() for chunk in raw_text.split("\n\n"))
            segments = [chunk for chunk in stripped_chunks if chunk]
        return segments
    except Exception as err:
        print(f"Error reading file: {err}")
        return None

def clean_text(segment):
    """Remove dates, filler words/phrases, and stop words from a segment.

    Args:
        segment: The segment text.

    Returns:
        The remaining alphanumeric tokens, lower-cased and joined by spaces.
    """
    stop_words = set(stopwords.words('english'))

    # Remove dates (e.g., 12/03/2023, March 12, 2023)
    segment = re.sub(r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b\s\d{1,2},?\s?\d{2,4}?)\b', '', segment)

    # Multi-word fillers ("you know", "kind of", ...) can never equal a single
    # token, so the per-token filter below would never remove them; strip them
    # as whole phrases before tokenizing.
    for phrase in FILLER_WORDS:
        if ' ' in phrase:
            phrase_pattern = r'\b' + r'\s+'.join(re.escape(part) for part in phrase.split()) + r'\b'
            segment = re.sub(phrase_pattern, ' ', segment, flags=re.IGNORECASE)

    # Tokenize and clean words
    words = word_tokenize(segment)
    filtered_words = [
        word.lower() for word in words
        if word.isalnum() and word.lower() not in stop_words and word.lower() not in FILLER_WORDS
    ]
    return ' '.join(filtered_words)

def save_to_csv(segments, cleaned_segments, output_file):
    """Write original and cleaned segments side by side to a CSV file (no index column)."""
    paired_columns = {
        'Original Segment': segments,
        'Cleaned Segment (No Stopwords or Fillers)': cleaned_segments,
    }
    pd.DataFrame(paired_columns).to_csv(output_file, index=False)
    print(f"Cleaned segments saved to {output_file}")

def process_segments(file_path, output_csv='cleaned_segments.csv'):
    """Clean every segment of a segmented text file, save the result, and print a preview.

    Args:
        file_path: Path of the segmented text file.
        output_csv: CSV path for the original/cleaned segment pairs.
    """
    segments = read_segmented_file(file_path)
    if segments is None:
        return

    cleaned_segments = list(map(clean_text, segments))
    save_to_csv(segments, cleaned_segments, output_csv)

    # Show each original segment next to its cleaned version
    print("\nPreview of Cleaned Segments:")
    for number, original in enumerate(segments, start=1):
        cleaned = cleaned_segments[number - 1]
        print(f"\nSegment {number} (Original): {original}")
        print(f"Segment {number} (Cleaned): {cleaned}")

# Input and output locations
segmented_file_path = '/content/segmented_output.txt'  # Replace with your segmented text file path
output_csv = '/content/cleaned_segments.csv'  # CSV file for cleaned segments

# Clean the segments, save them, and print a preview
process_segments(segmented_file_path, output_csv=output_csv)


In [None]:
import json
import re

def load_transcript_from_file(file_path):
    """
    Loads the transcript from a file.

    Files whose path ends with ".json" are parsed as JSON and returned as-is.
    Any other file is read line by line; lines of the form
    "[start] - [end]: [text]" are parsed and all other lines are ignored.
    Timestamps may be integers ("0 - 30: ...") or decimals ("0.0 - 30.5: ...").

    Args:
        file_path (str): Path to the transcript file.
    Returns:
        list: List of dictionaries with 'start', 'end', and 'text' keys
        (start/end as floats for text files), or None if loading failed.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            if file_path.endswith(".json"):
                transcript = json.load(file)
            else:
                # For plain text files, assume each line is in the format: [start] - [end]: [text]
                # The fractional part is optional so integer-second timestamps are kept too.
                line_pattern = re.compile(r"(\d+(?:\.\d+)?) - (\d+(?:\.\d+)?): (.+)")
                transcript = []
                for line in file:
                    match = line_pattern.match(line.strip())
                    if match:
                        start, end, text = match.groups()
                        transcript.append({
                            "start": float(start),
                            "end": float(end),
                            "text": text
                        })
            return transcript
    except Exception as e:
        print(f"Error loading transcript: {e}")
        return None

def save_transcript_with_timestamps(transcript, output_path="transcript_with_timestamps.txt"):
    """
    Writes every transcript segment as a "start - end: text" line to a UTF-8 text file.
    Args:
        transcript (list): List of dictionaries with 'start', 'end', and 'text'.
        output_path (str): The path to save the output file.
    """
    formatted_lines = (
        f"{entry['start']} - {entry['end']}: {entry['text']}\n" for entry in transcript
    )
    with open(output_path, "w", encoding="utf-8") as out:
        out.writelines(formatted_lines)
    print(f"Transcript with timestamps saved to {output_path}")

# Example usage
if __name__ == "__main__":
    # Ask the user for the transcript file path
    transcript_path = input("Enter the path to the transcript file: ")
    output_path = input("Enter the path to save the transcript with timestamps (default: transcript_with_timestamps.txt): ") or "transcript_with_timestamps.txt"

    # Load the transcript from the file
    transcript = load_transcript_from_file(transcript_path)
    if not transcript:
        # Report the failure and skip the rest of the cell. exit() is meant for
        # the interactive interpreter and would stop the notebook session
        # instead of just ending this cell.
        print("Failed to load transcript. Exiting.")
    else:
        # Print the transcript with timestamps
        print("\nTranscript with Timestamps:")
        for segment in transcript:
            print(f"{segment['start']} - {segment['end']}: {segment['text']}")

        # Save the transcript with timestamps to a file
        save_transcript_with_timestamps(transcript, output_path)

In [None]:
import re
import urllib.parse
import requests
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import speech_recognition as sr
from pydub import AudioSegment
import os
import yt_dlp
def extract_video_id(video_url):
    """
    Extracts the YouTube video ID from various URL formats.

    A "v" query parameter wins; otherwise the ID is taken from a youtu.be,
    /embed/ or /shorts/ path. Returns None when neither form is found.
    """
    query = urllib.parse.parse_qs(urllib.parse.urlparse(video_url).query)
    ids_from_query = query.get("v")
    if ids_from_query:
        return ids_from_query[0]

    path_match = re.search(r"(youtu\.be/|youtube\.com/embed/|youtube\.com/shorts/)([\w-]+)", video_url)
    return path_match.group(2) if path_match else None

def download_audio(video_url, cookiefile='/content/cookies (2).txt'):
    """
    Downloads the audio using yt-dlp with cookies and returns the file path.

    Args:
        video_url (str): URL of the video to download.
        cookiefile (str): Path of the exported cookies file handed to yt-dlp.
    Returns:
        str: "audio.mp3" on success, or a message starting with
        "Error downloading audio:" on failure (callers test for "Error").
    """
    try:
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': 'audio.%(ext)s',
            'cookiefile': cookiefile,  # Use the exported cookies
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Only the download side effect is needed; the returned info is not used.
            ydl.extract_info(video_url, download=True)
        return "audio.mp3"
    except Exception as e:
        return f"Error downloading audio: {str(e)}"

def convert_audio_to_wav(audio_file):
    """
    Converts the downloaded MP3 audio to WAV format using pydub.

    Returns "audio.wav" on success, or a message starting with
    "Error converting to WAV:" on failure.
    """
    wav_file = "audio.wav"
    try:
        mp3_audio = AudioSegment.from_mp3(audio_file)
        mp3_audio.export(wav_file, format="wav")
    except Exception as err:
        return f"Error converting to WAV: {str(err)}"
    return wav_file

def transcribe_audio(audio_path, chunk_length=30):
    """
    Splits audio into smaller chunks and transcribes each chunk separately.

    Each chunk is exported to a temporary "chunk.wav" and passed to
    recognizer.recognize_google(); the temporary file is removed afterwards,
    whether or not transcription succeeded.

    Args:
        audio_path (str): Path to the audio file.
        chunk_length (int): Length of each chunk in seconds (default: 30).
    Returns:
        list: List of dictionaries containing transcribed text and timestamps
        ('start' and 'end' in whole seconds). Chunks that cannot be understood
        get the text "[Unintelligible]".
        str: An error message if the speech recognition service request fails.
    """
    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_path)
    total_duration = len(audio) / 1000  # Convert to seconds
    transcribed_segments = []
    chunk_path = "chunk.wav"

    print("Transcribing audio in chunks...")

    try:
        # Split and transcribe audio in chunks
        for start in range(0, int(total_duration), chunk_length):
            end = min(start + chunk_length, int(total_duration))
            chunk = audio[start * 1000:end * 1000]  # Extract chunk in milliseconds
            chunk.export(chunk_path, format="wav")  # Save chunk temporarily

            with sr.AudioFile(chunk_path) as source:
                try:
                    audio_data = recognizer.record(source)
                    text = recognizer.recognize_google(audio_data)
                    transcribed_segments.append({
                        "start": start,
                        "end": end,
                        "text": text
                    })
                except sr.UnknownValueError:
                    transcribed_segments.append({
                        "start": start,
                        "end": end,
                        "text": "[Unintelligible]"
                    })
                except sr.RequestError as e:
                    return f"Error with the speech recognition service: {str(e)}"
    finally:
        # Clean up the temporary chunk file. It does not exist when the audio is
        # shorter than one second (no chunk is produced), and it must also be
        # removed on the early error return above.
        if os.path.exists(chunk_path):
            os.remove(chunk_path)

    return transcribed_segments

def get_transcript_unlisted(video_url):
    """
    Tries to fetch the transcript using youtube_transcript_api first,
    then falls back to downloading and transcribing audio if necessary.

    Args:
        video_url (str): URL of the YouTube video.
    Returns:
        list: Transcript segments with 'start', 'end' and 'text' keys.
        str: An error message when the URL is invalid or a fallback step fails.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        return "Invalid YouTube URL."

    # Try to fetch transcript using youtube_transcript_api
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Add 'end' time to each segment
        for segment in transcript:
            segment["end"] = segment["start"] + segment["duration"]
        return transcript  # Return transcript with timestamps
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop the cell instead of triggering the fallback.
        print("Transcript not available via API, attempting audio transcription...")

    # Download and transcribe audio if no transcript is available
    audio_file = download_audio(video_url)
    if "Error" in audio_file:
        return audio_file

    wav_file = None
    try:
        converted = convert_audio_to_wav(audio_file)
        if "Error" in converted:
            return converted
        wav_file = converted

        return transcribe_audio(wav_file)
    finally:
        # Cleanup temporary files, also when conversion or transcription fails
        for temp_path in (audio_file, wav_file):
            if temp_path and os.path.exists(temp_path):
                os.remove(temp_path)

def save_transcript_to_file(transcript, filename="transcript.txt"):
    """
    Saves the transcript to a UTF-8 text file.

    A list is written as one "start - end: text" line per segment; anything
    else is written to the file as-is.

    Args:
        transcript (list or str): The transcript to save.
        filename (str): The name of the output file.
    """
    with open(filename, "w", encoding="utf-8") as out:
        if not isinstance(transcript, list):
            out.write(transcript)
        else:
            out.writelines(
                f"{entry['start']} - {entry['end']}: {entry['text']}\n" for entry in transcript
            )
    print(f"Transcript saved to {filename}")

# Example usage
if __name__ == "__main__":
    video_url = input("Enter the YouTube video URL: ")
    # Result is either a list of timestamped segments or a message string
    transcript = get_transcript_unlisted(video_url)

    if isinstance(transcript, list):
        print("\nTranscript with Timestamps:")
        for segment in transcript:
            print(f"{segment['start']} - {segment['end']}: {segment['text']}")
    else:
        # Not a list: print the returned value as-is
        print("\nTranscript:\n", transcript)

    # Save transcript to a text file (message strings are saved too)
    save_transcript_to_file(transcript, "transcript.txt")

In [None]:
pip install yt-dlp

In [None]:
pip install youtube-transcript-api

In [None]:
pip install SpeechRecognition

In [None]:
pip install pydub

In [None]:
pip install pytube

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Download NLTK data
nltk.download('punkt')

def read_transcript(file_path):
    """Return the whole contents of a UTF-8 text file.

    Args:
        file_path: Path of the transcript file to read.

    Returns:
        The file contents as a single string, or None when reading failed
        (the error is printed rather than raised).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except Exception as err:
        print(f"Error reading file: {err}")
    return None

def split_into_sentences(transcript):
    """Split the transcript text into a list of sentences using NLTK's sent_tokenize."""
    sentences = sent_tokenize(transcript)
    return sentences

def calculate_cosine_similarity(sentences):
    """Compute pairwise cosine similarity between the sentences' TF-IDF vectors.

    Returns the matrix produced by cosine_similarity(); entry [i][j] compares
    sentence i with sentence j.
    """
    return cosine_similarity(TfidfVectorizer().fit_transform(sentences))

def segment_sentences(sentences, cosine_sim, threshold=0.5, min_sentences=5):
    """
    Group sentences by similarity, then merge groups that are too small.

    Grouping is greedy: each not-yet-grouped sentence starts a new group and
    pulls in every later, not-yet-grouped sentence whose similarity to that
    starting sentence is at least `threshold`. Groups are then concatenated in
    order until each merged segment holds at least `min_sentences` sentences;
    any leftover tail is appended to the last merged segment (or becomes the
    only segment when nothing reached the minimum).

    Args:
        sentences: Sequence of sentences.
        cosine_sim: Pairwise similarity lookup, indexed as cosine_sim[i][j].
        threshold (float): Minimum similarity for joining a group.
        min_sentences (int): Minimum size of a merged segment.

    Returns:
        list: A list of segments, each a list of sentences.
    """
    total = len(sentences)
    claimed = set()  # indices already assigned to a group
    groups = []

    for anchor in range(total):
        if anchor in claimed:
            continue
        claimed.add(anchor)
        group = [sentences[anchor]]

        # Only later sentences are candidates, compared against the anchor alone.
        for later in range(anchor + 1, total):
            if later not in claimed and cosine_sim[anchor][later] >= threshold:
                group.append(sentences[later])
                claimed.add(later)

        groups.append(group)

    # Merge smaller groups until each segment reaches the minimum size.
    merged = []
    pending = []
    for group in groups:
        pending.extend(group)
        if len(pending) >= min_sentences:
            merged.append(pending)
            pending = []

    # Leftover sentences join the last segment, or form the only one.
    if pending and merged:
        merged[-1].extend(pending)
    elif pending:
        merged.append(pending)

    return merged

def print_segments(segments):
    """
    Print each segment under a numbered heading, one sentence per line.

    Args:
        segments: Iterable of segments, each an iterable of sentences.
    """
    number = 0
    for group in segments:
        number += 1
        bullet_lines = [f" - {sentence}" for sentence in group]
        # Heading (preceded by a blank line) followed by the bullets.
        print(f"\nSegment {number}:", *bullet_lines, sep="\n")

def save_segments_to_file(segments, output_file):
    """
    Write segments to a UTF-8 text file, overwriting any existing file.

    Each segment gets a "Segment N:" heading, one " - sentence" line per
    sentence, and a trailing blank line. Failures are printed, not raised.

    Args:
        segments: Iterable of segments, each an iterable of sentences.
        output_file (str): Path of the file to write.
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as out:
            number = 0
            for group in segments:
                number += 1
                out.write(f"Segment {number}:\n")
                out.writelines(f" - {sentence}\n" for sentence in group)
                out.write("\n")
        print(f"Segmented output saved to {output_file}")
    except Exception as err:
        # Best-effort: report the failure instead of propagating it.
        print(f"Error saving file: {err}")

def process_transcript(file_path, threshold=0.5, min_sentences=5, output_file='segmented_output.txt'):
    """
    Read a transcript file, split it into sentences, group them by similarity,
    then print the segments and save them to a file.

    Args:
        file_path (str): Path of the transcript text file to read.
        threshold (float): Minimum cosine similarity for a sentence to join a group.
        min_sentences (int): Groups are merged until each holds at least this
            many sentences.
        output_file (str): Path of the file the segments are written to.

    Returns:
        None. Returns early when the file cannot be read or contains no
        sentences.
    """
    transcript = read_transcript(file_path)
    if transcript is None:
        # read_transcript has already printed the error.
        return

    sentences = split_into_sentences(transcript)
    if not sentences:
        # An empty transcript yields no sentences, and the TF-IDF vectorizer
        # cannot be fitted on an empty corpus, so stop here instead of raising.
        print("No sentences found in transcript; nothing to segment.")
        return

    print("Sentences extracted:")
    for number, sentence in enumerate(sentences, start=1):
        print(f"S{number}: {sentence}")

    cosine_sim = calculate_cosine_similarity(sentences)

    segments = segment_sentences(sentences, cosine_sim, threshold, min_sentences)

    print("\nSegmented Sentences:")
    print_segments(segments)

    save_segments_to_file(segments, output_file)

# Input transcript and output destination.
# NOTE(review): the /content prefix looks like a Colab working directory — confirm.
file_path = '/content/transcript.txt'  # Replace with your file path
output_file = '/content/segmented_output.txt'

# Segment using a 0.15 similarity threshold and at least 5 sentences per segment.
process_transcript(
    file_path,
    threshold=0.15,
    min_sentences=5,
    output_file=output_file,
)
