<a href="https://colab.research.google.com/github/Kavinass004/Heart-disease-/blob/main/final_year_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
# (-q keeps pip's output short; the `!` prefix runs each line as a shell command)
# Speaker diarization pipeline used by transcribe_with_speaker_diarization
!pip install -q pyannote.audio
# PyTorch and its audio companion package
!pip install -q torch torchaudio
# Hugging Face model loading; accelerate/bitsandbytes are presumably needed for
# the device_map="auto" / 4-bit LLM load in Step 2 - TODO confirm
!pip install -q transformers accelerate bitsandbytes
# Audio loading/resampling (librosa) and audio file I/O (soundfile)
!pip install -q librosa soundfile
# NOTE(review): pydub is installed here but not imported in the visible code
!pip install -q pydub

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/117.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m898.7/898.7 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m818.9/818.9 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.5/58.5 kB[0m [31m4.8 MB/s[0m eta [36m0:00:0

In [None]:
# AI News Company - Complete Pipeline
# This notebook combines speech-to-text conversion with speaker diarization and content generation
# All files are stored locally in the Colab environment

import os
import json
import torch
import numpy as np
import librosa
import soundfile as sf
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForCausalLM, AutoTokenizer
import warnings
# NOTE(review): with no category or module argument this silences every
# warning raised anywhere in the process, not only noisy library ones.
warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

# Create directories for storing files
# /content/input receives the uploaded media file; /content/output receives
# the transcript and the generated content files.
os.makedirs("/content/input", exist_ok=True)
os.makedirs("/content/output", exist_ok=True)

# Authentication for Hugging Face models
# The diarization pipeline is loaded with use_auth_token=True, so it relies on
# the token stored by this interactive login.
from huggingface_hub import login
print("Please enter your Hugging Face token when prompted...")
login()  # Enter your Hugging Face token when prompted

# Function to upload files to Colab
from google.colab import files

def upload_file_to_colab():
    """
    Ask the user for a file through the Colab upload widget and store it

    Only the first uploaded entry is used; it is written under /content/input.

    Returns:
    - Path to the stored file

    Raises:
    - ValueError when the upload dialog returns nothing
    """
    print("Please upload your audio or video file...")
    received = files.upload()

    if not received:
        raise ValueError("No file was uploaded!")

    # Take the first (name, bytes) pair from the upload result
    filename, payload = next(iter(received.items()))
    filepath = "/content/input/{}".format(filename)

    # Persist the uploaded bytes in the input directory
    with open(filepath, 'wb') as handle:
        handle.write(payload)

    print(f"File saved to {filepath}")
    return filepath

#######################
# STEP 1: Speech-to-Text Conversion with Speaker Diarization
#######################

def transcribe_with_speaker_diarization(audio_path, output_path, use_smaller_model=False):
    """
    Transcribe audio file with speaker diarization for code-mixed Tamil

    The audio is first split into speaker turns with the pyannote diarization
    pipeline; every turn is then transcribed with Whisper (language forced to
    Tamil) and the result is written to output_path grouped by speaker.

    Parameters:
    - audio_path: Path to the audio file
    - output_path: Path to save the transcript
    - use_smaller_model: If True, use a smaller Whisper model to save memory

    Returns:
    - Dictionary mapping each speaker label to a list of
      {"start", "end", "text"} entries. If anything in this function raises,
      the result of basic_transcription() is returned instead.
    """
    try:
        from pyannote.audio import Pipeline

        print("Loading speaker diarization model...")
        # Initialize the speaker diarization pipeline
        diarization_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization@2.1",
            use_auth_token=True  # Uses the token from huggingface_hub login
        )

        print("Running speaker diarization...")
        # Run speaker diarization on the audio file
        diarization_result = diarization_pipeline(audio_path)

        print("Loading Whisper model...")
        # Load Whisper model - using appropriate size based on memory constraints
        model_size = "small" if use_smaller_model else "large-v3"
        processor = WhisperProcessor.from_pretrained(f"openai/whisper-{model_size}")
        model = WhisperForConditionalGeneration.from_pretrained(f"openai/whisper-{model_size}")

        # Set language to Tamil
        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="tamil", task="transcribe")

        # Load audio file (resampled to the 16 kHz rate passed to the processor)
        audio, sr = librosa.load(audio_path, sr=16000)
        total_samples = len(audio)

        # Collect the audio slices belonging to each speaker
        speaker_segments = {}
        for turn, _, speaker in diarization_result.itertracks(yield_label=True):
            # Clamp the turn to the loaded audio so that a final turn ending a
            # few samples past the end is kept instead of being discarded.
            start_sample = max(int(turn.start * sr), 0)
            end_sample = min(int(turn.end * sr), total_samples)

            # Skip empty turns and turns that start beyond the audio
            if start_sample >= end_sample:
                continue

            speaker_segments.setdefault(speaker, []).append({
                "start": turn.start,
                "end": turn.end,
                "audio": audio[start_sample:end_sample]
            })

        # Transcribe for each speaker
        speaker_transcripts = {}
        for speaker, segments in speaker_segments.items():
            speaker_transcripts[speaker] = []

            for segment in segments:
                # Skip segments that are too short
                if len(segment["audio"]) < 1000:  # About 60ms at 16kHz
                    continue

                # NOTE(review): Whisper works on 30-second windows, so a turn
                # longer than that is presumably truncated here - confirm.
                input_features = processor(
                    segment["audio"],
                    sampling_rate=sr,
                    return_tensors="pt"
                ).input_features

                # Generate token ids
                with torch.no_grad():
                    predicted_ids = model.generate(input_features, max_length=256)

                # Decode the token ids to text. The English text normalizer
                # (normalize=True) is deliberately NOT applied: it strips
                # diacritics/combining marks and would corrupt Tamil script.
                transcription = processor.batch_decode(
                    predicted_ids,
                    skip_special_tokens=True
                )[0]

                # Skip empty transcriptions
                if not transcription.strip():
                    continue

                speaker_transcripts[speaker].append({
                    "start": segment["start"],
                    "end": segment["end"],
                    "text": transcription
                })

        # Write transcripts to file, one "Speaker X:" section per speaker
        with open(output_path, 'w', encoding='utf-8') as f:
            for speaker, transcripts in speaker_transcripts.items():
                f.write(f"Speaker {speaker}:\n")
                for transcript in transcripts:
                    f.write(f"[{transcript['start']:.2f} - {transcript['end']:.2f}] {transcript['text']}\n")
                f.write("\n")

        print(f"Transcript saved to {output_path}")
        return speaker_transcripts

    except Exception as e:
        # Deliberate best-effort: any failure (missing token, model download,
        # unreadable audio, ...) degrades to transcription without diarization.
        print(f"Error in transcribe_with_speaker_diarization: {str(e)}")
        # Fall back to basic transcription without diarization
        return basic_transcription(audio_path, output_path, use_smaller_model)

def basic_transcription(audio_path, output_path, use_smaller_model=False):
    """
    Perform basic transcription without speaker diarization as a fallback

    The audio is transcribed with Whisper (language forced to Tamil) in
    consecutive 30-second chunks and everything is attributed to a single
    "UNKNOWN" speaker.

    Parameters:
    - audio_path: Path to the audio file
    - output_path: Path to save the transcript
    - use_smaller_model: If True, use the "small" Whisper model instead of "large-v3"

    Returns:
    - {"UNKNOWN": [{"start", "end", "text"}, ...]}. If transcription raises,
      a single placeholder entry is returned and a placeholder transcript is
      written so the rest of the pipeline can continue.
    """
    print("Falling back to basic transcription without speaker diarization...")

    try:
        # Load Whisper model
        model_size = "small" if use_smaller_model else "large-v3"
        processor = WhisperProcessor.from_pretrained(f"openai/whisper-{model_size}")
        model = WhisperForConditionalGeneration.from_pretrained(f"openai/whisper-{model_size}")

        # Set language to Tamil
        model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="tamil", task="transcribe")

        # Load audio
        audio, sr = librosa.load(audio_path, sr=16000)

        # Process in chunks to avoid memory issues
        chunk_length_s = 30  # Process 30 seconds at a time
        chunk_length = chunk_length_s * sr

        chunks = [audio[i:i+chunk_length] for i in range(0, len(audio), chunk_length)]

        full_transcript = []

        for i, chunk in enumerate(chunks):
            # Convert audio to features
            input_features = processor(
                chunk,
                sampling_rate=sr,
                return_tensors="pt"
            ).input_features

            # Generate token ids
            with torch.no_grad():
                predicted_ids = model.generate(input_features, max_length=256)

            # Decode the token ids to text. The English text normalizer
            # (normalize=True) is deliberately NOT applied: it strips
            # diacritics/combining marks and would corrupt Tamil script.
            transcription = processor.batch_decode(
                predicted_ids,
                skip_special_tokens=True
            )[0]

            # Chunk boundaries in seconds; the last chunk ends at the audio length
            start_time = i * chunk_length_s
            end_time = min((i + 1) * chunk_length_s, len(audio) / sr)

            full_transcript.append({
                "start": start_time,
                "end": end_time,
                "text": transcription
            })

        # Create a simple speaker transcript with all text assigned to one speaker
        speaker_transcripts = {"UNKNOWN": full_transcript}

        # Write transcript to file
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("Speaker UNKNOWN:\n")
            for transcript in full_transcript:
                f.write(f"[{transcript['start']:.2f} - {transcript['end']:.2f}] {transcript['text']}\n")

        print(f"Basic transcript saved to {output_path}")
        return speaker_transcripts

    except Exception as e:
        # Deliberate best-effort: report the problem and hand back a
        # placeholder so downstream steps still receive a valid structure.
        print(f"Error in basic_transcription: {str(e)}")
        # Create a minimal transcript to allow the pipeline to continue
        speaker_transcripts = {"UNKNOWN": [{"start": 0, "end": 1, "text": "Transcription failed."}]}

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("Speaker UNKNOWN:\n")
            f.write("[0.00 - 1.00] Transcription failed. Please check the audio file.")

        return speaker_transcripts

def process_video_input(video_path, output_path, use_smaller_model=False):
    """
    Extract audio from video and transcribe with speaker diarization

    ffmpeg converts the video's audio track to a 16 kHz mono 16-bit PCM WAV
    file next to the video; that file is then passed to
    transcribe_with_speaker_diarization().

    Parameters:
    - video_path: Path to the video file
    - output_path: Path to save the transcript
    - use_smaller_model: Forwarded to the transcription step

    Returns:
    - Dictionary of speaker transcripts. If extraction or transcription
      raises, a single placeholder entry is returned and a placeholder
      transcript is written so the rest of the pipeline can continue.
    """
    # Local import keeps this block self-contained
    import subprocess

    try:
        # Derive the WAV path from the video path; splitext only looks at the
        # file name, so a dot inside a directory name is left untouched.
        audio_path = os.path.splitext(video_path)[0] + '.wav'

        print("Extracting audio from video...")
        # Run ffmpeg with an argument list (no shell), so quotes, spaces or
        # shell metacharacters in the uploaded file name cannot break or
        # inject into the command.
        command = [
            "ffmpeg",
            "-i", video_path,
            "-vn",
            "-acodec", "pcm_s16le",
            "-ar", "16000",
            "-ac", "1",
            audio_path,
            "-y",
            "-loglevel", "error",
        ]
        completed = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,  # ffmpeg must never wait for keyboard input
            capture_output=True,
            text=True,
            errors="replace",
        )
        # Surface ffmpeg's own error output, which the shell escape used to show
        if completed.stderr.strip():
            print(completed.stderr.strip())

        if not os.path.exists(audio_path) or os.path.getsize(audio_path) == 0:
            raise Exception("Failed to extract audio from video")

        # Transcribe the extracted audio
        speaker_transcripts = transcribe_with_speaker_diarization(audio_path, output_path, use_smaller_model)

        return speaker_transcripts

    except Exception as e:
        # Deliberate best-effort (also covers a missing ffmpeg binary)
        print(f"Error processing video: {str(e)}")
        # Create a minimal transcript to allow the pipeline to continue
        speaker_transcripts = {"UNKNOWN": [{"start": 0, "end": 1, "text": "Video processing failed."}]}

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("Speaker UNKNOWN:\n")
            f.write("[0.00 - 1.00] Video processing failed. Please check the video file.")

        return speaker_transcripts

#######################
# STEP 2: Content Generation using Open Source LLM
#######################

def _generate_from_prompt(model, tokenizer, prompt, max_new_tokens, temperature):
    """Run one sampled generation and return only the newly generated text."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=0.9,
        do_sample=True
    )
    # The returned sequence starts with the prompt tokens; drop them by
    # position instead of searching the decoded text for "[/INST]", which
    # breaks when the transcript or the generated text contains that marker.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


def generate_news_content(transcript, speaker_names=None, use_smaller_model=False):
    """
    Generate news content based on the transcript

    Three pieces are produced with the same instruction-tuned LLM: a newspaper
    article, a social media post and a two-anchor news reader script.

    Parameters:
    - transcript: Dictionary of speaker transcripts
      (speaker label -> list of {"start", "end", "text"} entries)
    - speaker_names: Dictionary mapping speaker IDs to names; when empty or
      None the user is asked for a name per speaker via input()
    - use_smaller_model: Accepted for interface compatibility. The same
      4-bit quantized model is loaded either way, so it has no effect.

    Returns:
    - (generated_content, speaker_names) where generated_content has the keys
      "newspaper_article", "social_media_bite" and "news_reader_script".
      A piece that fails to generate is replaced by a "Failed to ..." text;
      if setup itself fails, all three hold a "Content generation failed"
      text and speaker_names falls back to {"UNKNOWN": "Unknown Speaker"}
      when none were supplied.
    """
    try:
        print("Loading LLM model...")
        # The original code selected the same model and quantization in both
        # branches of use_smaller_model; this states that directly.
        model_name = "mistralai/Mistral-7B-Instruct-v0.2"

        # Load tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            load_in_4bit=True  # Always use 4-bit quantization for efficiency
        )

        # If speaker names are not provided, ask for them
        if not speaker_names:
            speaker_names = {}
            print("\nEnter names for each speaker (or press Enter to use default):")
            for speaker in transcript:
                name = input(f"Enter the name for Speaker {speaker}: ").strip()
                speaker_names[speaker] = name if name else f"Speaker {speaker}"

        # Format the transcript for the model in chronological order, so the
        # prompt reads as a conversation rather than one block per speaker.
        # The sort is stable, so entries without a "start" keep their order.
        turns = [
            (segment.get("start", 0), speaker_names.get(speaker, f"Speaker {speaker}"), segment["text"])
            for speaker, segments in transcript.items()
            for segment in segments
        ]
        turns.sort(key=lambda turn: turn[0])
        formatted_transcript = "".join(f"{name}: {text}\n" for _, name, text in turns)

        # Truncate transcript if it's too long
        if len(formatted_transcript) > 4000:
            print("Transcript is too long. Truncating to 4000 characters...")
            formatted_transcript = formatted_transcript[:4000] + "\n[Transcript truncated due to length]"

        print(f"Transcript length: {len(formatted_transcript)} characters")
        print(f"Sample transcript: {formatted_transcript[:500]}...")

        article_prompt = (
            f"<s>[INST] You are a professional journalist. Write a formal newspaper article based on the following interview transcript. "
            f"The article should be in code-mixed Tamil (mix of Tamil and English), formatted properly with a headline, "
            f"introduction, body, and conclusion. Make it informative and engaging.\n\n"
            f"TRANSCRIPT:\n{formatted_transcript}\n\n"
            f"Write the newspaper article: [/INST]"
        )
        social_prompt = (
            f"<s>[INST] You are a social media content creator for a news channel. Write a short, engaging social media post "
            f"(around 280 characters) based on the following interview transcript. The post should be in code-mixed Tamil "
            f"(mix of Tamil and English) and should capture the essence of the interview.\n\n"
            f"TRANSCRIPT:\n{formatted_transcript}\n\n"
            f"Write the social media post: [/INST]"
        )
        script_prompt = (
            f"<s>[INST] You are a script writer for a news channel. Write a script for news readers based on the following "
            f"interview transcript. The script should be in code-mixed Tamil (mix of Tamil and English) and should include "
            f"prompts for two news readers (Anchor 1 and Anchor 2) to read alternately.\n\n"
            f"TRANSCRIPT:\n{formatted_transcript}\n\n"
            f"Write the news reader script: [/INST]"
        )

        # (result key, human-readable label, prompt, max_new_tokens, temperature)
        jobs = [
            ("newspaper_article", "newspaper article", article_prompt, 1024, 0.7),
            ("social_media_bite", "social media post", social_prompt, 300, 0.8),
            ("news_reader_script", "news reader script", script_prompt, 1024, 0.7),
        ]

        generated_content = {}
        for key, label, prompt, max_new_tokens, temperature in jobs:
            print(f"Generating {label}...")
            try:
                generated_content[key] = _generate_from_prompt(
                    model, tokenizer, prompt, max_new_tokens, temperature
                )
            except Exception as e:
                # One failed piece must not prevent the others from being generated
                print(f"Error generating {label}: {str(e)}")
                generated_content[key] = f"Failed to generate {label}."

        print("Content generation completed successfully.")
        return generated_content, speaker_names

    except Exception as e:
        print(f"Error in generate_news_content: {str(e)}")
        # Return minimal content to allow the pipeline to continue
        return {
            "newspaper_article": "Content generation failed. Please check the model and transcript.",
            "social_media_bite": "Content generation failed.",
            "news_reader_script": "Content generation failed."
        }, speaker_names or {"UNKNOWN": "Unknown Speaker"}

def save_generated_content(generated_content, output_path):
    """
    Save generated content to files

    Writes <output_path>_article.txt, <output_path>_social.txt and
    <output_path>_script.txt as UTF-8 text.

    Parameters:
    - generated_content: Dictionary with the keys "newspaper_article",
      "social_media_bite" and "news_reader_script"
    - output_path: Base path (without suffix) for the generated files

    Returns:
    - True when all three files were written, False on any error
      (the error is printed, not raised)
    """
    try:
        # Resolve every (file, text) pair first, so a missing key is reported
        # before any file has been created or overwritten.
        targets = [
            (f"{output_path}_article.txt", generated_content["newspaper_article"]),
            (f"{output_path}_social.txt", generated_content["social_media_bite"]),
            (f"{output_path}_script.txt", generated_content["news_reader_script"]),
        ]

        # Create output directory if it doesn't exist. A bare base name has an
        # empty dirname, and os.makedirs("") would raise, so skip it then.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        for file_path, text in targets:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(text)

        print(f"Generated content saved to {output_path}_*.txt")
        return True

    except Exception as e:
        print(f"Error saving generated content: {str(e)}")
        return False

# Function to download files from Colab
def download_generated_files(base_path):
    """
    Send the generated files to the browser through Colab's download helper

    For each expected file (<base_path>_article.txt, _social.txt, _script.txt
    and _speakers.json) a download is triggered when the file exists;
    otherwise a "not found" line is printed. Errors are reported, not raised.

    Parameters:
    - base_path: Base path of the generated files
    """
    expected_suffixes = ("_article.txt", "_social.txt", "_script.txt", "_speakers.json")
    try:
        print("Downloading generated files...")

        for suffix in expected_suffixes:
            candidate = f"{base_path}{suffix}"
            if not os.path.exists(candidate):
                print(f"File {candidate} not found")
                continue
            files.download(candidate)
            print(f"Downloaded {candidate}")

    except Exception as err:
        print(f"Error downloading files: {str(err)}")
        print("You can manually download the files from the Colab file browser.")

#######################
# Main Function to Run the Complete Pipeline
#######################

def run_ai_news_pipeline(use_smaller_models=False):
    """
    Run the whole AI News Company workflow end to end

    Steps: upload a media file, transcribe it (video files go through audio
    extraction first), generate the news content, save it together with the
    speaker names and trigger the downloads. Any exception is printed
    instead of being raised.

    Parameters:
    - use_smaller_models: If True, use smaller models to save memory
    """
    video_extensions = ('.mp4', '.avi', '.mov', '.mkv')
    transcript_path = "/content/output/transcript.txt"
    output_path = "/content/output/news_content"

    try:
        print("\n======= AI News Company Pipeline =======")
        print("Step 1: Speech-to-Text Conversion with Speaker Diarization")

        # Upload input file
        input_path = upload_file_to_colab()

        # Videos need their audio track extracted before transcription
        is_video = input_path.lower().endswith(video_extensions)
        if is_video:
            print("Processing video input...")
            speaker_transcripts = process_video_input(input_path, transcript_path, use_smaller_models)
        else:
            print("Processing audio input...")
            speaker_transcripts = transcribe_with_speaker_diarization(input_path, transcript_path, use_smaller_models)

        # Stop when no speaker has any transcribed segment
        has_segments = bool(speaker_transcripts) and any(
            len(entries) != 0 for entries in speaker_transcripts.values()
        )
        if not has_segments:
            print("Transcription failed or produced empty results. Please check your audio/video file.")
            return

        print("\nStep 2: Content Generation")
        generated_content, speaker_names = generate_news_content(speaker_transcripts, use_smaller_model=use_smaller_models)

        # Save generated content
        saved = save_generated_content(generated_content, output_path)
        if not saved:
            print("\n======= Pipeline Completed with Errors =======")
            print("Some files may have been generated. Check the Colab file browser.")
            return

        # Save speaker names for future reference
        speaker_names_path = f"{output_path}_speakers.json"
        with open(speaker_names_path, 'w', encoding='utf-8') as names_file:
            json.dump(speaker_names, names_file, ensure_ascii=False, indent=2)

        print("\n======= Pipeline Completed Successfully! =======")
        print(f"Transcript saved to: {transcript_path}")
        print(f"Newspaper article saved to: {output_path}_article.txt")
        print(f"Social media content saved to: {output_path}_social.txt")
        print(f"News reader script saved to: {output_path}_script.txt")
        print(f"Speaker names saved to: {output_path}_speakers.json")

        # Download all generated files, then the transcript itself
        download_generated_files(output_path)
        files.download(transcript_path)

    except Exception as err:
        print(f"\nError running the pipeline: {str(err)}")
        print("Pipeline execution failed. Please check the error messages above.")

# Function to start the pipeline with memory management options
def start_pipeline():
    """
    Show the model-size menu, read the user's choice and launch the pipeline

    Entering "2" selects the smaller models; any other input selects the
    full models.
    """
    menu_lines = (
        "AI News Company Pipeline Startup Options:",
        "1. Run with full models (best quality, requires more memory)",
        "2. Run with smaller models (better performance, less memory usage)",
    )
    for line in menu_lines:
        print(line)

    use_smaller_models = input("Enter your choice (1 or 2): ").strip() == "2"

    print(
        "\nRunning with smaller models for better performance..."
        if use_smaller_models
        else "\nRunning with full models for best quality..."
    )

    run_ai_news_pipeline(use_smaller_models=use_smaller_models)

# Run the pipeline with memory management options
# Executing this cell starts the interactive flow immediately: the model-size
# menu is shown first, then the upload, transcription and generation steps run.
start_pipeline()

Please enter your Hugging Face token when prompted...


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

AI News Company Pipeline Startup Options:
1. Run with full models (best quality, requires more memory)
2. Run with smaller models (better performance, less memory usage)
Enter your choice (1 or 2): 2

Running with smaller models for better performance...

Step 1: Speech-to-Text Conversion with Speaker Diarization
Please upload your audio or video file...


Saving WhatsApp Video 2024-12-22 at 13.10.58_7a7c5279.mp4 to WhatsApp Video 2024-12-22 at 13.10.58_7a7c5279 (1).mp4
File saved to /content/input/WhatsApp Video 2024-12-22 at 13.10.58_7a7c5279 (1).mp4
Processing video input...
Extracting audio from video...
Loading speaker diarization model...
Error in transcribe_with_speaker_diarization: Token is required (`token=True`), but no token found. You need to provide a token or be logged in to Hugging Face with `huggingface-cli login` or `huggingface_hub.login`. See https://huggingface.co/settings/tokens.
Falling back to basic transcription without speaker diarization...


preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

Basic transcript saved to /content/output/transcript.txt

Step 2: Content Generation
Loading LLM model...
Error in generate_news_content: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2.
401 Client Error. (Request ID: Root=1-67ebd856-09888e2213f88eb86bca0068;dcb347d8-2507-4469-9fd1-68c11de261aa)

Cannot access gated repo for url https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/resolve/main/config.json.
Access to model mistralai/Mistral-7B-Instruct-v0.2 is restricted. You must have access to it and be authenticated to access it. Please log in.
Generated content saved to /content/output/news_content_*.txt

Transcript saved to: /content/output/transcript.txt
Newspaper article saved to: /content/output/news_content_article.txt
Social media content saved to: /content/output/news_content_social.txt
News reader script saved to: /content/output/news_content_script.txt
Speaker names saved to: /content/output/news_content_speakers.json

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/output/news_content_article.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/output/news_content_social.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/output/news_content_script.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/output/news_content_speakers.json


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>