In [None]:
# ============== #
#    READ ME     #
# ============== #

# --- Objective --- #
# Our goal was to create a project that enhances the media consumption experience for everyone, including individuals with impairments. 
# By providing automated solutions like image descriptions and video transcripts, we aimed to make media more accessible and easier to engage with. 
# Whether for general convenience or accessibility needs, this project seeks to bridge gaps and improve how people interact with digital content.

# --- Keep in mind --- #
# A dataset called "DATASETS", created in order to test the AI, should be attached to the notebook.
# That dataset is referenced by every code cell except the last one.


##### ---------------------------------- ##### 
# ++  In each of the cells we have ++ #
##### ---------------------------------- ##### 
#    Text summarizer, image caption, audio transcript  # 
#           Video transcript and summarizer        #
#             A media analyzer Web cell        #
## >> Which does all of the above with files that can be uploaded ##

# ----  Key features:  ----  #
# >Document Summarization: Automatically extracts text from PDFs and generates summaries using transformers, improving efficiency for large text processing.
# >Image Captioning: Uses BLIP for generating accurate descriptions of images, beneficial for content tagging and accessibility.
# >Video Transcription: Extracts audio from videos and uses Whisper for transcription, enabling quick access to video content.
# >Video Summarization: Breaks down videos into key frames, generates captions, and creates an overall summary, efficiently capturing video content.

# The system adapts to various content types, making it useful in fields like research, media, and education. 
# It integrates generative AI to streamline tasks and improve productivity.




In [None]:

# ==================== #
#   Text Summarizer    #
# ===================  #



# Install necessary libraries
!pip install PyPDF2 transformers > /dev/null 2>&1

# Import libraries
import os
import PyPDF2
from transformers import pipeline
import torch

# Runtime flags for this cell, applied in a single call:
#   TOKENIZERS_PARALLELISM = "false"  (parallelism setting, per the original note)
#   CUDA_LAUNCH_BLOCKING   = "1"      (CUDA debugging, per the original note)
os.environ.update(
    TOKENIZERS_PARALLELISM="false",
    CUDA_LAUNCH_BLOCKING="1",
)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PyPDF2.PdfReader``.

    Returns:
        str: The page texts joined with no separator. A page whose
        ``extract_text()`` result is ``None`` or empty contributes nothing.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_path)
    # `or ""` guards against a page with no extractable text: concatenating a
    # None result would raise TypeError and abort the whole extraction.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Function to summarize text using Hugging Face's transformers
def summarize_text(text, max_length=130, min_length=30, chunk_size=1000):
    """Summarize ``text`` piece by piece and join the partial summaries.

    The text is cut into consecutive slices of ``chunk_size`` characters; each
    non-blank slice is summarized on its own with the
    ``sshleifer/distilbart-cnn-12-6`` summarization pipeline.

    Args:
        text: The text to summarize.
        max_length: Upper bound for the per-chunk ``max_length`` passed to the
            pipeline. Short chunks get a smaller limit derived from their
            whitespace-separated word count.
        min_length: Upper bound for the per-chunk ``min_length``; it is always
            kept below that chunk's ``max_length``.
        chunk_size: Number of characters per slice (must be positive).

    Returns:
        str: The chunk summaries joined with single spaces, or ``""`` when
        ``text`` is empty or contains only whitespace.

    Raises:
        RuntimeError: If loading the pipeline fails for a reason other than a
            message containing "CUDA error".
    """
    # Nothing to summarize: return before paying for a model load.
    if not text or not text.strip():
        return ""

    model_name = "sshleifer/distilbart-cnn-12-6"
    try:
        # Try to load the model on GPU if available
        device = 0 if torch.cuda.is_available() else -1
        summarizer = pipeline("summarization", model=model_name, revision="a4f8f3e", device=device)
    except RuntimeError as e:
        if "CUDA error" not in str(e):
            raise  # bare raise keeps the original traceback intact
        print("CUDA error encountered. Switching to CPU...")
        summarizer = pipeline("summarization", model=model_name, revision="a4f8f3e", device=-1)

    summaries = []
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        input_length = len(chunk.split())  # word count as a rough length estimate
        if input_length == 0:
            # A whitespace-only slice has nothing to summarize; with a forced
            # minimum length the model would only produce made-up text.
            continue
        dynamic_max_length = min(max_length, max(11, input_length - 1))
        dynamic_min_length = min(min_length, dynamic_max_length - 1)  # keep min below max

        summary = summarizer(
            chunk,
            max_length=dynamic_max_length,
            min_length=dynamic_min_length,
            do_sample=False,
        )[0]['summary_text']
        summaries.append(summary)

    return " ".join(summaries)

# Main function
def main(pdf_path="/kaggle/input/datasettest/about-love.pdf", output_path="summary.txt"):
    """Summarize one PDF, print the summary and save it to a text file.

    Args:
        pdf_path: PDF to summarize. Defaults to the sample file used by this
            notebook; pass another path to summarize a different document.
        output_path: File the summary is written to (UTF-8).

    Returns:
        None. Problems (missing file, no extractable text, summarization
        failure) are reported with a printed message and end the run early.
    """
    # Check if the file exists
    if not os.path.exists(pdf_path):
        print(f"Error: The file at '{pdf_path}' does not exist. Please check the path and try again.")
        return

    # Extract text from PDF
    print("Extracting text from PDF...")
    text = extract_text_from_pdf(pdf_path)
    if not text or not text.strip():
        # Without this check an empty summary file would be written silently.
        print("No extractable text was found in the PDF; nothing to summarize.")
        return

    # Summarize the extracted text
    print("Summarizing the text...")
    try:
        summary = summarize_text(text)
    except Exception as e:
        print(f"An error occurred during summarization: {e}")
        return

    # Output the summary
    print("\nSummary:")
    print(summary)

    # Save the summary to a file. The encoding is explicit so non-ASCII
    # characters do not depend on the platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(summary)
    print(f"\nSummary saved to '{output_path}'.")

# Run the main function
if __name__ == "__main__":
    main()

In [None]:


# ================= #
#   Image Caption   #
# ================= #




import warnings
warnings.filterwarnings("ignore")

# Import required libraries
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import requests
import torch

# Function to describe an image
def describe_image(image_path_or_url):
    """Generate a caption for an image with the BLIP base captioning model.

    Args:
        image_path_or_url: Either a local file path or an ``http://`` /
            ``https://`` URL of the image.

    Returns:
        str: The caption decoded from the model output, special tokens removed.

    Raises:
        requests.HTTPError: If the URL download answers with an error status.
    """
    from io import BytesIO  # local import; only the URL branch needs it

    # Load the image first so a bad path/URL fails before the model is loaded.
    source = str(image_path_or_url)
    # Match real URL schemes only: a bare "http" prefix would also catch local
    # files such as "http_cat.png".
    if source.startswith(("http://", "https://")):
        # A timeout keeps the cell from hanging on an unresponsive server, and
        # the status check stops an error page being parsed as an image.
        response = requests.get(source, timeout=30)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
    else:
        image = Image.open(source)
    # Normalise grayscale / palette / RGBA inputs to three-channel RGB before
    # they reach the processor.
    image = image.convert("RGB")

    # Load BLIP model and processor
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)

    # Preprocess the image and generate caption
    inputs = processor(images=image, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    caption = processor.decode(outputs[0], skip_special_tokens=True)

    return caption

# Example usage
if __name__ == "__main__":
    # Demo run: caption one sample image and print the result.
    # Despite its name, `image_url` may hold either a URL or, as here, a
    # local file path; describe_image() chooses how to load it.
    image_url = "/kaggle/input/datasettest/image0-3.jpg"  # Replace with your image URL or file path
    print("Describing the image...")
    caption = describe_image(image_url)
    print("Caption:", caption)

In [None]:


# ================================= #
#   Video Transcript & Summarizer  #
# ================================ #




# Install necessary libraries
!pip install moviepy openai-whisper transformers > /dev/null 2>&1


# Import libraries
import os
import whisper
import torch
from transformers import pipeline
from moviepy.editor import VideoFileClip

# Point XDG_RUNTIME_DIR at a scratch directory and make sure it exists
# (per the original note, this is meant to suppress ALSA warnings).
os.environ.update(XDG_RUNTIME_DIR="/tmp/runtime-dir")
os.makedirs(os.environ["XDG_RUNTIME_DIR"], exist_ok=True)

# Runtime environment flag for CUDA.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

# Function to extract audio from video
def extract_audio_from_video(video_path, audio_path="temp_audio.wav"):
    """Write the audio track of a video to ``audio_path``.

    Args:
        video_path: Path of the video file to read.
        audio_path: Destination of the extracted audio file.

    Returns:
        ``audio_path`` on success, or ``None`` when extraction fails or the
        video has no audio track (a message is printed in both cases).
    """
    video_clip = None
    try:
        print("Extracting audio from video...")
        video_clip = VideoFileClip(video_path)
        if video_clip.audio is None:
            # Report a readable message instead of the AttributeError that
            # calling write_audiofile on None would produce.
            print("An error occurred during audio extraction: the video has no audio track.")
            return None
        video_clip.audio.write_audiofile(audio_path, verbose=False, logger=None)
        return audio_path
    except Exception as e:
        print(f"An error occurred during audio extraction: {e}")
        return None
    finally:
        # Always release the clip's resources, on success and on failure.
        if video_clip is not None:
            video_clip.close()

# Function to transcribe audio using Whisper
def transcribe_audio(audio_path, model_name="base"):
    """Transcribe an audio file with a Whisper model.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        The ``"text"`` entry of the transcription result, or ``None`` if
        anything raised along the way (the error is printed).
    """
    try:
        print("Transcribing audio...")
        # Pick the GPU when torch reports one, otherwise stay on the CPU.
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        asr_model = whisper.load_model(model_name, device=device)
        print(f"Device set to use {device}")
        return asr_model.transcribe(audio_path)["text"]
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
    return None

# Function to summarize text using Hugging Face's transformers
def summarize_text(text, max_length=130, min_length=30, chunk_size=1000):
    """Summarize ``text`` with the ``facebook/bart-large-cnn`` pipeline.

    The text is summarized in consecutive slices of ``chunk_size`` characters
    rather than in one call, so a long transcript no longer has to fit into a
    single model input. This mirrors the chunking used by the other
    summarizer cells of this notebook.

    Args:
        text: The text to summarize.
        max_length: Upper bound for the per-chunk ``max_length``; short chunks
            get a smaller limit derived from their word count.
        min_length: Upper bound for the per-chunk ``min_length``; always kept
            below that chunk's ``max_length``.
        chunk_size: Number of characters per slice (must be positive).

    Returns:
        The chunk summaries joined with single spaces (``""`` if ``text`` has
        no words), or ``None`` if anything raised (the error is printed).
    """
    try:
        print("Summarizing text...")
        device = 0 if torch.cuda.is_available() else -1
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)

        summaries = []
        for start in range(0, len(text), chunk_size):
            chunk = text[start:start + chunk_size]
            word_count = len(chunk.split())
            if word_count == 0:
                continue  # whitespace-only slice: nothing to summarize
            chunk_max = min(max_length, max(11, word_count - 1))
            chunk_min = min(min_length, chunk_max - 1)  # keep min below max
            result = summarizer(chunk, max_length=chunk_max, min_length=chunk_min, do_sample=False)
            summaries.append(result[0]['summary_text'])
        return " ".join(summaries)
    except Exception as e:
        print(f"An error occurred during summarization: {e}")
        return None

# Main function to summarize a video
def summarize_video(video_path):
    """Transcribe a video's audio, summarize the transcript and save the summary.

    Steps: extract the audio track, transcribe it, print the transcript,
    summarize it, print the summary and write it to ``video_summary.txt``.
    If any step yields nothing the function stops there; the helpers print
    their own error messages.

    Args:
        video_path: Path of the video file to process.

    Returns:
        None.
    """
    # Extract audio from video
    audio_path = extract_audio_from_video(video_path)
    if not audio_path:
        return

    # Transcribe the extracted audio
    try:
        transcript = transcribe_audio(audio_path)
    finally:
        # The extracted audio is only an intermediate artifact; remove it so
        # repeated runs do not leave stale audio files behind. Cleanup is
        # best-effort and must not hide a transcription result or error.
        try:
            if os.path.exists(audio_path):
                os.remove(audio_path)
        except OSError:
            pass
    if not transcript:
        return

    print("\nTranscript:")
    print(transcript)

    # Summarize the transcript
    summary = summarize_text(transcript)
    if not summary:
        return

    print("\nSummary:")
    print(summary)

    # Save the summary to a file. The encoding is explicit so non-ASCII
    # characters do not depend on the platform's default encoding.
    with open("video_summary.txt", "w", encoding="utf-8") as file:
        file.write(summary)
    print("\nSummary saved to 'video_summary.txt'.")

# Run the summarizer
if __name__ == "__main__":
    # The video path is hard-coded here (the original note suggests this
    # replaced an interactive `input()` prompt); edit it to use another file.
    video_path = "/kaggle/input/datasettest/Life of a Doduo _ Pokearth.mp4"  # Replace with the path to your video file
    # Check the path up front so a missing file produces a clear message
    # before any processing starts.
    if not os.path.exists(video_path):
        print(f"Error: The file at '{video_path}' does not exist. Please check the path and try again.")
    else:
        summarize_video(video_path)

In [None]:


# ===================== #
#    Audio Transcript   #
# ===================== #


# Install necessary libraries
!pip install openai-whisper ffmpeg > /dev/null 2>&1

# Import libraries
import os
import whisper
import torch  # Ensure torch is imported
from moviepy.editor import AudioFileClip

# Runtime settings: give XDG_RUNTIME_DIR a fallback directory (created on the
# spot) only when the variable is not already defined, then set the CUDA flag.
if os.environ.get("XDG_RUNTIME_DIR") is None:
    os.environ.update(XDG_RUNTIME_DIR="/tmp/runtime-dir")
    os.makedirs(os.environ["XDG_RUNTIME_DIR"], exist_ok=True)
os.environ.update(CUDA_LAUNCH_BLOCKING="1")  # Debugging for CUDA

# Function to transcribe audio using Whisper
def transcribe_audio(audio_path, model_name="base"):
    """Transcribe an audio file with a Whisper model.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        The ``"text"`` entry of the transcription result, or ``None`` if
        loading or transcribing raised (the error is printed).
    """
    try:
        # Load the Whisper model on the GPU when torch reports one.
        if torch.cuda.is_available():
            device = "cuda"
        else:
            device = "cpu"
        asr_model = whisper.load_model(model_name, device=device)
        print(f"Device set to use {device}")

        # Run the transcription and hand back only its text.
        print("Transcribing audio...")
        return asr_model.transcribe(audio_path)["text"]
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
    return None

# Main function
def main(
    audio_path="/kaggle/input/datasettest/Monster (April Fools)  EPIC_ The Musical ANIMATIC.mp3",
    output_path="transcript.txt",
):
    """Transcribe one audio file, print the transcript and save it to a file.

    Args:
        audio_path: Audio file to transcribe. Defaults to the sample file used
            by this notebook; pass another path to transcribe a different one.
        output_path: File the transcript is written to (UTF-8).

    Returns:
        None. A missing input file is reported with a printed message; when
        transcription yields nothing, no file is written.
    """
    # Check if the file exists
    if not os.path.exists(audio_path):
        print(f"Error: The file at '{audio_path}' does not exist. Please check the path and try again.")
        return

    # Transcribe the audio
    transcript = transcribe_audio(audio_path)

    # Output the transcript
    if transcript:
        print("\nTranscript:")
        print(transcript)

        # Save the transcript to a file. The encoding is explicit so non-ASCII
        # characters do not depend on the platform's default encoding.
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(transcript)
        print(f"\nTranscript saved to '{output_path}'.")

# Run the main function
if __name__ == "__main__":
    main()

In [None]:


# ================================== #
#   AI Media Analyzer Web App Cell   #
# ================================== #




# Install dependencies
!pip install -q gradio PyPDF2 transformers pillow torch openai-whisper moviepy

import gradio as gr
import os
import tempfile

import PyPDF2
from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import whisper
from moviepy.editor import VideoFileClip

# Runtime flags for the web-app cell, applied in a single call.
os.environ.update(
    TOKENIZERS_PARALLELISM="false",
    CUDA_LAUNCH_BLOCKING="1",
)


# Text summarization helper
def summarize_text(text, max_length=130, min_length=30, chunk_size=1000):
    """Summarize ``text`` piece by piece and join the partial summaries.

    The text is cut into consecutive slices of ``chunk_size`` characters; each
    non-blank slice is summarized on its own with the
    ``sshleifer/distilbart-cnn-12-6`` summarization pipeline.

    Args:
        text: The text to summarize.
        max_length: Upper bound for the per-chunk ``max_length``; short chunks
            get a smaller limit derived from their word count.
        min_length: Upper bound for the per-chunk ``min_length``; always kept
            below that chunk's ``max_length``.
        chunk_size: Number of characters per slice (must be positive).

    Returns:
        str: The chunk summaries joined with single spaces, or ``""`` when
        ``text`` is empty or contains only whitespace.
    """
    # Nothing to summarize (e.g. a PDF without extractable text or a silent
    # recording): return before paying for a model load.
    if not text or not text.strip():
        return ""

    device = 0 if torch.cuda.is_available() else -1
    summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", device=device)

    summaries = []
    for start in range(0, len(text), chunk_size):
        chunk = text[start:start + chunk_size]
        input_length = len(chunk.split())
        if input_length == 0:
            # A whitespace-only slice has nothing to summarize; with a forced
            # minimum length the model would only produce made-up text.
            continue
        dynamic_max_length = min(max_length, max(11, input_length - 1))
        dynamic_min_length = min(min_length, dynamic_max_length - 1)  # keep min below max
        summary = summarizer(
            chunk,
            max_length=dynamic_max_length,
            min_length=dynamic_min_length,
            do_sample=False,
        )[0]['summary_text']
        summaries.append(summary)
    return " ".join(summaries)

# Helper functions, one per supported file type #
def extract_text_from_pdf(pdf_path):
    """Collect the extractable text of a PDF into one string.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PyPDF2.PdfReader``.

    Returns:
        str: The text of all pages in page order, joined with no separator.
        Pages whose extraction result is falsy (``None`` or ``""``) are skipped.
    """
    page_texts = (page.extract_text() for page in PyPDF2.PdfReader(pdf_path).pages)
    return "".join(fragment for fragment in page_texts if fragment)

def describe_image(image_path):
    """Generate a caption for a local image with the BLIP base captioning model.

    Args:
        image_path: Path of the image file to open.

    Returns:
        str: The caption decoded from the model output, special tokens removed.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
    # Uploaded PNGs can be grayscale, palette-based or carry an alpha channel;
    # normalise everything to three-channel RGB before it reaches the processor.
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption

def transcribe_audio(audio_path, model_name="base"):
    """Transcribe an audio file with a Whisper model and return its text.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        model_name: Name passed to ``whisper.load_model``.

    Returns:
        The ``"text"`` entry of the transcription result. Errors are not
        caught here; they propagate to the caller.
    """
    use_gpu = torch.cuda.is_available()
    asr_model = whisper.load_model(model_name, device="cuda" if use_gpu else "cpu")
    return asr_model.transcribe(audio_path)["text"]

def extract_audio_from_video(video_path):
    """Write a video's audio track to a new temporary ``.wav`` file.

    Args:
        video_path: Path of the video file to read.

    Returns:
        str: Path of the temporary audio file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        ValueError: If the video has no audio track.
        Exception: Any error raised while opening the video or writing the
            audio is re-raised after the temporary file has been removed.
    """
    # Reserve a unique .wav path, then close our own handle right away: only
    # the name is needed, and an open handle would otherwise stay open.
    audio_temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    audio_temp.close()

    video_clip = None
    try:
        video_clip = VideoFileClip(video_path)
        if video_clip.audio is None:
            raise ValueError("The video has no audio track to extract.")
        video_clip.audio.write_audiofile(audio_temp.name, verbose=False, logger=None)
    except Exception:
        # Do not leave an orphaned temporary file behind on failure.
        if os.path.exists(audio_temp.name):
            os.remove(audio_temp.name)
        raise
    finally:
        # Always release the clip's resources.
        if video_clip is not None:
            video_clip.close()
    return audio_temp.name

def ai_media_analyzer(file_path):
    """Analyze an uploaded file according to its extension.

    Dispatch by (case-insensitive) file extension:
      * ``.pdf``                  -> extract the text and summarize it
      * ``.jpg`` ``.jpeg`` ``.png`` -> caption the image
      * ``.mp3`` ``.wav``         -> transcribe the audio
      * ``.mp4`` ``.avi`` ``.mov`` -> extract audio, transcribe, summarize

    Args:
        file_path: The uploaded file: a path (``str`` / ``os.PathLike``) or a
            file-like object exposing its path as ``.name``. ``None`` means
            no file was provided.

    Returns:
        tuple: ``(image_path, audio_path, video_path, label, text)``. At most
        one of the three paths is set (the one matching the file type, for
        preview); ``label`` names the kind of result and ``text`` holds it.
        Missing or unsupported files yield an ``"Error:"`` label.
    """
    if file_path is None:
        return None, None, None, "Error:", "No file provided!"

    # Normalise the upload to a plain path string. Depending on how the upload
    # component is configured, the callback may get a path or a temp-file
    # object; os.path.splitext would raise on the latter.
    if isinstance(file_path, (str, os.PathLike)):
        path = os.fspath(file_path)
    else:
        path = getattr(file_path, "name", file_path)
    file_ext = os.path.splitext(path)[-1].lower()

    # PDF format
    if file_ext == ".pdf":
        text = extract_text_from_pdf(path)
        summary = summarize_text(text)
        return None, None, None, "Summary:", summary

    # Image format
    elif file_ext in [".jpg", ".jpeg", ".png"]:
        caption = describe_image(path)
        return path, None, None, "Caption:", caption

    # Audio format
    elif file_ext in [".mp3", ".wav"]:
        transcript = transcribe_audio(path)
        return None, path, None, "Transcript:", transcript

    # Video format
    elif file_ext in [".mp4", ".avi", ".mov"]:
        audio_path = extract_audio_from_video(path)
        try:
            transcript = transcribe_audio(audio_path)
        finally:
            # The extracted audio is a throw-away temp file; without this,
            # every analyzed video would leave one behind. Cleanup is
            # best-effort and must not hide the transcription outcome.
            try:
                os.remove(audio_path)
            except OSError:
                pass
        summary = summarize_text(transcript)
        return None, None, path, "Transcript & Summary:", f"{transcript}\n\nSummary:\n{summary}"

    else:
        return None, None, None, "Error:", "Unsupported file type."

with gr.Blocks() as demo:
    gr.Markdown("# AI Media Analyzer\nUpload a PDF, image, audio, or video file. The AI will generate a transcript, caption, or summary as appropriate.")

    inp = gr.File(label="Upload a file")
    # The three previews start hidden; analyze_and_preview reveals only the
    # one that matches the uploaded file's type.
    out_image = gr.Image(label="Image Preview", visible=False)
    out_audio = gr.Audio(label="Audio Preview", visible=False)
    out_video = gr.Video(label="Video Preview", visible=False)
    out_type = gr.Textbox(label="Analysis Type")
    out_result = gr.Textbox(label="AI Output")

    def analyze_and_preview(file):
        """Run the analyzer on the upload and update the five output widgets.

        Args:
            file: Value of the upload component, or ``None`` when it is empty.

        Returns:
            tuple: Updates for the image, audio and video previews (each shown
            only when it has a value, hidden otherwise), followed by the
            analysis-type label and the result text.
        """
        if file is None:
            img, aud, vid, type_, result = None, None, None, "Error:", "No file provided!"
        else:
            img, aud, vid, type_, result = ai_media_analyzer(file)

        def preview(value):
            # Returning a bare value would leave the component hidden, since
            # it was created with visible=False; toggle visibility explicitly.
            return gr.update(value=value if value else None, visible=bool(value))

        return preview(img), preview(aud), preview(vid), type_, result

    inp.change(
        analyze_and_preview,
        inputs=inp,
        outputs=[out_image, out_audio, out_video, out_type, out_result]
    )

# NOTE(review): share=True requests a publicly reachable link for the app;
# confirm that exposing the analyzer outside the notebook is intended.
demo.launch(share=True)