# Document Pipeline

## Handles PDF, PPTX, and TXT

In [1]:
import os
import json
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from pptx import Presentation

class DocumentReader:
    """Read a document from disk and extract its text content.

    The extraction strategy is chosen from the file extension
    (case-insensitive): ``.txt``, ``.pdf`` and ``.pptx`` are supported.
    """

    def __init__(self, file_path):
        """
        Initialize the DocumentReader with a file path.

        Args:
            file_path (str): Path to the document file

        Raises:
            FileNotFoundError: If the specified file does not exist
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"The file {file_path} does not exist.")

        self.file_path = file_path
        # Lower-cased so that e.g. "REPORT.PDF" dispatches like "report.pdf".
        self.file_extension = os.path.splitext(file_path)[1].lower()

    def extract_text(self):
        """
        Extract text content from the document based on its file type.

        Returns:
            str: Extracted text content from the document

        Raises:
            ValueError: If the file type is not supported
        """
        handlers = {
            '.txt': self._extract_text_from_txt,
            '.pdf': self._extract_text_from_pdf,
            '.pptx': self._extract_text_from_pptx,
        }

        handler = handlers.get(self.file_extension)
        if handler is None:
            raise ValueError(f"Unsupported file extension: {self.file_extension}")
        return handler()

    def _extract_text_from_txt(self):
        """
        Extract text from a plain text file.

        The file is decoded as UTF-8; a leading byte-order mark (as written by
        some Windows editors) is dropped instead of leaking into the text.

        Returns:
            str: Content of the text file
        """
        # "utf-8-sig" decodes plain UTF-8 identically and additionally skips
        # a BOM when one is present.
        with open(self.file_path, 'r', encoding='utf-8-sig') as file:
            return file.read()

    def _extract_text_from_pdf(self):
        """
        Extract text from a PDF file, including text from images using OCR.

        Pages that yield no embedded text are rendered to an image and passed
        through OCR instead.

        Returns:
            str: Combined text content from all PDF pages, joined by newlines
        """
        text_contents = []

        # The context manager closes the document even if a page fails,
        # so the underlying file handle is not leaked.
        with fitz.open(self.file_path) as doc:
            for page in doc:
                text = page.get_text("text")

                if text.strip():
                    text_contents.append(text)
                else:
                    # If no text is found, try OCR
                    text_contents.append(self._extract_text_from_pdf_image(page))

        return '\n'.join(text_contents)

    def _extract_text_from_pdf_image(self, page):
        """
        Extract text from an image-based PDF page using OCR.

        Args:
            page: PyMuPDF Page object

        Returns:
            str: Extracted text from the page image
        """
        zoom = 2.0  # Higher zoom factor for better OCR accuracy
        matrix = fitz.Matrix(zoom, zoom)
        # alpha=False keeps the sample buffer at 3 bytes per pixel so it
        # matches the "RGB" mode handed to PIL below.
        pixmap = page.get_pixmap(matrix=matrix, alpha=False)

        image = Image.frombytes(
            "RGB",
            (pixmap.width, pixmap.height),
            pixmap.samples,
        )

        return pytesseract.image_to_string(image)

    def _extract_text_from_pptx(self):
        """
        Extract text from a PowerPoint presentation.

        Only shapes that expose a ``text`` attribute contribute to the result.

        Returns:
            str: Combined text content from all slides, joined by newlines
        """
        presentation = Presentation(self.file_path)

        return '\n'.join(
            shape.text
            for slide in presentation.slides
            for shape in slide.shapes
            if hasattr(shape, "text")
        )


def parse_json(json_string):
    """
    Parse a JSON string into a Python object.

    A single markdown code fence wrapping the payload (```` ``` ```` or
    ```` ```json ````, the label matched case-insensitively) is removed before
    parsing. Backticks that occur *inside* the JSON, for example in a string
    value, are left untouched.

    Args:
        json_string (str): JSON string to parse, optionally with markdown code block markers

    Returns:
        dict/list: Parsed JSON object

    Raises:
        json.JSONDecodeError: If the JSON string is invalid
    """
    fence = '```'
    text = json_string.strip()

    # Strip only the wrapping fence, never fence markers within the payload.
    if text.startswith(fence):
        text = text[len(fence):]
        if text[:4].lower() == 'json':
            text = text[4:]
    if text.endswith(fence):
        text = text[:-len(fence)]
    text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # Backward compatibility: earlier versions removed every fence marker
        # anywhere in the input, so retry that way before giving up.
        legacy = json_string.replace('```json', '').replace('```', '').strip()
        if legacy == text:
            raise
        return json.loads(legacy)

In [3]:
# Demo: run DocumentReader over one sample of each supported format.
# NOTE: despite its singular name, `file_path` holds a list of paths.
file_path = ['docs/sample.pptx', 'docs/soil.pdf', 'docs/sample.txt']
extracted_files = []  # extracted text, one entry per input path, in order
for file in file_path:
    # Raises FileNotFoundError if the sample file is missing.
    reader = DocumentReader(file)
    text = reader.extract_text()
    extracted_files.append(text)
# Bare expression so the notebook displays the list as the cell output.
extracted_files

['Various Image Segmentation Techniques:\nDifferent types of Image Segmentation Techniques\nThresholding technique segmentation\nHistogram based segmentation\nRegion based segmentation\nEdge based segmentation\nClustering based segmentation\nMorphological Transforms and\nTexture based segmentation approaches\nThresholding technique segmentation\nSegmentation algorithms based on thresholding approach are suitable for images where there is distinct difference between object and background.\nMain Goal: divide an image into two distinct regions (object and background) directly based on intensity values and their properties\nTypes: Global, Variable, Multiple\n\nOriginal coins image\n2)Histogram based segmentation\nHistogram of an image is a plot between intensity levels. \nDeep valleys are used to separate different peaks of histogram. \nHistogram peaks are tall, narrow, symmetric.\n3) Region based Segmentation\nThe region-based segmentation methods segments the image into various regions h

# YouTube Pipeline

In [None]:
import os
import torch
import librosa
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from typing import List, Tuple

In [None]:
class WhisperTranscriber:
    """Transcribe audio files with a Hugging Face Whisper checkpoint.

    Audio is loaded as 16 kHz mono, split into fixed-length chunks, and each
    chunk is transcribed independently; the pieces are then joined.
    """

    # Whisper's feature extractor pads/truncates every input to a 30-second
    # window, so longer chunks would lose audio.
    MAX_CHUNK_SECONDS = 30

    def __init__(self, model_name: str = "openai/whisper-tiny"):
        """
        Initialize the WhisperTranscriber with a specified model.

        Args:
            model_name (str): The name of the Whisper model to use.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using device: {self.device}")

        self.processor = WhisperProcessor.from_pretrained(model_name)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_name).to(self.device)
        # Inference only: switch off dropout and similar training behaviour.
        self.model.eval()

    @staticmethod
    def load_audio(file_path: str, target_sampling_rate: int = 16000) -> Tuple[np.ndarray, int]:
        """
        Load and resample audio to 16kHz if necessary using librosa.

        Args:
            file_path (str): Path to the audio file.
            target_sampling_rate (int): Target sampling rate (default: 16000).

        Returns:
            Tuple[np.ndarray, int]: Resampled audio array and sampling rate.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            RuntimeError: If librosa fails to load the file; the original
                exception is attached as ``__cause__``.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Audio file not found: {file_path}")

        try:
            audio, sr = librosa.load(file_path, sr=target_sampling_rate, mono=True)
        except Exception as e:
            raise RuntimeError(
                f"Error loading audio file: {e}. Make sure the file format is supported by librosa."
            ) from e
        return audio, sr

    @staticmethod
    def chunk_audio(audio: np.ndarray, chunk_length: float = 30, sampling_rate: int = 16000) -> List[np.ndarray]:
        """
        Split audio into chunks for long transcriptions.

        Args:
            audio (np.ndarray): The audio array.
            chunk_length (float): Length of each chunk in seconds (default: 30).
                Fractional values are accepted.
            sampling_rate (int): Sampling rate of the audio (default: 16000).

        Returns:
            List[np.ndarray]: List of audio chunks; the last one may be shorter.
                Empty input yields an empty list.

        Raises:
            ValueError: If the chunk size in samples is not positive.
        """
        chunk_size = int(chunk_length * sampling_rate)
        if chunk_size <= 0:
            # A non-positive step would either crash range() or silently
            # produce no chunks at all.
            raise ValueError(
                f"chunk_length * sampling_rate must be positive, "
                f"got chunk_length={chunk_length}, sampling_rate={sampling_rate}"
            )
        return [audio[start:start + chunk_size] for start in range(0, len(audio), chunk_size)]

    def transcribe_chunk(self, chunk: np.ndarray, sampling_rate: int = 16000) -> str:
        """
        Transcribe a single audio chunk.

        Args:
            chunk (np.ndarray): Audio chunk to transcribe.
            sampling_rate (int): Sampling rate of the audio (default: 16000).

        Returns:
            str: Transcribed text.
        """
        input_features = self.processor(chunk, sampling_rate=sampling_rate, return_tensors="pt").input_features
        input_features = input_features.to(self.device)

        # No gradients are needed for generation.
        with torch.no_grad():
            predicted_ids = self.model.generate(input_features)

        return self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    def transcribe(self, file_path: str, chunk_length: float = 30) -> str:
        """
        Transcribe long audio by chunking.

        Args:
            file_path (str): Path to the audio file.
            chunk_length (float): Length of each chunk in seconds (default: 30).
                Values above ``MAX_CHUNK_SECONDS`` are clamped, with a warning,
                so that no audio is dropped by the feature extractor.

        Returns:
            str: Full transcription of the audio, with chunk texts stripped of
                surrounding whitespace and joined by single spaces.
        """
        if chunk_length > self.MAX_CHUNK_SECONDS:
            import warnings

            warnings.warn(
                f"chunk_length={chunk_length}s exceeds Whisper's "
                f"{self.MAX_CHUNK_SECONDS}s input window; using "
                f"{self.MAX_CHUNK_SECONDS}s chunks instead.",
                stacklevel=2,
            )
            chunk_length = self.MAX_CHUNK_SECONDS

        audio, sampling_rate = self.load_audio(file_path)
        chunks = self.chunk_audio(audio, chunk_length, sampling_rate)

        # Decoded chunks may carry leading/trailing whitespace (or be empty for
        # silence); normalise so the joined text has single spaces only.
        pieces = (self.transcribe_chunk(chunk, sampling_rate).strip() for chunk in chunks)
        return " ".join(piece for piece in pieces if piece)

# Medium Articles

In [4]:
import requests
from bs4 import BeautifulSoup
import json

def get_medium_article_content(url, timeout=10):
    """
    Download a web article and return its title and paragraph text.

    The page's first ``<h1>`` is used as the title and every ``<p>`` inside
    the first ``<article>`` element as the body.

    Args:
        url (str): Address of the article to fetch.
        timeout (float | tuple): Seconds to wait for the server before giving
            up (passed to ``requests.get``). Without a timeout a stalled
            connection could block forever.

    Returns:
        str: ``"Title: ..."`` followed by the paragraphs, separated by blank
            lines and stripped of surrounding whitespace. On failure no
            exception is raised; instead a message starting with
            ``"Error fetching the article: "`` (request problems, including
            timeouts and HTTP error statuses) or
            ``"Error processing the article: "`` (anything else) is returned.
    """
    # A browser-like User-Agent is sent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect the pieces and join once instead of repeated string +=.
        pieces = []

        title = soup.find('h1')
        if title is not None:
            pieces.append(f"Title: {title.get_text()}\n\n")

        article = soup.find('article')
        if article is not None:
            pieces.extend(f"{p.get_text()}\n\n" for p in article.find_all('p'))

        return "".join(pieces).strip()

    except requests.exceptions.RequestException as e:
        return f"Error fetching the article: {str(e)}"
    except Exception as e:
        # Deliberate best-effort boundary: callers receive a string either way.
        return f"Error processing the article: {str(e)}"

In [5]:
# Demo: fetch one public Medium article. The call performs a live HTTP request;
# the returned string (article text or an "Error ..." message) is shown as the
# notebook cell output.
get_medium_article_content('https://medium.com/@pdx.lucasm/canvas-with-react-js-32e133c05258')

'Title: Canvas with React.js\n\nLucas Miranda\n\nFollow\n\n--\n\n12\n\nListen\n\nShare\n\nIn this article, we will see how to create a Canvas React component and a custom hook for extracting its logic, so we can just draw inside it like we usually draw in a regular canvas html element.\n\nThis article is based on Corey’s article “Animating a Canvas with React Hooks”. Any other sources and related contents are linked throughout this article.\n\nI am assuming that you already know canvas, but if you don’t know yet, I recommend this tutorial from MDN to you.\n\nIn order to see what we are doing, let’s create a new react app with create-react-app (feel free to skip this step if you are already familiar with React and create-react-app). You can start a new project by running npx create-react-app example or yarn create react-app example if you prefer yarn. If you open the project folder (example) in your code editor, you must get something like this:\n\nWe don’t need all these files, so we w