<a href="https://colab.research.google.com/github/Akhilesh348/Multilingual-Memes-Classification-Harmful-Non-Harmful-/blob/main/Text_Encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install pytesseract
!pip install transformers

In [None]:
import os
import zipfile
# Archives expected in the Colab workspace. Each one is unpacked into a
# folder under /content named after the archive without its ".zip" suffix.
zip_files = [
    "/content/Harmful Telugu memes.zip",
    "/content/Non-Harmful Telugu memes.zip",
]

for zip_path in zip_files:
    # "/content/Foo.zip" -> "Foo"
    folder_name = os.path.splitext(os.path.basename(zip_path))[0]
    extract_to = f"/content/{folder_name}"
    # exist_ok=True lets the cell be re-run without failing on an existing folder.
    os.makedirs(extract_to, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f" Extracted {zip_path} → {extract_to}")

In [9]:
import sys
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
from typing import Generator, Optional, Tuple
import csv

try:
    from PIL import Image
except ImportError:
    print("Pillow is required. Install with: pip install pillow")
    raise

try:
    import pytesseract
except ImportError:
    print("pytesseract is required. Install with: pip install pytesseract")
    raise

try:
    import cv2
    import numpy as np
    HAS_OPENCV = True
    HAS_CUDA = cv2.cuda.getCudaEnabledDeviceCount() > 0
except ImportError:
    HAS_OPENCV = False
    HAS_CUDA = False
    print("Warning: opencv-python not found. Preprocessing will be skipped.")

# File suffixes treated as images by find_images(); compared against the
# lower-cased suffix, so matching is case-insensitive.
IMAGE_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif', '.webp'}

# Folders scanned recursively for images by main(). A missing folder is
# reported with a warning and skipped.
SRC_FOLDERS = [
    Path("/content/Harmful Telugu memes"),
    Path("/content/Non-Harmful Telugu memes"),
]
# Directory created by main(); the combined OCR CSV ("test.csv") is written here.
OUT_DIR = Path("/content/output")
# Passed as the `lang` argument to pytesseract.image_to_string
# (presumably Telugu + English traineddata — confirm both are installed).
LANG = "tel+eng"
# When True, images go through preprocess_image_for_ocr() before OCR;
# when False, they are opened with Pillow and passed to Tesseract as-is.
USE_PREPROCESS = True
# Values greater than 1 make main() fan OCR out over a ProcessPoolExecutor
# with this many workers; otherwise images are processed serially.
WORKERS = 2


def preprocess_image_for_ocr(image_path: Path):
    """Load an image and binarize it for OCR.

    The image is converted to grayscale, smoothed with a bilateral filter
    (d=9, sigmaColor=75, sigmaSpace=75) and then adaptively thresholded.
    The grayscale/filter stage runs through OpenCV's CUDA module when
    ``HAS_CUDA`` is set; if that path raises, the same two steps are redone
    on the CPU so both paths feed an equally filtered image to the threshold.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The array produced by ``cv2.adaptiveThreshold`` (Gaussian method,
        binary output, block size 11, constant 2).

    Raises:
        RuntimeError: If opencv-python is not available.
        ValueError: If OpenCV cannot read the image.
    """
    if not HAS_OPENCV:
        raise RuntimeError("opencv-python required for preprocessing.")

    img = cv2.imread(str(image_path))
    if img is None:
        raise ValueError(f"Failed to read image: {image_path}")

    # Stays None until one of the two paths below has produced a result.
    gray = None

    if HAS_CUDA:
        try:
            gpu_img = cv2.cuda_GpuMat()
            gpu_img.upload(img)
            gpu_gray = cv2.cuda.cvtColor(gpu_img, cv2.COLOR_BGR2GRAY)
            gpu_blur = cv2.cuda.bilateralFilter(gpu_gray, 9, 75, 75)
            gray = gpu_blur.download()
        except Exception as e:
            # Best-effort GPU use: report and fall through to the CPU path.
            print(f" CUDA preprocessing failed for {image_path.name}, using CPU: {e}")
            gray = None

    if gray is None:
        # CPU path, also used as the fallback after a CUDA failure. It must
        # include the bilateral filter, otherwise the fallback would
        # threshold an unfiltered image.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.bilateralFilter(gray, 9, 75, 75)

    th = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                               cv2.THRESH_BINARY, 11, 2)
    return th


def extract_text_from_image(image_path: Path, use_preprocess: bool, lang: str = 'eng') -> str:
    """Run Tesseract on a single image and return the stripped text.

    Args:
        image_path: Image file to recognize.
        use_preprocess: If true, OCR the output of
            ``preprocess_image_for_ocr``; otherwise open the file with
            Pillow and OCR it unchanged.
        lang: Language string forwarded to ``pytesseract.image_to_string``.

    Returns:
        The recognized text with leading/trailing whitespace removed.

    Raises:
        RuntimeError: Wraps any exception raised while loading,
            preprocessing or recognizing the image; the original exception
            is attached as the cause.
    """
    try:
        if not use_preprocess:
            # Raw path: let Pillow decode the file and hand it to Tesseract.
            with Image.open(image_path) as pil_img:
                raw_text = pytesseract.image_to_string(pil_img, lang=lang)
            return raw_text.strip()

        # Preprocessed path: OCR the binarized image.
        binarized = preprocess_image_for_ocr(image_path)
        raw_text = pytesseract.image_to_string(binarized, lang=lang)
        return raw_text.strip()
    except Exception as exc:
        raise RuntimeError(f"OCR failed for {image_path.name}: {exc}") from exc


def find_images(folder: Path) -> Generator[Path, None, None]:
    """Yield every image file under ``folder``, searched recursively.

    Entries are visited in sorted path order. An entry is yielded when it
    is a regular file whose lower-cased suffix is in ``IMAGE_EXTENSIONS``.
    """
    entries = sorted(folder.rglob('*'))
    yield from (
        entry
        for entry in entries
        if entry.is_file() and entry.suffix.lower() in IMAGE_EXTENSIONS
    )


def process_single_image(args_tuple: Tuple[Path, bool, str]) -> Tuple[str, str, Optional[str]]:
    """OCR one image without ever raising an OCR error to the caller.

    Args:
        args_tuple: ``(image path, use_preprocess flag, language string)``,
            packed into one tuple so the function can be submitted to an
            executor with a single argument.

    Returns:
        ``(file name, text, None)`` on success, or
        ``(file name, "", error message)`` if OCR raised.
    """
    img_path, use_preprocess, lang = args_tuple
    try:
        recognized = extract_text_from_image(img_path, use_preprocess, lang)
        outcome = (img_path.name, recognized, None)
    except Exception as exc:
        # Failures are reported as data so one bad image does not abort a batch.
        outcome = (img_path.name, "", str(exc))
    return outcome


def _gather_images(folders) -> list:
    """Collect image paths from every existing folder, printing per-folder counts."""
    images = []
    for folder in folders:
        if not folder.exists():
            print(f" Warning: Folder not found - {folder}")
            continue
        folder_images = list(find_images(folder))
        images.extend(folder_images)
        print(f"Found {len(folder_images)} images in {folder.name}")
    return images


def _report_result(name, error) -> None:
    """Print the one-line progress message for a finished image."""
    if error:
        print(f" Error: {name} - {error}")
    else:
        print(f" OCR done: {name}")


def _run_ocr(images) -> list:
    """OCR all images and return ``(name, text)`` pairs in the order of ``images``."""
    tasks = [(img, USE_PREPROCESS, LANG) for img in images]
    # Pre-sized so each result lands in the slot of the image it belongs to,
    # regardless of which worker finishes first.
    results = [None] * len(tasks)

    if WORKERS > 1:
        with ProcessPoolExecutor(max_workers=WORKERS) as executor:
            futures = {
                executor.submit(process_single_image, task): slot
                for slot, task in enumerate(tasks)
            }
            # as_completed keeps progress output live; the slot index keeps
            # the stored order independent of completion order.
            for future in as_completed(futures):
                name, text, error = future.result()
                _report_result(name, error)
                results[futures[future]] = (name, text)
    else:
        for slot, task in enumerate(tasks):
            name, text, error = process_single_image(task)
            _report_result(name, error)
            results[slot] = (name, text)

    return results


def _write_results_csv(csv_path, results) -> None:
    """Write the ``imageid,text`` CSV, one row per OCR result."""
    with open(csv_path, "w", newline='', encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["imageid", "text"])
        for idx, (_, text) in enumerate(results, start=1):
            image_id = f"img{idx}"
            # Flatten multi-line OCR output so each image is one CSV line.
            writer.writerow([image_id, text.replace("\n", " ").strip()])


def main():
    """Run OCR over all images in ``SRC_FOLDERS`` and save the combined CSV.

    Steps: verify Tesseract is callable, create ``OUT_DIR``, collect images
    from each source folder, OCR them (in a process pool when
    ``WORKERS > 1``), and write ``OUT_DIR / "test.csv"`` with columns
    ``imageid`` and ``text``. Ids are ``img1``, ``img2``, ... in image
    discovery order (folder order of ``SRC_FOLDERS``, sorted paths within
    each folder), so they are stable across runs. Images whose OCR failed
    are kept with empty text.

    Exits with status 1 if Tesseract is unavailable and status 0 if no
    images are found.
    """
    try:
        pytesseract.get_tesseract_version()
    except Exception as e:
        print("Error: Tesseract not found. Please install it.")
        print(f"Details: {e}")
        sys.exit(1)

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    images = _gather_images(SRC_FOLDERS)
    if not images:
        print(" No images found in provided folders.")
        sys.exit(0)

    print(f"\n Total images: {len(images)}")
    print(f" Language: {LANG}, Preprocess: {USE_PREPROCESS}")
    print(f" Using {'GPU' if HAS_CUDA else 'CPU'} for preprocessing.\n")

    results = _run_ocr(images)

    # Save combined CSV
    csv_path = OUT_DIR / "test.csv"
    _write_results_csv(csv_path, results)

    print(f"\n Combined CSV file saved at: {csv_path}")


if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

import numpy as np

# Load model and tokenizer
model_name = "ai4bharat/indic-bert"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # Set model to evaluation mode

# Width of the encoder's output vectors, read from the model config so the
# fallback vector always matches the real embeddings.
hidden_size = model.config.hidden_size

# Load the CSV written by the OCR cell (OUT_DIR / "test.csv").
df = pd.read_csv("/content/output/test.csv")
print("Loaded rows:", len(df))
print(df.head())

# Images with no recognized text are stored as empty cells, which pandas
# reads back as NaN; normalize them to empty strings before encoding.
texts = df['text'].fillna("").astype(str)

# Encode Telugu text
embeddings_list = []
blank_rows = 0

for text in tqdm(texts, desc="Encoding texts"):
    if not text.strip():
        # No OCR text for this image: keep the all-zero placeholder vector
        # without going through the tokenizer.
        blank_rows += 1
        embeddings_list.append(np.zeros(hidden_size, dtype=np.float32))
        continue
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            outputs = model(**inputs)
        # Mean pooling over the token axis of the single-item batch
        emb = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
        embeddings_list.append(emb)
    except Exception as e:
        print(f"Error encoding: {text}\n{e}")
        embeddings_list.append(np.zeros(hidden_size, dtype=np.float32))  # fallback vector

if blank_rows:
    print(f"Rows with empty text (zero vector used): {blank_rows}")

embeddings_array = np.vstack(embeddings_list)
print("Final embeddings shape:", embeddings_array.shape)

# Save as NumPy file
np.save("text_embeddings.npy", embeddings_array)

# Save as CSV (with image_id for reference)
emb_df = pd.DataFrame(embeddings_array)
emb_df.insert(0, "imageid", df['imageid'])
emb_df.to_csv("text_embeddings.csv", index=False)

print(" Saved embeddings to text_embeddings.csv and text_embeddings.npy")
