<a href="https://colab.research.google.com/github/Akhilesh348/Multilingual-Memes-Classification-Harmful-Non-Harmful-/blob/main/Global_Image_Feature_Encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch torchvision tqdm pillow


In [None]:
# Generate CLIP Embeddings for Two ZIP Folders

!pip install transformers torch torchvision pillow tqdm pandas --quiet

import os
import zipfile
import numpy as np
import pandas as pd
import torch
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel

# Directory that receives the unpacked archives; created up front so the
# per-class sub-folders can be placed inside it.
extract_dir = "/content/memes"
os.makedirs(extract_dir, exist_ok=True)

# ZIP archives to process, one per class
harmful_zip = "/content/Harmful Telugu memes.zip"
non_harmful_zip = "/content/Non-Harmful Telugu memes.zip"

def extract_zip(zip_path, folder_name):
    """Unpack a ZIP archive into a named sub-directory of ``extract_dir``.

    Args:
        zip_path: Path of the ZIP archive to unpack.
        folder_name: Name of the sub-directory (created if missing) under
            the module-level ``extract_dir`` that receives the contents.

    Returns:
        The path of the sub-directory the archive was extracted into.
    """
    target_dir = os.path.join(extract_dir, folder_name)
    # The destination is created before the archive is opened.
    os.makedirs(target_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(path=target_dir)
    return target_dir

# Unpack each archive into its own sub-directory of extract_dir
harmful_folder = extract_zip(harmful_zip, "harmful")
non_harmful_folder = extract_zip(non_harmful_zip, "non_harmful")


# Run on the GPU when torch reports one, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained CLIP checkpoint, move it to the chosen device, and
# load the matching preprocessing pipeline.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model = model.to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Function: encode images
def encode_images(folder_path, label):
    """Compute image features for every image directly inside a folder.

    Only the top level of ``folder_path`` is scanned (no recursion), and only
    entries whose names end in ``.jpg``, ``.jpeg`` or ``.png`` (case
    insensitive) are considered. Each image is converted to RGB, run through
    the module-level ``processor`` and passed to
    ``model.get_image_features`` with gradients disabled.

    Files are processed in sorted name order so that repeated runs produce
    the rows in the same order. An image that cannot be opened or encoded is
    reported on stdout and skipped; it contributes nothing to the result.

    Args:
        folder_path: Directory containing the image files.
        label: Label recorded for every successfully encoded image; also
            shown in the progress-bar description.

    Returns:
        A tuple ``(embeddings, labels, names)`` of three equal-length lists:
        the flattened NumPy feature vector, the label, and the file name of
        each successfully encoded image.
    """
    embeddings, labels, names = [], [], []
    # os.listdir yields entries in arbitrary order; sorting keeps the output
    # row order stable across runs and machines.
    files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    )

    for img_name in tqdm(files, desc=f"Encoding {label}"):
        img_path = os.path.join(folder_path, img_name)
        try:
            # The context manager closes the underlying file promptly;
            # convert() returns an independent, fully loaded image.
            with Image.open(img_path) as raw_image:
                image = raw_image.convert("RGB")
            inputs = processor(images=image, return_tensors="pt").to(device)
            with torch.no_grad():
                features = model.get_image_features(**inputs)
            features = features.cpu().numpy().flatten()
        except Exception as e:
            # Deliberately best-effort: one unreadable or corrupt file must
            # not abort the whole run.
            print(f" Skipping {img_name}: {e}")
            continue
        # Record the three parallel entries only after encoding succeeded,
        # so the lists can never get out of step.
        embeddings.append(features)
        labels.append(label)
        names.append(img_name)
    return embeddings, labels, names

# Encode both folders
harmful_emb, harmful_labels, harmful_names = encode_images(harmful_folder, "harmful")
non_harmful_emb, non_harmful_labels, non_harmful_names = encode_images(non_harmful_folder, "non_harmful")

# Combine & Save
# Join the two lists of per-image vectors before stacking. Stacking the two
# lists as separate blocks fails with a shape mismatch when exactly one of
# them is empty, and yields a meaningless (2, 0) array when both are.
all_vectors = harmful_emb + non_harmful_emb
if not all_vectors:
    # Fail before writing any output file rather than saving empty results.
    raise RuntimeError(
        "No images were encoded: check that the extracted folders contain "
        ".jpg/.jpeg/.png files at their top level."
    )
all_embeddings = np.vstack(all_vectors)
all_labels = harmful_labels + non_harmful_labels
all_names = harmful_names + non_harmful_names

# Save as .npy (row i of each file describes the same image)
np.save("clip_embeddings.npy", all_embeddings)
np.save("clip_labels.npy", np.array(all_labels))
np.save("clip_filenames.npy", np.array(all_names))

# Save as .csv: one column per embedding dimension, then label and filename
df = pd.DataFrame(all_embeddings)
df["label"] = all_labels
df["filename"] = all_names
df.to_csv("clip_embeddings.csv", index=False)

print("\n Embeddings generated successfully!")
print(" Saved files:")
print(" - clip_embeddings.npy")
print(" - clip_labels.npy")
print(" - clip_filenames.npy")
print(" - clip_embeddings.csv")
print("\nEmbedding shape:", all_embeddings.shape)
