## Extract offensive images from another dataset

https://www.kaggle.com/datasets/parthplc/facebook-hateful-meme-dataset

In [1]:
!ls ./facebook-hateful-meme-dataset/


ls: cannot access './facebook-hateful-meme-dataset/': No such file or directory


In [None]:

from google.colab import files
uploaded = files.upload()

!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!pip install kaggle
!kaggle datasets download -d parthplc/facebook-hateful-meme-dataset

!unzip facebook-hateful-meme-dataset.zip -d facebook-hateful-meme-dataset

# Load the JSONL training file (train.jsonl) for text-based data
import pandas as pd
import json

# Load training data from JSONL
# Correct path to train.jsonl
train_data = []
with open('./facebook-hateful-meme-dataset/data/train.jsonl', 'r') as f:
    for line in f:
        train_data.append(json.loads(line))

# Convert to DataFrame
train_df = pd.DataFrame(train_data)

# View first few rows of the DataFrame
train_df.head()


# Load images
import os
from PIL import Image

# Set the base directory for images
image_dir = './facebook-hateful-meme-dataset/data/'

# Load and display the first image
sample_image = Image.open(os.path.join(image_dir, train_df.iloc[0]['img']))
sample_image.show()




In [None]:
# matplotlib is not imported by any earlier cell, so import it here;
# without this a fresh kernel raises NameError on `plt`.
import matplotlib.pyplot as plt

# First 20 training rows with label == 1
offensive_images = train_df[train_df['label'] == 1].head(20)

# 4 rows x 5 columns grid, one panel per image
fig, axes = plt.subplots(4, 5, figsize=(20, 10))

for ax, img_rel_path in zip(axes.flat, offensive_images['img']):
    img_path = os.path.join(image_dir, img_rel_path)
    # The context manager closes the file once imshow has read the pixels.
    with Image.open(img_path) as img:
        ax.imshow(img)

# Hide the axes on every panel, including any left empty when fewer than
# 20 rows match.
for ax in axes.flat:
    ax.axis('off')

plt.show()


In [None]:
# Preview the first five rows of the training DataFrame
train_df.head(n=5)


In [None]:

# Print the column names, non-null counts and dtypes of the training DataFrame
train_df.info()


In [None]:
# Summary statistics for every column; include='all' covers non-numeric columns too
train_df.describe(include='all')

In [None]:
# Number of rows per distinct value of the 'label' column
train_df['label'].value_counts()

In [None]:
# Rich display of the first two rows
display(train_df.head(2))

In [None]:
# Data-quality check on the 'text' column: count missing values and
# empty strings separately.
text_column = train_df['text']

# Rows where 'text' is NaN
missing_texts = text_column.isna().sum()
print(f"Number of missing (NaN) values in 'text' column: {missing_texts}")

# Rows where 'text' is exactly the empty string
empty_texts = text_column.eq('').sum()
print(f"Number of empty strings in 'text' column: {empty_texts}")


In [None]:
import os
import shutil

import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Destination folder on Google Drive (created if it does not exist yet)
output_dir = '/content/drive/MyDrive/curate138dataset/offensive_images/'
os.makedirs(output_dir, exist_ok=True)

# Take the first 150 rows whose label is 1
offensive_images = train_df[train_df['label'] == 1].head(150)

# One dict per copied image; turned into the CSV below
csv_data = []

for _, record in offensive_images.iterrows():
    # File name is the last '/'-separated component of the 'img' value
    image_name = record['img'].split('/')[-1]
    img_path = os.path.join(image_dir, record['img'])
    output_path = os.path.join(output_dir, image_name)

    # Copy the image into the Drive folder
    shutil.copy(img_path, output_path)

    # Keep the dataset-provided text and label next to the copied file's path
    csv_data.append({
        'image_name': image_name,
        'text': record['text'],
        'label': record['label'],
        'image_path': output_path
    })

# Build the table and write it next to the image folder
df_csv = pd.DataFrame(csv_data)
csv_path = '/content/drive/MyDrive/curate138dataset/offensive_images_text.csv'
df_csv.to_csv(csv_path, index=False)

print(f'Successfully copied {len(offensive_images)} images and saved CSV to {csv_path}')


## Optical Character Recognition (OCR)
Use OCR to extract the text from the images and build a CSV with the columns image_name, sentence, label, and image_path.

In [None]:
!apt-get install tesseract-ocr
!pip install pytesseract Pillow


In [None]:
import pytesseract
import pandas as pd
import os
from PIL import Image

# Path to the offensive images folder
offensive_images_dir = '/content/drive/MyDrive/curate138dataset/offensive_images/'

# OCR results get their own file. Writing them to offensive_images_text.csv
# would overwrite the CSV produced by the image-copy step, whose 'text'
# column (the dataset-provided text) is what the later cells read.
ocr_csv_path = '/content/drive/MyDrive/curate138dataset/offensive_images_ocr_text.csv'

# One dict per image; converted to a DataFrame below
data = []

# sorted() makes the row order deterministic (os.listdir order is arbitrary)
for img_name in sorted(os.listdir(offensive_images_dir)):
    if not img_name.endswith('.png'):
        continue

    img_path = os.path.join(offensive_images_dir, img_name)

    # The context manager closes the image file as soon as OCR is done
    with Image.open(img_path) as img:
        sentence = pytesseract.image_to_string(img)

    data.append({
        'image_name': img_name,
        'sentence': sentence.strip(),  # Drop leading/trailing whitespace from the OCR output
        'label': 1,  # Every image in this folder was selected with label == 1
        'image_path': img_path
    })

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data)

# Save the OCR table to its own CSV
df.to_csv(ocr_csv_path, index=False)

print("CSV file saved successfully.")


Did not proceed with OCR because much of the text was lost in the process. Instead, I will keep the accurate text provided with the dataset.

In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt

# Folder on Drive that holds the copied images
offensive_images_dir = '/content/drive/MyDrive/curate138dataset/offensive_images/'

# Collect the names of the PNG files in that folder
image_files = []
for entry in os.listdir(offensive_images_dir):
    if entry.endswith('.png'):
        image_files.append(entry)

# Preview up to 20 images on a 4x5 grid
plt.figure(figsize=(20, 10))

preview_names = image_files[:20]
for i, img_name in enumerate(preview_names):
    img_path = os.path.join(offensive_images_dir, img_name)

    # A file that cannot be opened is reported and skipped; its grid slot
    # is simply left unused.
    try:
        img = Image.open(img_path)
        plt.subplot(4, 5, i + 1)
        plt.imshow(img)
        plt.axis('off')
    except Exception as e:
        print(f"Could not open {img_name}: {e}")

plt.show()


In [None]:
import pandas as pd

# Read the saved CSV back from Drive
csv_path = '/content/drive/MyDrive/curate138dataset/offensive_images_text.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows
df.head(n=5)


In [None]:
# Print the column names, non-null counts and dtypes of df
df.info()


# Loading curated images

In [19]:
from google.colab import drive
# Mount Google Drive at /content/drive so the files under MyDrive are reachable
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Manually reviewed the images and removed those that are too vague, to refine the data before combining it with the main dataset.

In [20]:
import os

# File names picked out during the manual review, to be deleted from Drive
images_to_remove = [
    '93072.png', '92016.png', '91704.png', '63710.png', '54817.png',
    '36081.png', '34791.png', '02849.png', '01524.png', '06483.png', '08593.png'
]

# Folder on Drive that holds the copied images
offensive_images_dir = '/content/drive/MyDrive/curate138dataset/offensive_images/'

# Delete each listed file, reporting the ones that are already gone
for img_name in images_to_remove:
    img_path = os.path.join(offensive_images_dir, img_name)
    if not os.path.exists(img_path):
        print(f"{img_name} not found")
        continue
    os.remove(img_path)
    print(f"Removed {img_name}")


93072.png not found
92016.png not found
91704.png not found
63710.png not found
54817.png not found
36081.png not found
34791.png not found
02849.png not found
01524.png not found
06483.png not found
08593.png not found


In [21]:
import pandas as pd

# CSV that lists the copied images
csv_path = '/content/drive/MyDrive/curate138dataset/offensive_images_text.csv'
df = pd.read_csv(csv_path)

# Keep only the rows whose image_name is not in the removal list
keep_mask = ~df['image_name'].isin(images_to_remove)
df = df[keep_mask]

# Overwrite the CSV with the filtered rows
df.to_csv(csv_path, index=False)

print("Updated CSV file saved.")


Updated CSV file saved.


In [22]:
# Select the rows whose 'text' value is NaN or an empty string
text_is_missing = df['text'].isna()
text_is_blank = df['text'].eq('')
empty_sentences = df[text_is_missing | text_is_blank]

# Show the selected rows (an empty table means every row has text)
empty_sentences


Unnamed: 0,image_name,text,label,image_path


# To match the main dataset, I will proceed to standardise and preprocess

In [23]:
# Load the current dataset
csv_path = '/content/drive/MyDrive/curate138dataset/offensive_images_text.csv'
df = pd.read_csv(csv_path)

# Rename 'text' to 'sentence'
df_standardized = df.rename(columns={
    'text': 'sentence',
})

# Keep exactly these four columns, in this order; any other column in the CSV
# is dropped. The explicit .copy() gives an independent frame, so the column
# assignments in later cells should not trigger pandas' SettingWithCopyWarning.
df_standardized = df_standardized[['image_name', 'sentence', 'label', 'image_path']].copy()

# display() is needed here: a bare expression only renders when it is the
# last line of a cell.
display(df_standardized.head())

# Save the standardized dataset to a new CSV file
df_standardized.to_csv('/content/drive/MyDrive/curate138dataset/standardized_offensive_images_text.csv', index=False)

print("Dataset standardized and saved successfully.")


Dataset standardized and saved successfully.


In [24]:
# Preview the first rows. display() is required because a bare expression
# that is not the last line of a cell renders nothing.
display(df_standardized.head())

# Rich display of the whole frame (pandas abbreviates long frames on its own)
df_standardized


    image_name                                           sentence  label  \
0    79351.png                           jew mad? get fuhrerious!      1   
1    25489.png   brother... a day without a blast is a day wasted      1   
2    72640.png  is bribing muslims for liberal votes justin tr...      1   
3    93547.png  d.j. osama spin laden droppin' beats like the ...      1   
4    74386.png  we said we would never forget why are you voti...      1   
..         ...                                                ...    ...   
134  20759.png      this steak is so black it stole my television      1   
135  62435.png                 wing wing wing herro? thats racist      1   
136  62849.png                 cumshot on jewish girl face - 1943      1   
137  62081.png  well let me tell you something brother fuck ni...      1   
138  85923.png  ford owner blew a rod wasnt talking about his ...      1   

                                            image_path  
0    /content/drive/MyDrive/cu

In [14]:
!pip install contractions
import contractions
import re




In [25]:
def pB_sentence(text):
    """Standardize one sentence for the text dataset.

    Steps, in order: expand contractions, lowercase, drop every character
    that is not a lowercase letter, digit or whitespace, collapse runs of
    whitespace into a single space, and trim both ends.

    Args:
        text: The raw sentence. Missing values (None / NaN, e.g. a blank CSV
            cell) are accepted; any other non-string value is converted
            with ``str``.

    Returns:
        The cleaned sentence, or an empty string for a missing value.
    """
    if not isinstance(text, str):
        # A missing value would otherwise crash the string operations below.
        if pd.isna(text):
            return ''
        text = str(text)

    text = contractions.fix(text)  # Expand contractions
    text = text.lower()  # Convert to lowercase
    # Keep only a-z, 0-9 and whitespace. All punctuation is removed,
    # including exclamation marks and question marks.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text)  # Replace runs of whitespace with a single space
    return text.strip()  # Remove leading and trailing whitespace

# Run every sentence through pB_sentence and store the result back in the column
cleaned_sentences = df_standardized['sentence'].apply(pB_sentence)
df_standardized['sentence'] = cleaned_sentences

# Print the first five cleaned sentences as a spot check
print(df_standardized['sentence'].head(5))

0                               jew mad get fuhrerious
1        brother a day without a blast is a day wasted
2    is bribing muslims for liberal votes justin tr...
3    dj osama spin laden droppin beats like the twi...
4    we said we would never forget why are you voti...
Name: sentence, dtype: object


## Image

In [27]:
from PIL import Image


In [28]:
def get_image_details(image_path):
    """Return the format, size and mode of the image at ``image_path``.

    Args:
        image_path: Path of the image file, passed to ``Image.open``.

    Returns:
        A ``(format, size, mode)`` tuple read from the opened image's
        ``format``, ``size`` and ``mode`` attributes. If anything fails,
        the error is printed (not raised) and ``(None, None, None)`` is
        returned so one bad file does not stop a whole-column ``apply``.
    """
    try:
        # The context manager closes the file handle after the attributes are
        # read, so handles are not leaked when this runs over many images.
        with Image.open(image_path) as img:
            return img.format, img.size, img.mode
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None, None, None

# Look up (format, size, mode) for every image path, then split the tuples
# into three parallel sequences, one per new column
image_details = df_standardized['image_path'].apply(get_image_details)
formats, sizes, modes = zip(*image_details)

df_standardized['image_format'] = formats
df_standardized['image_size'] = sizes
df_standardized['image_mode'] = modes

# Tally how many images there are of each format and print the tally
image_format_counts = df_standardized['image_format'].value_counts()
print("\nImage format counts:")
print(image_format_counts)


Image format counts:
image_format
PNG    139
Name: count, dtype: int64


In [29]:
# Split the rows by whether the recorded image mode is 'RGB'
is_rgb = df_standardized['image_mode'] == 'RGB'
rgb_images = df_standardized[is_rgb]
non_rgb_images = df_standardized[~is_rgb]

# Report the size of each group
print(f"Number of RGB images: {len(rgb_images)}")
print(f"Number of non-RGB images: {len(non_rgb_images)}")


Number of RGB images: 139
Number of non-RGB images: 0


In [30]:
# Write the processed DataFrame, including the image detail columns, back
# to the standardized CSV on Google Drive
standardized_csv_path = '/content/drive/MyDrive/curate138dataset/standardized_offensive_images_text.csv'
df_standardized.to_csv(standardized_csv_path, index=False)

print("CSV file updated successfully.")


CSV file updated successfully.
