In [None]:
import pandas as pd

# Load the balanced dataset and discard every row that has a missing value.
df = pd.read_csv('/content/balanced_dataset.csv').dropna()

In [None]:
import pandas as pd

# Sequential 1-based identifier for every remaining row.
# The Series is built on df's own index: pd.concat(axis=1) aligns on index
# labels, and after dropna() those labels can have gaps. A default 0..n-1
# index would attach IDs to the wrong rows and introduce all-NaN rows.
id_column = pd.Series(range(1, len(df) + 1), index=df.index, name='ID')

# Put the ID column in front of the original columns.
df = pd.concat([id_column, df], axis=1)

In [None]:
# Per-column count of missing values.
df.isnull().sum()

Unnamed: 0,0
ID,0
url,0
text,0
images,0
top_img,0
authors,0
label,0
word_count,0
domain,0
tokens,0


## Image Extraction and download




In [None]:
import os
import requests
import pandas as pd
from PIL import Image
from io import BytesIO

# One output folder per class label; both are created up front if missing.
label_0_dir, label_1_dir = 'downloaded_images/label_0', 'downloaded_images/label_1'
for label_dir in (label_0_dir, label_1_dir):
    os.makedirs(label_dir, exist_ok=True)

# Text file that receives one SUCCESS/FAILED line per attempted download.
log_file = 'download_log.txt'

def is_image_corrupted(img_data):
    """Report whether raw bytes fail to decode as a valid image.

    Args:
        img_data: Raw image bytes (e.g. the body of an HTTP response).

    Returns:
        True when opening or verifying the bytes with PIL raises IOError or
        SyntaxError, False when verification succeeds.
    """
    try:
        # verify() checks file integrity without decoding the full image.
        Image.open(BytesIO(img_data)).verify()
    except (IOError, SyntaxError):
        return True
    return False


def download_image(url, save_path):
    """Download an image, validate it, and write it to disk.

    Args:
        url: Address of the image to fetch.
        save_path: File path the raw image bytes are written to.

    Returns:
        True if the image was fetched, passed validation and was saved;
        False on any failure (network error, HTTP error status, invalid
        image data, file-system error). No exception is propagated.
    """
    try:
        response = requests.get(url, timeout=50)
        # Reject 4xx/5xx replies so an error page or placeholder body is never
        # stored as if it were the article image.
        response.raise_for_status()
        img_data = response.content

        if is_image_corrupted(img_data):
            return False

        # Save the image to the specified path
        with open(save_path, 'wb') as f:
            f.write(img_data)
        return True
    except Exception:
        # Best-effort download: network, HTTP-status and file-system errors
        # are all reported to the caller as a plain failure.
        return False


In [None]:

# Index labels of rows whose image could not be downloaded or validated.
# They are collected first and dropped once after the loop, so the DataFrame
# is never mutated while df.iterrows() is walking it (and we avoid one full
# drop per failure).
failed_indices = []

# Open log file in append mode to store the status of each image download
with open(log_file, 'a') as log:
    log.write("Image Download Log:\n")
    log.write("=" * 50 + "\n")

    # Loop through the 'top_img' column and download images
    for idx, row in df.iterrows():
        img_url = row['top_img']
        img_id = row['ID']

        # Label 0 goes to label_0_dir; every other label value goes to label_1_dir.
        save_dir = label_0_dir if row['label'] == 0 else label_1_dir

        # The file name is derived from the row's ID column (not the index).
        image_path = os.path.join(save_dir, f"image_{img_id}.jpg")

        if download_image(img_url, image_path):
            status = f"SUCCESS: {img_url} -> ID: {img_id}"
        else:
            failed_indices.append(idx)
            status = f"FAILED: {img_url} -> ID: {img_id} (Dropped row)"

        # Same message goes to the log file and to the cell output.
        log.write(status + "\n")
        print(status)

    log.write("=" * 50 + "\n")
    log.write("End of Log\n")

# Remove every row whose image download failed or whose image was corrupted.
df = df.drop(index=failed_indices)

print("Image download complete, rows with failed images have been dropped, and log file created.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
SUCCESS: https://media.okmagazine.com/brand-img/MqHkwJTM4/1200x628/2017/04/HeaderKim.jpg -> ID: 1132
SUCCESS: https://media.radaronline.com/brand-img/C8M9n_QfS/1200x628/2018/05/Sofia-Richie-Scott-Disick-Kourtney-Kardashian-Feud-pp.jpg -> ID: 1133
SUCCESS: https://api.photon.aremedia.net.au/wp-content/uploads/sites/17/2024/05/cropped-NewIdeaFavIcon.png?fit=32%2C32 -> ID: 1134
FAILED: https://www.independent.co.uk/img/shortcut-icons/icon-96x96.png -> ID: 1135 (Dropped row)
SUCCESS: https://imgix.bustle.com/uploads/getty/2017/10/6/85ee6531-3c89-45b9-ba65-c7a49bb7951a-getty-844545642.jpg?w=1200&h=630&fit=crop&crop=faces&fm=jpg -> ID: 1136
SUCCESS: https://assets.teenvogue.com/photos/5a04a2aa70ae4e37c2ec6545/16:9/w_1280,c_limit/FB.jpg -> ID: 1137
SUCCESS: https://imaging.broadway.com/images/social/w630/21074-0.jpg -> ID: 1138
SUCCESS: https://cdn.thehollywoodgossip.com/uploads/2017/06/mehgan-james-in-a-yellow-jersey-scaled.jpg

In [None]:
import shutil
from google.colab import files

# Folder to archive and the file name of the resulting zip.
folder_to_zip = '/content/downloaded_images'
output_zip = 'downloaded_images.zip'

# make_archive appends ".zip" itself, so it is given the name without it.
archive_base = output_zip.replace('.zip', '')
shutil.make_archive(archive_base, format='zip', root_dir=folder_to_zip)

# Trigger a browser download of the archive from Colab.
files.download(output_zip)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## *Pre-processing of the images*

#### Loading Image from drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the unzip cell below reads its archive
# from /content/drive/MyDrive.
drive.mount('/content/drive')


Mounted at /content/drive


#### Unzip images
*   (To access the images as a folder)




In [None]:
import zipfile
import os

# Archive of the manually curated images on Drive, and the local folder it is
# unpacked into.
zip_file_path = '/content/drive/MyDrive/ANN_Project_data/downloaded_images_manual.zip'
extract_dir = '/content/downloaded_images/'

# Make sure the destination exists, then unpack the whole archive into it.
os.makedirs(extract_dir, exist_ok=True)
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(path=extract_dir)

# Show the top-level entries as a quick confirmation of the extraction.
extracted_files = os.listdir(extract_dir)
extracted_files

['downloaded_images_manual']

#### pre-processing of the images

* Only resizing was performed, as the images mostly contain no noise
*   Note: augmentation will be performed based on model requirements
* **image after preprocessing : {'width': 224, 'height': 224, 'color_mode': 'RGB', 'channels': 3}**



In [None]:
import os
from PIL import Image, ImageOps
import pandas as pd

def preprocess_image(image_path, target_size=(224, 224)):
    """Load an image and letterbox it onto a black canvas of ``target_size``.

    The image is converted to RGB, scaled down with LANCZOS resampling so it
    fits inside ``target_size`` while keeping its aspect ratio, and then padded
    with black borders so the result is exactly ``target_size``.

    Args:
        image_path: Path of the image file to read.
        target_size: ``(width, height)`` of the returned image.

    Returns:
        A new RGB ``PIL.Image.Image`` of size ``target_size``. Nothing is
        written to disk; the file name is left untouched.
    """
    # Close the file handle as soon as the pixels have been copied by
    # convert(): callers overwrite the same path right afterwards.
    with Image.open(image_path) as source:
        img = source.convert('RGB')

    # Resize maintaining aspect ratio (in place, never larger than target_size)
    img.thumbnail(target_size, Image.Resampling.LANCZOS)

    # Border widths as (left, top, right, bottom). When the gap is odd, the
    # extra pixel goes to the right/bottom so the total is exactly target_size.
    gap_w = target_size[0] - img.size[0]
    gap_h = target_size[1] - img.size[1]
    padding = (gap_w // 2, gap_h // 2, (gap_w + 1) // 2, (gap_h + 1) // 2)

    # Apply black padding and return
    return ImageOps.expand(img, padding, (0, 0, 0))

def process_and_replace_images(source_directory):
    """Preprocess every image under a directory tree, overwriting each file.

    Walks ``source_directory`` recursively. Each file whose lower-cased name
    ends in 'png', 'jpg' or 'jpeg' is passed through ``preprocess_image`` and
    the result is saved back to the same path. A failure on one file is
    printed and does not stop the remaining files.

    Args:
        source_directory: Root folder to scan for images.
    """
    image_suffixes = ('png', 'jpg', 'jpeg')

    for folder, _, filenames in os.walk(source_directory):
        for filename in filenames:
            # Skip anything that does not look like an image file.
            if not filename.lower().endswith(image_suffixes):
                continue

            try:
                target = os.path.join(folder, filename)
                # Overwrite the original with its preprocessed version.
                preprocess_image(target).save(target)
                print(f"Processed and replaced: {filename}")
            except Exception as exc:
                print(f"Error processing {filename}: {exc}")

# Resize and pad every image of both classes in place (originals are overwritten).
for label_directory in (
    '/content/downloaded_images/downloaded_images_manual/label_0',
    '/content/downloaded_images/downloaded_images_manual/label_1',
):
    process_and_replace_images(label_directory)

print("All original images have been replaced with preprocessed versions")

Processed and replaced: image_745.jpg
Processed and replaced: image_1070.jpg
Processed and replaced: image_2460.jpg
Processed and replaced: image_2936.jpg
Processed and replaced: image_2846.jpg
Processed and replaced: image_19.jpg
Processed and replaced: image_3053.jpg
Processed and replaced: image_1639.jpg
Processed and replaced: image_2125.jpg
Processed and replaced: image_2119.jpg
Processed and replaced: image_2413.jpg
Processed and replaced: image_264.jpg
Processed and replaced: image_405.jpg
Processed and replaced: image_2675.jpg
Processed and replaced: image_2469.jpg
Processed and replaced: image_178.jpg
Processed and replaced: image_2495.jpg
Processed and replaced: image_1218.jpg
Processed and replaced: image_1702.jpg
Processed and replaced: image_1950.jpg
Processed and replaced: image_911.jpg
Processed and replaced: image_1253.jpg
Processed and replaced: image_2.jpg
Processed and replaced: image_1765.jpg
Processed and replaced: image_2534.jpg
Processed and replaced: image_1096.



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processed and replaced: image_258.jpg
Processed and replaced: image_2053.jpg
Processed and replaced: image_1121.jpg
Processed and replaced: image_62.jpg
Processed and replaced: image_1589.jpg
Processed and replaced: image_110.jpg
Processed and replaced: image_2064.jpg
Processed and replaced: image_2555.jpg
Processed and replaced: image_1504.jpg
Processed and replaced: image_1991.jpg
Processed and replaced: image_2046.jpg
Processed and replaced: image_2316.jpg
Processed and replaced: image_1625.jpg
Processed and replaced: image_1976.jpg
Processed and replaced: image_1240.jpg
Processed and replaced: image_1874.jpg
Processed and replaced: image_1713.jpg
Processed and replaced: image_2142.jpg
Processed and replaced: image_1567.jpg
Processed and replaced: image_1723.jpg
Processed and replaced: image_2692.jpg
Processed and replaced: image_2930.jpg
Processed and replaced: image_335.jpg
Processed and replaced: image_2762.jpg
Proc

In [None]:

from PIL import Image

def get_image_metadata(image_path):
    """Return basic size and colour information for an image file.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        dict with keys ``width``, ``height``, ``color_mode`` (the PIL mode
        string, e.g. 'RGB', 'RGBA', 'L') and ``channels`` (number of bands).
    """
    with Image.open(image_path) as img:
        width, height = img.size
        mode = img.mode  # e.g., 'RGB', 'RGBA', 'L' (grayscale)
        return {
            "width": width,
            "height": height,
            "color_mode": mode,
            # Count the actual bands: the length of the mode string is wrong
            # for modes such as 'YCbCr' (3 bands) or 'I;16' (1 band).
            "channels": len(img.getbands())
        }

# Spot-check a single preprocessed image.
# NOTE(review): this path is under /content/preprocessed_images, not the
# /content/downloaded_images folder processed earlier in this notebook --
# confirm it exists in the current session.
sample_image_path = "/content/preprocessed_images/preprocessed_images/label_0/image_1.jpg"
metadata = get_image_metadata(sample_image_path)
print(metadata)

{'width': 224, 'height': 224, 'color_mode': 'RGB', 'channels': 3}


#### After the manual removal of the images
* Run this cell to delete the dataset rows whose images were manually removed

In [None]:
import os
import pandas as pd
# IDs are compared against file-name fragments below, so store them as strings.
df['ID'] = df['ID'].astype(str)

# Folders holding the images that survived the manual clean-up, one per label.
label_0_dir, label_1_dir = (
    '/content/downloaded_images/downloaded_images_manual/label_0',
    '/content/downloaded_images/downloaded_images_manual/label_1',
)

def get_existing_image_ids(directory):
    """Collect the IDs of all image files found directly in ``directory``.

    A file counts as an image when its lower-cased name ends in 'png', 'jpg'
    or 'jpeg'. The ID is the file name without its extension and with every
    'image_' occurrence removed (e.g. 'image_42.jpg' -> '42').

    Args:
        directory: Folder to list (not searched recursively).

    Returns:
        Set of ID strings.
    """
    return {
        os.path.splitext(name)[0].replace('image_', '')
        for name in os.listdir(directory)
        if name.lower().endswith(('png', 'jpg', 'jpeg'))
    }

# IDs of every image still present in either label folder.
existing_ids = get_existing_image_ids(label_0_dir) | get_existing_image_ids(label_1_dir)

# Keep only the rows whose ID (with any 'image_' prefix stripped) still has an
# image on disk.
has_image = df['ID'].str.replace('image_', '').isin(existing_ids)
df_filtered = df[has_image]

# Persist the cleaned dataset.
df_filtered.to_csv('filtered_dataset_clean.csv', index=False)

# Summary of how many rows survived the filter.
original_count, filtered_count = len(df), len(df_filtered)
removed_count = original_count - filtered_count

print(f"Original rows: {original_count}")
print(f"Rows kept: {filtered_count}")
print(f"Rows removed: {removed_count}")
print("Filtered dataset saved to 'filtered_dataset_clean.csv'")

Original rows: 6130
Rows kept: 5556
Rows removed: 574
Filtered dataset saved to 'filtered_dataset_clean.csv'


In [None]:
# Number of rows that still have an image on disk.
df_filtered.shape[0]

5556

#### Download zip file to drive

In [None]:
import shutil
import os
from google.colab import files

# Folder containing images to zip
folder_to_zip = '/content/downloaded_images'

# Where to save the zip (create the directory if it doesn't exist)
output_folder = '/content/drive/MyDrive/ANN_Project_data/preprocessed_images'  # ← YOUR TARGET FOLDER HERE
os.makedirs(output_folder, exist_ok=True)

# Full path for the zip file (inside output_folder)
output_zip = os.path.join(output_folder, 'downloaded_images.zip')

# make_archive appends ".zip" itself, so it needs the path without the
# extension. splitext removes only the trailing extension, whereas
# str.replace('.zip', '') would also strip ".zip" from any folder name in the
# path and write the archive somewhere unexpected.
shutil.make_archive(
    base_name=os.path.splitext(output_zip)[0],
    format='zip',
    root_dir=folder_to_zip
)

# Download from Colab to your local machine
files.download(output_zip)

print(f"ZIP saved to: {output_zip}")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ZIP saved to: /content/drive/MyDrive/ANN_Project_data/preprocessed_images/preprocessed_images.zip
