In [1]:
import pandas as pd
import os
import io
import re
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google.oauth2 import service_account
from googleapiclient.errors import HttpError
from PIL import Image

In [3]:
# Read the form responses that were exported to an Excel workbook.
df = pd.read_excel("Cat Image Dataset Collection for AI Research  (Responses).xlsx")

# Trim surrounding whitespace from the header cells before they are used as
# rename keys below.
df.columns = df.columns.str.strip()

# Replace the long bilingual question headers with short identifiers.
# rename() ignores keys that are not present among the columns.
df = df.rename(
    {
        "1️⃣Upload Cat Face Photo  ( বিড়ালের মুখের পরিষ্কার ছবি আপলোড করুন )": "Image_Link",
        "2️⃣ Cat Gender ( বিড়ালের লিঙ্গ )?": "Gender",
        "3️⃣ Cat Age ( বিড়ালের বয়স )?": "Age",
        "4️⃣Cat Breed / বিড়ালের জাত": "Breed",
        "5️⃣ Cat Neutering Status / বিড়ালের সার্জারি স্ট্যাটাস": "Neutering_Status",
        "6️⃣ Cat’s Photo Condition / ছবিটি কোথায় তোলা হয়েছে?": "Indoor_Outdoor",
        "7️⃣ Cat Temperament (Optional)": "Temperament",
        "8️⃣ Permission / অনুমতি": "Permission",
        # NOTE(review): the captured df.info() output still lists "Device Name"
        # and "Device Name 2", so this key may not match any column - confirm.
        "Device Name.1": "Device_Name",
        "Location": "Location",
        "Timestamp": "Timestamp",
    },
    axis="columns",
)

# Not the last expression of the cell, so the notebook does not render this preview.
df.head()

# Prints the column / dtype / non-null summary.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Timestamp             432 non-null    datetime64[ns]
 1   Image_Link            432 non-null    object        
 2   Gender                432 non-null    object        
 3   Age                   429 non-null    object        
 4   Breed                 355 non-null    object        
 5   Neutering_Status      405 non-null    object        
 6   Indoor_Outdoor        401 non-null    object        
 7   Temperament           393 non-null    object        
 8   Permission            432 non-null    object        
 9   Device Name           32 non-null     object        
 10  Day/Night             381 non-null    object        
 11  Image Size            363 non-null    object        
 12  Location              378 non-null    object        
 13  Device Name 2       

In [4]:
# -------------------- Google API setup --------------------
# Path to the service-account key file. It can be overridden with the
# DRIVE_SERVICE_ACCOUNT_FILE environment variable so the key location does not
# have to be edited into the notebook; the original file name is the default.
SERVICE_ACCOUNT_FILE = os.environ.get(
    "DRIVE_SERVICE_ACCOUNT_FILE",
    "cobalt-mind-448907-s8-b3c4968a1078.json",
)

# Read-only Drive scope requested for the service-account credentials.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

# Build credentials from the key file, limited to SCOPES.
creds = service_account.Credentials.from_service_account_file(
    SERVICE_ACCOUNT_FILE,
    scopes=SCOPES,
)

# Drive API v3 client shared by the download cell.
drive_service = build('drive', 'v3', credentials=creds)

# -------------------- FIXED extractor --------------------
def extract_file_ids(text):
    """
    Extract Google Drive file IDs from one free-text cell.

    Recognised link shapes (the scheme is optional, and an optional
    ``/u/<n>/`` account prefix is accepted):
      * drive.google.com/file/d/<id>
      * drive.google.com/open?id=<id>
      * drive.google.com/uc?id=<id>
    For the query-string forms, ``id`` may follow other parameters,
    e.g. ``uc?export=download&id=<id>``.

    text: one Excel cell / API response string (any object; it is passed
          through ``str`` before matching)
    return: list of Google Drive file IDs in order of first appearance,
            without duplicates; an empty list when ``text`` is falsy or
            contains no recognised link
    """
    if not text:
        return []

    pattern = (
        r"(?:https?://)?drive\.google\.com/"
        r"(?:u/\d+/)?"
        # Query parameters before ``id`` may not contain whitespace, ',', '/'
        # or '?', so a match can never run across two separate links.
        r"(?:file/d/|(?:open|uc)\?(?:[^\s&#?/,]*&)*?id=)"
        r"([a-zA-Z0-9_-]{10,})"
    )
    matches = re.findall(pattern, str(text))

    # A link pasted twice in the same cell must not yield the same file twice.
    return list(dict.fromkeys(matches))

In [5]:
# Output locations: one folder per gender label plus a text file that lists
# the links that could not be used. All three live under the same dataset
# directory, so they are built from a shared prefix by plain concatenation.
MALE_FOLDER, FEMALE_FOLDER, INVALID_LINKS_FILE = (
    "D:\\ML-AI\\AI-ML Project\\Cat Gender Classification Dataset\\" + leaf
    for leaf in ("Male", "Female", "invalid_links.txt")
)

In [8]:
def _download_drive_file(service, file_id, num_retries=3):
    """
    Download one Drive file completely into memory and return its bytes.

    service: Drive API client exposing ``files().get_media(...)``
    file_id: Google Drive file ID to fetch
    num_retries: forwarded to ``MediaIoBaseDownload.next_chunk`` for each chunk
    return: the downloaded content as ``bytes``
    raises: HttpError is not caught here; the caller decides how to record it
    """
    request = service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    downloader = MediaIoBaseDownload(buffer, request)

    done = False
    while not done:
        _status, done = downloader.next_chunk(num_retries=num_retries)

    return buffer.getvalue()


# Download every linked image and store it as Cat_<NNN>_<Gender>.jpg in the
# folder that matches the row's gender. Rows or files that cannot be used are
# collected in invalid_links and written to INVALID_LINKS_FILE at the end.

# Create the output folders up front so open(..., "wb") does not fail on a
# machine where they do not exist yet.
for _folder in (MALE_FOLDER, FEMALE_FOLDER):
    os.makedirs(_folder, exist_ok=True)

male_counter = 1
female_counter = 1

invalid_links = []

# Only two columns are needed, so iterate over them directly.
for link_cell, raw_gender in zip(df['Image_Link'], df['Gender']):
    gender = str(raw_gender).strip().capitalize()  # Male / Female

    file_ids = extract_file_ids(link_cell)

    if not file_ids:
        print(f"[SKIP] No valid Drive links: {link_cell}")
        invalid_links.append(str(link_cell))
        continue

    # Anything other than Male/Female (missing value, typo, other label) must
    # not fall through into the Female folder and mislabel the image.
    if gender not in ("Male", "Female"):
        print(f"[SKIP] Unrecognised gender {gender!r}: {link_cell}")
        invalid_links.append(f"{gender} | {link_cell}")
        continue

    for file_id in file_ids:
        try:
            # Download file from Drive
            data = _download_drive_file(drive_service, file_id)
        except HttpError as e:
            print(f"[ERROR] Could not download file: {file_id} ({e})")
            invalid_links.append(f"{file_id} | {link_cell}")
            continue

        # Every file is written with a .jpg suffix; the real format of the
        # download is not inspected.
        ext = ".jpg"

        if gender == "Male":
            filename = f"Cat_{male_counter:03d}_Male{ext}"
            folder = MALE_FOLDER
            male_counter += 1
        else:
            filename = f"Cat_{female_counter:03d}_Female{ext}"
            folder = FEMALE_FOLDER
            female_counter += 1

        save_path = os.path.join(folder, filename)

        with open(save_path, "wb") as f:
            f.write(data)

        print(f"[DONE] {save_path}")

# The report file is only (re)written when at least one entry was collected.
if invalid_links:
    with open(INVALID_LINKS_FILE, "w", encoding="utf-8") as f:
        for link in invalid_links:
            f.write(link + "\n")

print("✅ All downloads complete!")


[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Male\Cat_001_Male.jpg
[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Female\Cat_001_Female.jpg
[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Male\Cat_002_Male.jpg
[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Female\Cat_002_Female.jpg
[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Female\Cat_003_Female.jpg
[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Female\Cat_004_Female.jpg
[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Male\Cat_003_Male.jpg
[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Female\Cat_005_Female.jpg
[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Female\Cat_006_Female.jpg
[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Male\Cat_004_Male.jpg
[DONE] D:\ML-AI\AI-ML Project\Cat Gender Classification Dataset\Female\Cat_007_Female.jpg
[DONE] D:\ML-AI\AI-ML Proj