In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd

In [3]:
# Kaggle input locations for the training CSV and the test dataset.
# NOTE(review): test_path is not referenced in the visible cells — confirm it is used later.
train_path = '/kaggle/input/adobetraindata/behaviour_simulation_train.csv'
test_path = '/kaggle/input/inter-iit-mid-prep-adobe/problem_1_test_dataset'

In [4]:
# Load the training CSV into a DataFrame; later cells read its 'media' column.
train_dataset = pd.read_csv(train_path)

In [5]:
# Keep only the rows whose 'media' text starts with a photo entry.
# The vectorized .str accessor with na=False treats missing or non-string
# media values as "not a photo"; a per-row lambda calling x.startswith would
# raise AttributeError on such values.
Photo_data = train_dataset[train_dataset['media'].str.startswith('[Photo', na=False)]

In [6]:
# Extract one preview URL per row: take the segment right after the first
# "Photo(previewUrl" marker, then the first single-quoted value inside it.
img_links = Photo_data['media'].apply(
    lambda media_text: media_text.split("Photo(previewUrl")[1].split("'")[1]
)

In [7]:
import nest_asyncio
import asyncio
import aiohttp
from PIL import Image
from io import BytesIO
import time
import os
import concurrent.futures

# Apply the patch to allow nested event loops, so that asyncio.run() below can
# be called while a loop is already running (presumably the notebook kernel's
# own loop — confirm for the target environment).
nest_asyncio.apply()

# Constants
# Number of URLs handed to download_batch per iteration of the download loop.
BATCH_SIZE = 10000
# Size of the semaphore that caps how many requests are in flight at once.
CONCURRENT_REQUESTS = 100
# Number of URLs to fetch; referenced when reporting download totals.
TOTAL_IMAGES = len(img_links)

async def fetch_image(session, url, semaphore):
    """Download the body of ``url``, returning ``None`` on any failure.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async
            context manager yielding a response with ``status`` and
            ``read()`` (an ``aiohttp.ClientSession`` in this notebook).
        url: Address to request.
        semaphore: Async context manager held for the duration of the
            request; used to bound the number of concurrent requests.

    Returns:
        The raw response bytes when the status is 200; ``None`` for any
        other status or when the request raises an ``Exception``.
    """
    async with semaphore:
        try:
            async with session.get(url, timeout=30) as response:
                if response.status == 200:
                    return await response.read()
        except Exception as e:
            # Include the exception type: some errors (notably
            # asyncio.TimeoutError) have an empty str(), which would
            # otherwise leave the reason blank in the log line.
            print(f"Error downloading {url}: {type(e).__name__}: {e}")
    return None

async def download_batch(urls, start_index):
    """Fetch every URL in ``urls`` concurrently and report the success count.

    Args:
        urls: Sized iterable of URLs making up this batch.
        start_index: Offset of the batch's first URL in the full list; only
            used to label the printed summary line.

    Returns:
        The list produced by ``asyncio.gather`` over ``fetch_image`` calls:
        one entry per URL, in input order.
    """
    limiter = asyncio.Semaphore(CONCURRENT_REQUESTS)
    async with aiohttp.ClientSession() as session:
        payloads = await asyncio.gather(
            *(fetch_image(session, link, limiter) for link in urls)
        )

        batch_len = len(urls)
        end_index = start_index + batch_len
        ok_count = len([p for p in payloads if p is not None])
        print(f"Batch {start_index}-{end_index}: Downloaded {ok_count}/{batch_len} images")

        return payloads

async def download_all_images(img_links):
    """Download every URL in ``img_links`` batch by batch, printing progress.

    The URLs are processed in consecutive slices of ``BATCH_SIZE``; after each
    batch the elapsed time and a linear estimate of the remaining/total time
    are printed.

    Args:
        img_links: Sequence of URLs supporting ``len()`` and slicing.

    Returns:
        A list with one entry per URL, in input order, holding whatever
        ``download_batch`` returned for it. All results are accumulated in
        memory.
    """
    start_time = time.time()
    # Derive the total from the argument itself rather than a module-level
    # constant, so progress figures are correct for any list passed in.
    total_images = len(img_links)
    all_image_data = []

    for i in range(0, total_images, BATCH_SIZE):
        batch = img_links[i:i+BATCH_SIZE]
        print(f"Starting batch {i}-{i+len(batch)}")
        batch_data = await download_batch(batch, i)
        all_image_data.extend(batch_data)

        # Linear extrapolation: average time per URL so far times the total.
        elapsed_time = time.time() - start_time
        images_downloaded = i + len(batch)
        avg_time_per_image = elapsed_time / images_downloaded
        estimated_total_time = avg_time_per_image * total_images
        estimated_remaining_time = estimated_total_time - elapsed_time

        print(f"Progress: {images_downloaded}/{total_images} images")
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
        print(f"Estimated remaining time: {estimated_remaining_time:.2f} seconds")
        print(f"Estimated total time: {estimated_total_time:.2f} seconds")
        print("---")

    return all_image_data

def estimate_storage_space(image_data):
    """Return the combined size of all non-``None`` entries, in units of 1024**3 bytes.

    Args:
        image_data: Iterable of sized objects (e.g. ``bytes``) or ``None``.

    Returns:
        Sum of ``len()`` over the non-``None`` entries divided by 1024**3.
    """
    bytes_per_gb = 1024 * 1024 * 1024
    byte_count = 0
    for blob in image_data:
        if blob is not None:
            byte_count += len(blob)
    return byte_count / bytes_per_gb

# Main execution
if __name__ == "__main__":
    # img_links is the collection of preview URLs built in an earlier cell; it
    # is converted to a plain list before being handed to the downloader.
    print(f"Starting download of {len(img_links)} images...")
    image_data = asyncio.run(download_all_images(list(img_links)))

    successful_downloads = sum(1 for img in image_data if img is not None)
    print(f"\nDownload complete. Successfully downloaded {successful_downloads}/{TOTAL_IMAGES} images.")

    storage_space = estimate_storage_space(image_data)
    print(f"Estimated storage space: {storage_space:.2f} GB")

    print("\nSample image info:")
    # Inspect the first image that actually downloaded. Looking only at
    # image_data[0] would report "no valid sample" whenever the very first
    # URL failed, even if every other download succeeded.
    sample_bytes = next((img for img in image_data if img), None)
    if sample_bytes is not None:
        sample_image = Image.open(BytesIO(sample_bytes))
        print(f"Format: {sample_image.format}")
        print(f"Size: {sample_image.size}")
        print(f"Mode: {sample_image.mode}")
    else:
        print("No valid sample image found.")

    # Persist the raw results. Entries are in the same order as the URL list,
    # with None marking URLs that could not be downloaded.
    import pickle
    with open('image_data.pkl','wb') as f:
        pickle.dump(image_data,f)

Starting download of 210009 images...
Starting batch 0-10000
Batch 0-10000: Downloaded 9766/10000 images
Progress: 10000/210009 images
Elapsed time: 11.39 seconds
Estimated remaining time: 227.90 seconds
Estimated total time: 239.29 seconds
---
Starting batch 10000-20000
Batch 10000-20000: Downloaded 9755/10000 images
Progress: 20000/210009 images
Elapsed time: 22.97 seconds
Estimated remaining time: 218.24 seconds
Estimated total time: 241.21 seconds
---
Starting batch 20000-30000
Batch 20000-30000: Downloaded 9761/10000 images
Progress: 30000/210009 images
Elapsed time: 34.16 seconds
Estimated remaining time: 204.99 seconds
Estimated total time: 239.15 seconds
---
Starting batch 30000-40000
Batch 30000-40000: Downloaded 9759/10000 images
Progress: 40000/210009 images
Elapsed time: 45.34 seconds
Estimated remaining time: 192.69 seconds
Estimated total time: 238.02 seconds
---
Starting batch 40000-50000
Batch 40000-50000: Downloaded 9746/10000 images
Progress: 50000/210009 images
Elaps