In [None]:
import numpy as np
import pandas as pd
import pickle
import nest_asyncio
import asyncio
import aiohttp
import aiofiles
import io
import sys
import psutil
import re
import math
import gc
import os
from PIL import Image
from io import BytesIO
from tqdm.notebook import tqdm_notebook as tqdm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import requests
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import pipeline

In [None]:
# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# Set TOKENIZERS_PARALLELISM for this process.
# NOTE(review): presumably consumed by the Hugging Face tokenizers backend — confirm.
os.environ.update({'TOKENIZERS_PARALLELISM': 'true'})

In [None]:
# Build the image-captioning pipeline once; the captioning loop reuses it per batch.
image_to_text = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device,
)

In [None]:
# Load the training table from the Kaggle input directory.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/adobetraindata/behaviour_simulation_train.csv",
)

In [None]:
# Preview the first rows of the freshly loaded table.
df.head()

In [None]:
# Bin `likes` into (up to) seven quantile buckets; the bins are used to stratify
# the subsample taken afterwards.
# Quantile edges coincide when many rows share the same like count, and pd.cut
# raises "Bin edges must be unique" on repeated edges. np.unique collapses the
# duplicates (and keeps the edges sorted); it is a no-op when all edges differ.
quantiles = np.unique(df['likes'].quantile(np.linspace(0, 1, 8)).values)
# Each label reads "<lower>-<upper>", using the integer part of the two edges.
labels = [f'{int(lower)}-{int(upper)}' for lower, upper in zip(quantiles[:-1], quantiles[1:])]
df['likes_binned'] = pd.cut(df['likes'], bins=quantiles, labels=labels, include_lowest=True)

In [None]:
# Keep a 1/30 subsample, stratified on the like bins; the remainder is discarded.
sample_fraction = 1 / 30
df, _ = train_test_split(
    df,
    stratify=df['likes_binned'],
    train_size=sample_fraction,
    random_state=42,
)

In [None]:
# Fraction of rows in each like bin after subsampling, sorted by bin.
print(df['likes_binned'].value_counts(normalize=True).sort_index())

In [None]:
def extract_link(media):
    """Return the preview/thumbnail URL embedded in a media description.

    Searches for the first ``Photo(previewUrl='...'``, ``Video(thumbnailUrl='...'``
    or ``Gif(thumbnailUrl='...'`` fragment and returns the quoted URL.

    Args:
        media: Text of a ``media`` cell. Non-string values (``None`` or the
            float ``NaN`` pandas typically uses for missing cells) are
            accepted and treated as "no media".

    Returns:
        The URL as a ``str``, or ``None`` when ``media`` is not a string or
        contains no recognised fragment.
    """
    # re.search raises TypeError on non-string input, so bail out early.
    if not isinstance(media, str):
        return None
    pattern = r"(?:Photo\(previewUrl|Video\(thumbnailUrl|Gif\(thumbnailUrl)='([^']*)'"
    match = re.search(pattern, media)
    return match.group(1) if match else None

# One preview/thumbnail URL per row (None when no fragment matches).
df['link'] = df['media'].map(extract_link)

In [None]:
# Preview the sampled table, now including the extracted `link` column.
df.head()

In [None]:
async def fetch_image(session, url):
    """Download one image and return its raw bytes.

    Args:
        session: Open HTTP client session whose ``get(url)`` is used as an
            async context manager (an ``aiohttp.ClientSession`` in this file).
        url: Address of the image. Non-string or empty values are skipped
            without issuing a request.

    Returns:
        The response body as ``bytes`` when the status is 200; ``None`` for a
        missing URL, any other status, a client/network error, or a timeout.
    """
    if not isinstance(url, str) or not url:
        return None
    try:
        async with session.get(url) as response:
            if response.status != 200:
                return None
            return await response.read()
    except (aiohttp.ClientError, asyncio.TimeoutError):
        # One unreachable or malformed URL must not abort the whole batch.
        return None

async def download_images(urls):
    """Download all URLs concurrently and return the results in input order.

    Args:
        urls: Iterable of image URLs.

    Returns:
        A list aligned with ``urls``. Each entry is whatever ``fetch_image``
        returned for that URL, or ``None`` when that fetch raised.
    """
    urls = list(urls)
    if not urls:
        # Nothing to fetch, so do not open a session at all.
        return []
    async with aiohttp.ClientSession() as session:
        tasks = [fetch_image(session, url) for url in urls]
        # Without return_exceptions, the first failing task would raise here
        # and every other download would be lost with it.
        results = await asyncio.gather(*tasks, return_exceptions=True)
    # Failed fetches become None so callers can filter them like other misses.
    return [None if isinstance(result, BaseException) else result for result in results]

df['image'] = await download_images(list(df['link']))

In [None]:
# Renumber the sampled rows from 0, then keep only rows that have image bytes.
df = df.reset_index(drop=True)
df.shape  # evaluated mid-cell, so the notebook does not display it
df_valid = df.loc[df['image'].notna()].reset_index(drop=True)

In [None]:
class FeatureDataset(Dataset):
    """Map-style dataset that decodes stored image bytes on access.

    Args:
        df: DataFrame with an ``id`` column (convertible to ``int``) and an
            ``image`` column holding encoded image bytes.
    """

    def __init__(self, df):
        self.df = df

    def __len__(self):
        """Return the number of rows (samples) in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'id': int, 'img': RGB PIL image}`` for the row at position ``index``.

        Raises:
            IndexError: If ``index`` is outside the frame's row range.
        """
        # Positional lookup, so the frame's index labels need not be 0..n-1.
        return {
            'id': int(self.df['id'].iloc[index]),
            'img': Image.open(io.BytesIO(self.df['image'].iloc[index])).convert('RGB'),
        }

In [None]:
# Wrap just the two columns the dataset reads.
temp_li_data = FeatureDataset(df_valid.loc[:, ['id', 'image']])

In [None]:
def collate_fn(batch):
    """Regroup per-sample dicts into a single dict of parallel lists.

    Args:
        batch: Sequence of dicts, each with an ``'id'`` and an ``'img'`` entry.

    Returns:
        ``{'id': [...], 'img': [...]}`` with entries in the same order as
        ``batch``; the values are passed through unchanged (no stacking).
    """
    collated = {'id': [], 'img': []}
    for sample in batch:
        collated['id'].append(sample['id'])
        collated['img'].append(sample['img'])
    return collated

In [None]:
# Unshuffled batches of 128 samples, loaded with 4 workers and the custom collate.
temp_li_load = DataLoader(
    dataset=temp_li_data,
    batch_size=128,
    shuffle=False,
    num_workers=4,
    collate_fn=collate_fn,
)

In [None]:
# Caption every batch; `li` accumulates (id, pipeline output) pairs.
li = []

for data in tqdm(temp_li_load):
    torch.cuda.empty_cache()
    ids, images = data['id'], data['img']

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        generated_text = image_to_text(images)

    # zip is consumed right here, before the batch variables are deleted.
    li.extend(zip(ids, generated_text))

    # Drop the batch references, then release cached GPU memory and run the GC.
    del images, ids, generated_text
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
# Display the rows that have a downloaded image (includes the raw `image` bytes column).
df_valid