In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, BertModel, BertConfig
import os

## Dataset

In [None]:
df = pd.read_excel("/media/souravsaini/Data/POP_OS/dl/pytorch/pytorch/adobe_mid_prep/behaviour_simulation_train.xlsx")
# df.head()

## GPU code

In [None]:
def get_default_device():
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')

def to_device(data, device):
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

def shift_device(data):
    if torch.cuda.is_available():
        return data.to("cuda")
    else return data

class DeviceDataLoader():
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        for b in self.dl:
            yield to_device(b, self.device)

    def __len__(self):
        return len(self.dl)

device = get_default_device()
print(device)

cuda


## Tokenizer and Bert Model

In [None]:
from transformers import AutoTokenizer, BertModel

# Specify your special tokens
special_tokens = ["<mention>", "<hyperlink>"]

# Initialize the tokenizer with custom settings
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    model_max_length=256,
    padding_side='right',
    additional_special_tokens=special_tokens,
    split_special_tokens=False,
    pad_token="<pad>",
    )

# Example usage
text = "Hello, this is a <mention> and here is a <hyperlink>."
tokens = tokenizer(text, max_length=256, truncation=True, padding='max_length', return_tensors='pt')
# tokens = tokenizer(text, return_tensors="pt")
print("Original text:", text)
print("Tokenized text:", tokens)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Original text: Hello, this is a <mention> and here is a <hyperlink>.
Tokenized text: {'input_ids': tensor([[  101,  7592,  1010,  2023,  2003,  1037, 30523,  1998,  2182,  2003,
          1037, 30524,  1012,   102, 30522, 30522, 30522, 30522, 30522, 30522,
         30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522,
         30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522,
         30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522,
         30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522,
         30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522,
         30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522,
         30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522,
         30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522,
         30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522, 30522,
         30522, 30522, 30522, 30

In [None]:
from transformers import BertModel, BertConfig

# Load pre-trained BERT model configuration
config = BertConfig(
    hidden_size=256,
    num_attention_heads=8,
    num_hidden_layers=8,
    max_position_embeddings=256,  # Set the maximum sequence length
    vocab_size = 30525,
)

# Create a custom BERT model
model = BertModel(config)
to_device(model, device)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30525, 256, padding_idx=0)
    (position_embeddings): Embedding(256, 256)
    (token_type_embeddings): Embedding(2, 256)
    (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=256, out_features=256, bias=True)
            (key): Linear(in_features=256, out_features=256, bias=True)
            (value): Linear(in_features=256, out_features=256, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=256, out_features=256, bias=True)
            (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 256,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 256,
  "model_type": "bert",
  "num_attention_heads": 8,
  "num_hidden_layers": 8,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30525
}

In [None]:
def get_features_text(text):
  new_tokens = tokenizer(text, max_length=256, truncation=True, padding='max_length', return_tensors='pt')
  new_tokens = shift_device(new_tokens)
  outputs = model(**new_tokens)
  return outputs.last_hidden_state

## Visuals and audio feature extraction

In [None]:
import requests
from urllib.parse import urlsplit

def extract_file_extension(url):
    try:
        # Send a HEAD request to get the Content-Type header
        response = requests.head(url)

        # Check if Content-Type header is present
        if 'content-type' in response.headers:
            content_type = response.headers['content-type']

            # Extract the file extension from the Content-Type header
            file_extension = content_type.split('/')[-1]
            return file_extension
    except Exception as e:
        print(f"Error: {e}")

    # If Content-Type is not present or extraction fails, return None
    return ""

In [None]:
class VideoVariant:
    def __init__(self, contentType, url, bitrate):
        self.contentType = contentType
        self.url = url
        self.bitrate = bitrate

class Video:
    def __init__(self, thumbnailUrl, variants, duration, views):
        self.thumbnailUrl = thumbnailUrl
        self.variants = variants
        self.duration = duration
        self.views = views

class Photo:
    def __init__(self, previewUrl, fullUrl):
        self.previewUrl = previewUrl
        self.fullUrl = fullUrl

class Gif:
    def __init__(self, thumbnailUrl, variants):
        self.thumbnailUrl = thumbnailUrl
        self.variants = variants


def get_lowest_bitrate_video_or_gif(video_or_gif):
    if isinstance(video_or_gif, (Video, Gif)):
        variants = getattr(video_or_gif, 'variants', [])
        # Filter out variants with bitrate set to None
        valid_variants = [v for v in variants if v.bitrate is not None]
        if valid_variants:
            lowest_bitrate_variant = min(valid_variants, key=lambda v: v.bitrate)
            return lowest_bitrate_variant
    return None


def parse_objects_string(objects_string):
    try:
        # Using eval to parse the string and create objects
        parsed_objects = eval(objects_string)
        return parsed_objects
    except Exception as e:
        print(f"Error parsing objects string: {e}")
        return None

def get_lowest_bitrate_video(video):
    if isinstance(video, Video):
        variants = getattr(video, 'variants', [])
        # Filter out variants with bitrate set to None
        valid_variants = [v for v in variants if v.bitrate is not None]
        if valid_variants:
            lowest_bitrate_variant = min(valid_variants, key=lambda v: v.bitrate)
            return lowest_bitrate_variant
    return None

In [None]:
import re
import subprocess
from urllib.parse import unquote

def download_media(media_info_str, base_download_dir, entry_index):
    # store paths
    paths = []
     # Create a folder for each entry
    folder_name = f"entry_{entry_index}"
    download_dir = os.path.join(base_download_dir, folder_name)
    os.makedirs(download_dir, exist_ok=True)

    # Remove line breaks and parse the objects string
    objects_string = media_info_str.replace('\n', '')
    parsed_objects = parse_objects_string(objects_string)

    index = 0
    # Accessing the attributes of the created objects
    for parsed_object in parsed_objects:
        media_url = ""
        if isinstance(parsed_object, (Video, Gif)):
            lowest_bitrate_variant = get_lowest_bitrate_video(parsed_object)
            if lowest_bitrate_variant:
                media_url = lowest_bitrate_variant.url
        elif isinstance(parsed_object, Photo):
            media_url = parsed_object.fullUrl
        if media_url == "":
          continue
        file_name = str(index+1)+"."+extract_file_extension(media_url)
        subprocess.run(['curl', media_url, '-o', file_name], check=True,cwd = download_dir)
        paths.append(download_dir+'/'+file_name)
        index += 1
    return paths

In [None]:
class ModifiedViT(nn.Module):
    def __init__(self, original_model, output_size):
        super(ModifiedViT, self).__init__()
        self.features = original_model
        self.head = nn.Linear(original_model.config.num_labels, output_size)

    def forward(self, inputs):
        x = self.features(**inputs).logits
        x = self.head(x)
        x = x.view(-1, 1)  # Reshape the tensor to (256*256, 1)
        return x

In [None]:
import torch.nn as nn
import torch
import torchvision.transforms as transforms
from PIL import Image
from transformers import ViTFeatureExtractor, ViTForImageClassification
import imageio
import cv2
import numpy as np
from sklearn.decomposition import PCA

# Define your desired output size and number of frames
desired_output_size = 256*256
num_frames = 32

# Create ViTFeatureExtractor and ViTForImageClassification
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
# to_device(feature_extractor, device)

# Modify the model to produce the desired output size
original_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
to_device(original_model, device)
modified_model = ModifiedViT(original_model, desired_output_size)
to_device(modified_model, device)

# Function to extract features from a file path
def extract_features(file_path, reduction_method="pca"):
    # Check if the file is a GIF or video
    if file_path.endswith(('.gif', '.apng', '.avif', '.webp')):
        gif = imageio.get_reader(file_path)
        total_frames = len(gif)
        step_size = max(1, total_frames // num_frames)  # Calculate step size
        frames = [Image.fromarray(gif.get_data(i)) for i in range(0, total_frames, step_size)]
    elif file_path.endswith(('.mp4', '.avi', '.mkv', 'webm', '.mov')):
        video = cv2.VideoCapture(file_path)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        step_size = max(1, total_frames // num_frames)  # Calculate step size
        frames = []
        for i in range(0, total_frames, step_size):
            video.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = video.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(Image.fromarray(frame))
        video.release()
    else:
        # Assume it's an image file
        frames = [Image.open(file_path)]

    # Extract features from each frame
    features_list = []
    for frame in frames:
        # Preprocess the image
        frame = frame.convert("RGB")  # Ensure RGB format
        inputs = feature_extractor(images=frame, return_tensors="pt", to="pt")
        inputs = shift_device(inputs)
        output_vector = modified_model(inputs)
        output_vector = output_vector
        features_list.append(output_vector.flatten().detach())  # Flatten to (256*256,) and detach gradients

    # Stack to get (256*256, num_frames)
    features_array = torch.stack(features_list, axis=1)

    # Reduction by PCA, or mean (NOTE: Significantly slow on mp4 files)
    if(reduction_method=="pca"):
        pca = PCA(n_components=1)
        feature_vector = pca.fit_transform(features_array)

        return feature_vector
    else:
        feature_vector = np.mean(features_array, axis=1, keepdims=True)

        return feature_vector


Downloading:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/346M [00:00<?, ?B/s]

Tester (Do NOT run)

## combining the textual features and visual features using cross attention

In [None]:
class CrossAttention(nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super(CrossAttention, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_heads = num_heads

        # Linear transformation for queries
        self.linear_q = nn.Linear(hidden_dim, hidden_dim * num_heads)

        # Linear transformation for keys and values
        self.linear_kv = nn.Linear(hidden_dim, hidden_dim * num_heads * 2)

        # Linear transformation for the output of attention
        self.linear_out = nn.Linear(hidden_dim * num_heads, hidden_dim)

        # Multihead attention
        self.multihead_attn = nn.MultiheadAttention(hidden_dim, num_heads)

    def forward(self, text_features, media_features):
        # Linear transformations for queries, keys, and values
        q = self.linear_q(text_features)
        kv = self.linear_kv(media_features)

        # Split linear_kv output into keys and values
        k, v = torch.split(kv, split_size_or_sections=self.hidden_dim * self.num_heads, dim=-1)

        # Reshape for multihead attention
        q = q.view(q.size(0), -1, self.hidden_dim)
        k = k.view(k.size(0), -1, self.hidden_dim)
        v = v.view(v.size(0), -1, self.hidden_dim)

        # Multihead attention
        attn_output, _ = self.multihead_attn(q, k, v)

        # Reshape and apply linear transformation
        attn_output = attn_output.view(attn_output.size(0), -1)
        output = self.linear_out(attn_output)

        return output

class FeatureCombiningModel(nn.Module):
    def __init__(self, hidden_dim, num_heads):
        super(FeatureCombiningModel, self).__init__()

        # Cross-Attention module
        self.cross_attention = CrossAttention(hidden_dim, num_heads)
        to_device(self.cross_attention, device)

    def forward(self, text_features, media_features):
        # Apply cross-attention
        cross_attn_output = self.cross_attention(text_features, media_features)

        return cross_attn_output


# Instantiate the custom model
CombineFeature = FeatureCombiningModel(hidden_dim=256, num_heads=8)
to_device(CombineFeature, device)


FeatureCombiningModel(
  (cross_attention): CrossAttention(
    (linear_q): Linear(in_features=256, out_features=2048, bias=True)
    (linear_kv): Linear(in_features=256, out_features=4096, bias=True)
    (linear_out): Linear(in_features=2048, out_features=256, bias=True)
    (multihead_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
  )
)

Loading as .npy


In [None]:
# # for each entry, take "full" portion, process, and write back as ".npy" file
# base_download_dir = 'downloaded_media'
# base_path = '/media/souravsaini/Data/POP_OS/dl/pytorch/pytorch/adobe_mid_prep/Dataset/JonnySet'
# for index, row in df.iterrows():
#     media_info_str = row['media']
#     paths = download_media(media_info_str, base_download_dir, index + 1)
#     image_feature = np.zeros(shape=(256*256,1))
#     try:
#       for path in paths:
#         # doubt:  condition for image to be processed
#         image_feature  +=  extract_features(path,"")
#       text_info_str = row["content"]
#       text_features = get_features_text(text_info_str)
#       image_feature = torch.from_numpy(image_feature)
#       image_feature = image_feature.to(torch.float32)
#       text_features = text_features.to(torch.float32).reshape(256,256)
#       features = CombineFeature(image_feature.reshape(256, 256), text_features.reshape(256,256))
#       as_numpy = features.detach().numpy()
#       torch.save(as_numpy,os.path.join(base_path,f'{index+1}.npy'))
#     except:
#       print("skipped:"+str(index+1))
#     print(index+1)
#     # save as numpy

Do not run

In [None]:
# from joblib import Parallel, delayed

# features_list = []
# labels = []
# def process(index, row):
#   media_info_str = row['media']
#   paths = download_media(media_info_str, base_download_dir, index + 1)
#   image_feature = np.zeros(shape=(256*256,1))
#   try:
#     for path in paths:
#       # doubt:  condition for image to be processed
#       image_feature  +=  extract_features(path, "")
#     text_info_str = row["content"]
#     text_features = get_features_text(text_info_str)
#     image_feature = torch.from_numpy(image_feature)
#     image_feature = image_feature.to(torch.float32).to("cuda")
#     text_features = text_features.to(torch.float32).reshape(256,256).to("cuda")
#     features = CombineFeature(image_feature.reshape(256, 256), text_features.reshape(256,256))
#     # as_numpy = features.detach().cpu()
#     features_list.append(features.detach().cpu())
#     labels.append(torch.tensor(row['likes']))
#     # torch.save(as_numpy,os.path.join(base_path,f'{index+1}.npy'))
#   except:
#     print("skipped:"+str(index+1))
#   print(index+1)
#   return 0
#   # return features_list, labels

In [None]:
# results = Parallel(n_jobs=2)(delayed(process)(i, r) for i, r in df.iterrows())

In [None]:
# # for each entry, take "full" portion, process, and write back as ".npy" file
# # from joblib import Parallel, delayed

# base_download_dir = 'downloaded_media'
# base_path = '/media/souravsaini/Data/POP_OS/dl/pytorch/pytorch/adobe_mid_prep/Dataset/JonnySet'
# features_list = []
# labels = []
# for index, row in df.iterrows():
#     if index = 30000:
#       break
#     media_info_str = row['media']
#     paths = download_media(media_info_str, base_download_dir, index + 1)
#     image_feature = np.zeros(shape=(256*256,1))
#     try:
#       for path in paths:
#         # doubt:  condition for image to be processed
#         image_feature  +=  extract_features(path, "")
#       text_info_str = row["content"]
#       text_features = get_features_text(text_info_str)
#       image_feature = torch.from_numpy(image_feature)
#       image_feature = image_feature.to(torch.float32).to("cuda")
#       text_features = text_features.to(torch.float32).reshape(256,256).to("cuda")
#       features = CombineFeature(image_feature.reshape(256, 256), text_features.reshape(256,256))
#       # as_numpy = features.detach().cpu()
#       features_list.append(features.detach().cpu())
#       labels.append(torch.tensor(index))
#       # torch.save(as_numpy,os.path.join(base_path,f'{index+1}.npy'))
#     except:
#       print("skipped:"+str(index+1))
#     print(index+1)

# stacked_features = torch.stack(features_list, dim=0)
# labels = torch.stack(labels, dim=0)

SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (1833612486.py, line 9)

In [None]:
base_download_dir = 'downloaded_media'
base_path = '/content/drive/MyDrive/mid_prep/dataset'
strt_idx = 96852   # change here
diff = 3150
for i in range(strt_idx, strt_idx+diff+1): #[)
  media_info_str = df.iat[i, 5]
  paths = download_media(media_info_str, base_download_dir, i + 1)
  image_list = []
  try:
    for path in paths:
    # doubt:  condition for image to be processed
    image_list.append(extract_features(path, ""))
    image_feature = torch.stack(image_list, dim=0)

    # image_feature = torch.from_numpy(image_feature)
    image_feature = image_feature.to(torch.float32).reshape(-1,256,256).detach()
    # as_numpy = features.detach().cpu()
    np.save(base_path + "/" + i + ".pt", image_feature)
#   labels.append(i)
# torch.save(as_numpy,os.path.join(base_path,f'{index+1}.npy'))
  except:
    print("skipped:"+str(i+1))
  print(i+1)


Tester code