In [1]:
import open_clip
import torch
from torch import nn
import torchvision.transforms as T

In [6]:
class Normalization(nn.Module):
    """Feature-wise standardisation using fixed statistics held in buffers.

    ``mean`` and ``variance`` are registered as buffers, so they are part of
    the ``state_dict`` but are not trainable parameters. They start as zeros
    and ones, which makes a freshly constructed layer an identity map until
    real statistics are loaded.

    Args:
        shape: Shape of the per-feature statistics, e.g. ``[768]``.
        eps: Lower bound applied to the standard deviation so that features
            whose variance is zero do not cause a division by zero.
    """

    def __init__(self, shape, eps=1e-7):
        super().__init__()
        self.register_buffer('mean', torch.zeros(shape))
        self.register_buffer('variance', torch.ones(shape))
        # Kept as a plain attribute (not a buffer) so the state_dict keys stay
        # exactly {'mean', 'variance'} and existing checkpoints still load.
        self.eps = eps

    def forward(self, x):
        """Return ``(x - mean) / max(sqrt(variance), eps)``."""
        std = self.variance.sqrt().clamp(min=self.eps)
        return (x - self.mean) / std
    

class NSFWModel(nn.Module):
    """MLP head over 768-dimensional inputs that ends in a sigmoid.

    Layer widths are 768 -> 64 -> 512 -> 256 -> 1. The input is first passed
    through ``Normalization``; every hidden layer is followed by ReLU and the
    single output unit by a sigmoid.
    """

    def __init__(self):
        super().__init__()
        self.norm = Normalization([768])
        # Attribute names double as state_dict keys, so they are kept stable.
        self.linear_1 = nn.Linear(in_features=768, out_features=64)
        self.linear_2 = nn.Linear(in_features=64, out_features=512)
        self.linear_3 = nn.Linear(in_features=512, out_features=256)
        self.linear_4 = nn.Linear(in_features=256, out_features=1)
        self.act = nn.ReLU()
        self.act_out = nn.Sigmoid()

    def forward(self, x):
        """Normalise ``x``, run the hidden stack, and apply the sigmoid output."""
        hidden = self.norm(x)
        for layer in (self.linear_1, self.linear_2, self.linear_3):
            hidden = self.act(layer(hidden))
        return self.act_out(self.linear_4(hidden))

In [10]:
class Normalization(nn.Module):
    """Feature-wise standardisation using fixed statistics held in buffers.

    ``mean`` and ``variance`` are registered as buffers, so they are part of
    the ``state_dict`` but are not trainable parameters. They start as zeros
    and ones, which makes a freshly constructed layer an identity map until
    real statistics are loaded.

    Args:
        shape: Shape of the per-feature statistics, e.g. ``[512]``.
        eps: Lower bound applied to the standard deviation so that features
            whose variance is zero do not cause a division by zero.
    """

    def __init__(self, shape, eps=1e-7):
        super().__init__()
        self.register_buffer('mean', torch.zeros(shape))
        self.register_buffer('variance', torch.ones(shape))
        # Kept as a plain attribute (not a buffer) so the state_dict keys stay
        # exactly {'mean', 'variance'} and existing checkpoints still load.
        self.eps = eps

    def forward(self, x):
        """Return ``(x - mean) / max(sqrt(variance), eps)``."""
        std = self.variance.sqrt().clamp(min=self.eps)
        return (x - self.mean) / std
    

class NSFWModel(nn.Module):
    """Small MLP head over 512-dimensional inputs that ends in a sigmoid.

    Layer widths are 512 -> 32 -> 32 -> 1. The input is first passed through
    ``Normalization``; dropout (p=0.5) is applied after the second hidden
    activation and is only active in training mode.
    """

    def __init__(self):
        super().__init__()
        self.norm = Normalization([512])
        # Attribute names double as state_dict keys, so they are kept stable.
        self.linear_1 = nn.Linear(in_features=512, out_features=32)
        self.linear_2 = nn.Linear(in_features=32, out_features=32)
        self.linear_3 = nn.Linear(in_features=32, out_features=1)
        self.dropout = nn.Dropout(p=0.5)
        self.act = nn.ReLU()
        self.act_out = nn.Sigmoid()

    def forward(self, x):
        """Normalise ``x``, run the two hidden layers, and apply the sigmoid output."""
        hidden = self.act(self.linear_1(self.norm(x)))
        hidden = self.dropout(self.act(self.linear_2(hidden)))
        return self.act_out(self.linear_3(hidden))

In [11]:
# Build the NSFWModel head and restore its trained weights from disk.
model = NSFWModel()
checkpoint = torch.load(
    "./clip_autokeras_binary_nsfw_b32.pth",
    # Load onto the CPU regardless of the device the checkpoint was saved from.
    map_location="cpu",
    # The file is passed straight to load_state_dict, so restrict unpickling
    # to tensors and plain containers instead of arbitrary objects.
    weights_only=True,
)
model.load_state_dict(checkpoint)
# eval() switches dropout off for inference; as the last expression of the
# cell it also displays the module summary.
model.eval()

NSFWModel(
  (norm): Normalization()
  (linear_1): Linear(in_features=512, out_features=32, bias=True)
  (linear_2): Linear(in_features=32, out_features=32, bias=True)
  (linear_3): Linear(in_features=32, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (act): ReLU()
  (act_out): Sigmoid()
)

In [2]:
from h14_nsfw_model import H14_NSFW_Detector

# Bound to its own name on purpose: this detector's first layer takes 1024
# features, so reusing `model` here would replace the 512-input head that the
# ViT-B-32 inference cell feeds when the notebook is run top to bottom.
h14_model = H14_NSFW_Detector()
checkpoint = torch.load(
    "./h14_nsfw.pth",
    # Load onto the CPU regardless of the device the checkpoint was saved from.
    map_location="cpu",
    # Only a state_dict is expected, so restrict unpickling accordingly.
    weights_only=True,
)
h14_model.load_state_dict(checkpoint)
# eval() switches dropout off; as the last expression it displays the summary.
h14_model.eval()

H14_NSFW_Detector(
  (layers): Sequential(
    (0): Linear(in_features=1024, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=1024, out_features=2048, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=2048, out_features=1024, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=1024, out_features=256, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=256, out_features=128, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.2, inplace=False)
    (15): Linear(in_features=128, out_features=16, bias=True)
    (16): Linear(in_features=16, out_features=1, bias=True)
  )
)

In [2]:
# Display the (architecture, pretrained-tag) pairs that open_clip knows about.
open_clip.list_pretrained()

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN50-quickgelu', 'openai'),
 ('RN50-quickgelu', 'yfcc15m'),
 ('RN50-quickgelu', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN101-quickgelu', 'openai'),
 ('RN101-quickgelu', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32', 'datacomp_m_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_s128m_b4k'),
 ('ViT-B-32', 'datacomp_s_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_image_s13m_b4k'),
 ('ViT-B-32', 'commo

In [12]:
# Load the OpenAI ViT-B-32 CLIP weights. The two returned transforms (`tt`,
# `pp`) are kept only to unpack the tuple; preprocessing is defined by hand
# in a later cell.
clip, tt, pp = open_clip.create_model_and_transforms(
    model_name='ViT-B-32',
    pretrained='openai',
)
# Only the vision tower is used to embed images.
backbone = clip.visual
# Inference mode; as the last expression this also displays the module tree.
backbone.eval()

100%|███████████████████████████████████████| 354M/354M [00:36<00:00, 9.70MiB/s]


VisionTransformer(
  (patchnorm_pre_ln): Identity()
  (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
  (patch_dropout): Identity()
  (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (transformer): Transformer(
    (resblocks): ModuleList(
      (0-11): 12 x ResidualAttentionBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ls_1): Identity()
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ls_2): Identity()
      )
    )
  )
  (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [13]:
# Hand-written preprocessing applied to a PIL image before the vision tower.
pre_process = T.Compose([
    # Resize directly to 224x224: the aspect ratio is not preserved and no
    # crop is taken.
    T.Resize((224, 224), interpolation=T.InterpolationMode.BICUBIC, antialias=True),
    # PIL image -> float tensor in [0, 1], channels first.
    T.ToTensor(),
    # Per-channel statistics; these are the values published with OpenAI CLIP.
    T.Normalize(
        mean=(0.48145466, 0.4578275, 0.40821073),
        std=(0.26862954, 0.26130258, 0.27577711),
    ),
])

In [40]:
# Pillow decodes the test image; F provides the normalisation applied to the
# embedding in the inference cell.
from PIL import Image
import torch.nn.functional as F

  
# Open the sample image that is run through the classifier below.
im = Image.open("./test_imgs/nsfw/Wiki-cumshot.png") 

In [41]:
def remove_transparency(im, bg_colour=(255, 255, 255)):
    """Flatten an image's transparency onto a solid background.

    Args:
        im: A PIL image.
        bg_colour: RGB colour of the background that shows through wherever
            the source is transparent. Defaults to white.

    Returns:
        A new RGB image for 'RGBA', 'LA' and 'P' inputs. Images in any other
        mode are returned unchanged (the same object).
    """
    has_alpha = im.mode in ('RGBA', 'LA') or (
        im.mode == 'P' and 'transparency' in im.info
    )
    if not has_alpha and im.mode != 'P':
        return im

    # Honour the caller's colour; it was previously ignored in favour of white.
    bg = Image.new("RGB", im.size, bg_colour)
    if has_alpha:
        # Go through RGBA so palette / luminance images also yield an alpha band.
        alpha = im.convert('RGBA').split()[-1]
        bg.paste(im, mask=alpha)
    else:
        # Opaque palette image: pasting simply expands it to RGB.
        bg.paste(im)
    return bg

In [42]:
# Run the sample image through the whole pipeline without tracking gradients.
with torch.no_grad():
    im = remove_transparency(im)
    # Add the leading batch axis expected by the vision tower.
    x = pre_process(im).reshape(1, 3, 224, 224)
    # Embed the image, then L2-normalise the embedding (F.normalize defaults).
    p = F.normalize(backbone(x))
    # Sigmoid output of the classification head.
    c = model(p)
    print(c)

tensor([[1.]])


In [2]:
import numpy as np
import pandas as pd

In [5]:
# Load the metadata shard of the drawings test set and display it; the saved
# preview shows a single `image_path` column.
df = pd.read_parquet('./nsfw_testset/drawings-test/metadata/metadata_0.parquet')
df

Unnamed: 0,image_path
0,content/drawing/Image_1.jpg
1,content/drawing/Image_10.jpg
2,content/drawing/Image_100.jpg
3,content/drawing/Image_1000.jpg
4,content/drawing/Image_1001.jpg
...,...
848,content/drawing/Image_994.jpg
849,content/drawing/Image_995.jpg
850,content/drawing/Image_997.jpg
851,content/drawing/Image_998.jpg


In [3]:
# Load the precomputed image-embedding shard stored next to the metadata.
# NOTE(review): the saved output shape is (853, 768). A 768-wide embedding fits
# the 768-input NSFWModel variant, not the 512-input one - confirm which head
# these embeddings are meant for.
a = np.load('./nsfw_testset/drawings-test/img_emb/img_emb_0.npy')
a.shape

(853, 768)

In [None]:
# L2 norm of every row of `a` (axis=1 reduces over the feature axis);
# presumably a check of whether the stored embeddings are unit-length - confirm.
np.linalg.norm(a, axis=1)

In [25]:
# Preview the first rows of `df`.
# NOTE(review): the saved output lists `content/nsfw_data_scraper/...` paths,
# unlike the `content/drawing/...` paths shown where `df` is loaded, so `df`
# was probably rebound from another file out of order - confirm.
df.head()

Unnamed: 0,image_path
0,content/nsfw_data_scraper/data/train/drawings/...
1,content/nsfw_data_scraper/data/train/drawings/...
2,content/nsfw_data_scraper/data/train/drawings/...
3,content/nsfw_data_scraper/data/train/drawings/...
4,content/nsfw_data_scraper/data/train/drawings/...


In [None]:
# Scratch cell: re-display the classifier output `c` from the inference cell.
c

In [None]:
# Scratch cell: re-display the metadata frame `df`.
df

In [None]:
a = np.lad