# Поиск fraud - отзывов

**Постановка задачи**:
Пользователи загружают изображения в раздел отзывы. Недобросовестные пользователи используют отзывы для рекламы своих товаров или услуг.

Так могут выглядеть допустимые изображений:


<img src ="https://ml.gan4x4.ru/wb/text/content/not_fraud.png" width="800">

Вот примеры недопустимых изображений:

<img src ="https://ml.gan4x4.ru/wb/text/content/fraud.png" width="800">

Как правило такие изображения содержат контактные данные: номера телефонов, ссылку на сайт, телеграмм аккаунт и.т.п.




**Требуется** :

Разработать алгоритм/програмную систему позволяющую отличить допустимые фото от недопустимых. Сложность  обуславливается тем, что допустимые фотографии тоже могут содержать текст.


**Ограничения**:
Допускается использовать любые модели/алгоритмы/данные находящиеся в свободном доступе.


**Важно!**

Блокнот должен содержать весь необходимый код для запуска финальной модели.
В том числе код для установки сторонних библиотек. Если для запуска требуется подгрузка весов, все ссылки длжны работать не только в вашем аккаунте но и в аккаунте преподавателя. Проверьте что блок для проверки запускается в colab.

## Данные

Датасет состоит из ~20000 изображений. Изображения находящиеся в папке “0” - допустимые. В папке “1” - фродовые. Ниже код для его загрузки.



In [1]:
# Code for data download
!wget https://ml.gan4x4.ru/wb/text/75_20000/student.zip
!unzip student.zip

--2024-07-20 07:30:16--  https://ml.gan4x4.ru/wb/text/75_20000/student.zip
Resolving ml.gan4x4.ru (ml.gan4x4.ru)... 212.24.105.216
Connecting to ml.gan4x4.ru (ml.gan4x4.ru)|212.24.105.216|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3277623512 (3.1G) [application/zip]
Saving to: ‘student.zip’


2024-07-20 07:35:26 (10.1 MB/s) - ‘student.zip’ saved [3277623512/3277623512]

Archive:  student.zip
   creating: student/
   creating: student/0/
  inflating: student/0/157519745.jpg  
  inflating: student/0/166006669.jpg  
  inflating: student/0/167714046.jpg  
  inflating: student/0/169037729.jpg  
  inflating: student/0/169498881.jpg  
  inflating: student/0/136422834.jpg  
  inflating: student/0/134546524.jpg  
  inflating: student/0/167950887.jpg  
  inflating: student/0/165053993.jpg  
  inflating: student/0/166554729.jpg  
  inflating: student/0/169264647.jpg  
  inflating: student/0/167947606.jpg  
  inflating: student/0/160822861.jpg  
  inflating: student/

# Решение

## Import libraries | Set Configs | Implementation classes

In [1]:
# --------------Libraries --------------- #
from glob import glob


import os, gc
import io
from IPython.display import clear_output
from contextlib import redirect_stdout
import time,random
from typing import Optional, Tuple, Union, Callable

import numpy as np
import pandas as pd
import pathlib
import inspect
import matplotlib.pyplot as plt
import cv2
from PIL import Image
from tqdm.notebook import tqdm

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, hamming_loss, ConfusionMatrixDisplay
from mlcm import mlcm


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader , random_split
from torch.utils.data.sampler import SubsetRandomSampler
from pytorch_multilabel_balanced_sampler.samplers import LeastSampledClassSampler

from torchvision.transforms import v2
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torchvision.datasets import vision
from torchsummary import summary

from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks
from torchvision import tv_tensors


import timm
import wandb

from pytorch_lightning import LightningDataModule, LightningModule, Trainer 
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks.early_stopping import EarlyStopping

torch.set_float32_matmul_precision('medium')
print("Done!")

INFO:albumentations.check_version:A new version of Albumentations is available: 1.4.11 (you have 1.4.10). Upgrade using: pip install --upgrade albumentations


Done!


In [2]:
# ---Configuration class contains training configs---- #
#                                                      #
#                  &  model configs                    #
# ---------------------------------------------------- #
class Config:
        BATCH_SIZE = 2
        SEED = 42
        LEARNING_RATE = 0.00878938382303729
        EPS = 0.005663097332083254
        WD = 0.08157514864673669
        TYPE  = "Binary Classification"
        DATA_SOURCE = "binary-image-classification-dataset"
        MODEL_NAME = 'resnetv2_50'
        CRITERION_ = "Binary Cross Entropy"
        OPTIMIZER_ = "AdamW"
        DATA_TYPE = 'image'
        
        EPOCHS = 100
        NUM_WORKERS=8

        PROJECT_NAME='WB intership'
        TASK_NAME='binary-classification'

        IMG_SIZE = (224,224)

        mean = [0.5061, 0.4890, 0.4901]
        std  = [0.4247, 0.4200, 0.4184]
       
        
        def __init__(self):
            print("configuration set!")
        
        def check_cuda(self):
            print("Scanning for CUDA")
            if torch.cuda.is_available():
                print("GPU is available , training will be accelerated! : )\n")
            else:
                print("NO GPUs found : / \n")
        
        def seed_everything(self):
            print("Seeding...")
            np.random.seed(self.SEED)
            random.seed(self.SEED)
            os.environ['PYTHONHASHSEED'] = str(self.SEED)
            torch.manual_seed(self.SEED)
            torch.cuda.manual_seed(self.SEED)
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            print("Seeded everything!")


Config.train_augmentations = v2.Compose([
                v2.RandomHorizontalFlip(p=0.5),v2.RandomVerticalFlip(p=0.5),
                v2.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
                v2.RandomPerspective(distortion_scale=0.4, p=0.5),
                # v2.ElasticTransform(alpha=250.0, sigma=10),
                # v2.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5.)),
                v2.RandomPosterize(bits=2),
            ])


Config.train_transforms = v2.Compose([
                Config.train_augmentations,
                v2.Resize(Config.IMG_SIZE),
                v2.Normalize(Config.mean, Config.std),
])


Config.test_transforms = v2.Compose([
                v2.Resize(Config.IMG_SIZE),
                v2.Normalize(Config.mean, Config.std),
        ])

CFG = Config()


CFG.check_cuda()
CFG.seed_everything()
config = dict(inspect.getmembers(CFG, lambda a:not(inspect.isroutine(a))))

configuration set!
Scanning for CUDA
GPU is available , training will be accelerated! : )

Seeding...
Seeded everything!


In [26]:
class FraudDataset(vision.VisionDataset):
    """`Wildberries Dataset with good and fraud products <https://ml.gan4x4.ru/wb/text/75_20000/student.zip>

    Args:
        root (str or ``pathlib.Path``): Root directory of dataset where directory
            ``student.zip`` exists or will be saved to if download is set to True.
        train (bool, optional): If True, creates dataset from training set, otherwise
            creates from test set.
        transform (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        download (bool, optional): If true, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.

    """

    url = "https://ml.gan4x4.ru/wb/text/75_20000/student.zip"
    filename = "student.zip"
    data_format = '.jpg'
    

    def __init__(
        self,
        root: Union[str, pathlib.Path] = '',
        train: bool = True,
        split: float = 0.3,
        transform: Optional[Callable] = None,
        download: bool = False,
    ) -> None:

        super().__init__(root, transform=transform)
       
        self.good_dir = pathlib.Path(self.root, "./student/0/")
        self.fraud_dir = pathlib.Path(self.root, "./student/1/")
        self.train = train  # training set or test set

        if download:
            self.download()

        if not self._check_integrity():
            raise RuntimeError("Dataset not found or corrupted. You can use download=True to download it")

        good_image_index = glob(str(self.good_dir / "./*"))
        fraud_image_index = glob(str(self.fraud_dir / "./*"))

        self.full_dataset = pd.DataFrame({'path': good_image_index + fraud_image_index, 
                                          'target':[0] * len(good_image_index) + [1] * len(fraud_image_index)})
        # self.full_dataset = self.full_dataset.loc[~self.full_dataset["path"].isin(self.data_sanity_check())]
        
       
        train_data, test_data = random_split(self.full_dataset.sample(frac=1), lengths=[1-split, split])
        self.data = train_data if self.train else test_data
        
        


    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        img = cv2.imread(self.full_dataset.iloc[idx,0])
        img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
        img = img / 255.
        img = torch.Tensor(img).permute([2,1,0]) # convert to HWC
        
        label = torch.tensor(self.full_dataset.iloc[idx, 1], dtype=torch.bool)

        if self.transform:                                
            return self.transform(img), label
        return img, label
    

    def __len__(self) -> int:
        return len(self.data)

    def _check_integrity(self) -> bool:
        return os.path.isdir(self.good_dir) and os.path.isdir(self.fraud_dir)

    def download(self) -> None:
        if self._check_integrity():
            print("Files already downloaded")
            return

        pathlib.Path(self.root).mkdir(parents=True, exist_ok=True)
        os.system(f'wget {FraudDataset.url} -o {pathlib.Path(self.root, self.filename)}')
        os.system(f'unzip {FraudDataset.filename} -d {self.root}')
        

    def extra_repr(self) -> str:
        split = "Train" if self.train is True else "Test"
        return f"Split: {split}"
    
    def data_sanity_check(self):
        """
            this will check each image file for corrupted or missing and 
            returns index of corrupted / missing files .Doing this will
            prevent us from running into any data errors during training phase .
        """
        idx = []
        start = time.time()
        for i in range(len(self.full_dataset)):
            try:#       checks for corrupeted or missing image files
                if len(cv2.imread(self.full_dataset.iloc[i,0])) == 3:
                    _ = 1
            except:
                idx.append(self.full_dataset.iloc[i,0])
        end = time.time()
        print(end-start)
        _ = gc.collect()
        print(idx)
        return idx

In [27]:
class FraudDataModule(LightningDataModule):
    
    def __init__(self, dataset_class: torch.utils.data.Dataset, 
                 batch_size=CFG.BATCH_SIZE, 
                 split=0.3, 
                 train_transform=None,
                 val_transform=None):
        super().__init__()
        
        params = dict(root=pathlib.Path(os.getcwd()) / "content", 
                      download=False)
        
        self.train = dataset_class(train=True, transform=train_transform, split=split, **params)
        dev = dataset_class(train=False, transform=val_transform, split=split, **params)
        self.val, self.test = random_split(dev, lengths=[1-split, split])
        
        self.batch_size = batch_size
        
    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train,
                          batch_size=self.batch_size, num_workers=CFG.NUM_WORKERS)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val.dataset,
                          batch_size=self.batch_size, num_workers=CFG.NUM_WORKERS)
        
    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test,
                                           batch_size=1, num_workers=CFG.NUM_WORKERS)

In [28]:
df = FraudDataModule(FraudDataset, CFG.BATCH_SIZE)

In [6]:
len(df.train), df.train.data.indices

(13866,
 [10054,
  1607,
  2694,
  3517,
  5030,
  631,
  8134,
  14471,
  3558,
  16435,
  7270,
  11892,
  17826,
  5987,
  14915,
  16871,
  1891,
  8148,
  19285,
  12062,
  16773,
  13869,
  6441,
  18514,
  8795,
  9809,
  19585,
  1807,
  17915,
  12024,
  17996,
  6765,
  11307,
  712,
  10795,
  17817,
  2787,
  19474,
  11310,
  6590,
  9378,
  13136,
  14159,
  10462,
  14447,
  2108,
  2463,
  16590,
  16701,
  17980,
  9680,
  14250,
  4218,
  13787,
  4416,
  9943,
  7887,
  4941,
  8286,
  17234,
  3094,
  14089,
  16462,
  7838,
  16934,
  14071,
  12361,
  18197,
  3819,
  13734,
  7619,
  9170,
  6816,
  15236,
  12984,
  19302,
  12603,
  9669,
  10686,
  4465,
  16790,
  12356,
  10028,
  15761,
  4354,
  12879,
  12981,
  14208,
  12723,
  11574,
  7037,
  17458,
  13409,
  7667,
  179,
  5275,
  9417,
  3736,
  5449,
  15156,
  14262,
  11078,
  18893,
  17933,
  13977,
  115,
  4873,
  12996,
  6853,
  17913,
  3617,
  16251,
  18487,
  19724,
  18544,
  18691,
 

In [25]:
target = df.train.full_dataset.iloc[0, 1]
label = torch.tensor(target, dtype=torch.bool)
label

tensor(False)

In [29]:
next(iter(df.val_dataloader()))

[tensor([[[[0.6000, 0.6392, 0.6235,  ..., 0.2431, 0.2510, 0.2471],
           [0.6000, 0.6118, 0.6235,  ..., 0.2275, 0.2235, 0.2078],
           [0.5882, 0.5686, 0.6078,  ..., 0.2235, 0.2157, 0.1922],
           ...,
           [0.9961, 0.9961, 1.0000,  ..., 0.3098, 0.3059, 0.3294],
           [1.0000, 0.9961, 0.9961,  ..., 0.3137, 0.3176, 0.3020],
           [1.0000, 0.9961, 0.9961,  ..., 0.2863, 0.3255, 0.3765]],
 
          [[0.5137, 0.5529, 0.5412,  ..., 0.2235, 0.2314, 0.2275],
           [0.5137, 0.5294, 0.5412,  ..., 0.2078, 0.2039, 0.1882],
           [0.5020, 0.4863, 0.5255,  ..., 0.2039, 0.1961, 0.1725],
           ...,
           [0.8941, 0.8941, 0.8980,  ..., 0.2627, 0.2588, 0.2824],
           [0.9059, 0.9020, 0.9020,  ..., 0.2667, 0.2706, 0.2549],
           [0.9137, 0.9098, 0.9020,  ..., 0.2392, 0.2784, 0.3294]],
 
          [[0.4196, 0.4588, 0.4353,  ..., 0.2000, 0.2078, 0.2039],
           [0.4196, 0.4235, 0.4353,  ..., 0.1843, 0.1804, 0.1647],
           [0.4078, 0.38

In [33]:
count = 0
for batch in df.val_dataloader():
    # print(batch)
    print(batch)

print(count)

[tensor([[[[0.6000, 0.6392, 0.6235,  ..., 0.2431, 0.2510, 0.2471],
          [0.6000, 0.6118, 0.6235,  ..., 0.2275, 0.2235, 0.2078],
          [0.5882, 0.5686, 0.6078,  ..., 0.2235, 0.2157, 0.1922],
          ...,
          [0.9961, 0.9961, 1.0000,  ..., 0.3098, 0.3059, 0.3294],
          [1.0000, 0.9961, 0.9961,  ..., 0.3137, 0.3176, 0.3020],
          [1.0000, 0.9961, 0.9961,  ..., 0.2863, 0.3255, 0.3765]],

         [[0.5137, 0.5529, 0.5412,  ..., 0.2235, 0.2314, 0.2275],
          [0.5137, 0.5294, 0.5412,  ..., 0.2078, 0.2039, 0.1882],
          [0.5020, 0.4863, 0.5255,  ..., 0.2039, 0.1961, 0.1725],
          ...,
          [0.8941, 0.8941, 0.8980,  ..., 0.2627, 0.2588, 0.2824],
          [0.9059, 0.9020, 0.9020,  ..., 0.2667, 0.2706, 0.2549],
          [0.9137, 0.9098, 0.9020,  ..., 0.2392, 0.2784, 0.3294]],

         [[0.4196, 0.4588, 0.4353,  ..., 0.2000, 0.2078, 0.2039],
          [0.4196, 0.4235, 0.4353,  ..., 0.1843, 0.1804, 0.1647],
          [0.4078, 0.3804, 0.4118,  ..., 

RuntimeError: Caught RuntimeError in DataLoader worker process 1.
Original Traceback (most recent call last):
  File "/home/prog3/.cache/pypoetry/virtualenvs/intership-opmGnuy9-py3.10/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 308, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/home/prog3/.cache/pypoetry/virtualenvs/intership-opmGnuy9-py3.10/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 54, in fetch
    return self.collate_fn(data)
  File "/home/prog3/.cache/pypoetry/virtualenvs/intership-opmGnuy9-py3.10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 316, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/home/prog3/.cache/pypoetry/virtualenvs/intership-opmGnuy9-py3.10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 173, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/prog3/.cache/pypoetry/virtualenvs/intership-opmGnuy9-py3.10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 173, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/home/prog3/.cache/pypoetry/virtualenvs/intership-opmGnuy9-py3.10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 141, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/home/prog3/.cache/pypoetry/virtualenvs/intership-opmGnuy9-py3.10/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 212, in collate_tensor_fn
    out = elem.new(storage).resize_(len(batch), *list(elem.size()))
RuntimeError: Trying to resize storage that is not resizable


In [76]:
len(df.val.indices), df.val.indices

(4160,
 [5428,
  4529,
  5762,
  1302,
  2648,
  2771,
  3128,
  825,
  4907,
  1449,
  4023,
  5746,
  2698,
  45,
  5285,
  3436,
  4981,
  1996,
  1202,
  1307,
  855,
  3881,
  866,
  1098,
  4568,
  2916,
  1532,
  2078,
  781,
  1505,
  1527,
  1263,
  821,
  5613,
  2924,
  1543,
  1963,
  3918,
  5805,
  165,
  3155,
  3367,
  4264,
  2380,
  5143,
  3160,
  5098,
  4396,
  1251,
  4101,
  3911,
  5195,
  4740,
  2797,
  40,
  2316,
  4151,
  2467,
  670,
  2030,
  3677,
  2283,
  3017,
  3844,
  362,
  2110,
  4416,
  614,
  2154,
  5893,
  4480,
  2895,
  4434,
  120,
  4033,
  3835,
  1315,
  4713,
  5641,
  4374,
  5844,
  3837,
  4935,
  4134,
  1488,
  3177,
  1104,
  3723,
  2074,
  5430,
  2667,
  3799,
  3779,
  191,
  3680,
  228,
  4984,
  5257,
  4621,
  3305,
  3573,
  5889,
  2568,
  763,
  4751,
  5917,
  549,
  5683,
  5662,
  1397,
  5212,
  5787,
  1799,
  2936,
  3894,
  4906,
  617,
  2814,
  4194,
  3222,
  3295,
  2266,
  5494,
  837,
  3629,
  1329,
  981

In [77]:
len(df.test.indices), df.test.indices

(1782,
 [534,
  442,
  3413,
  4211,
  223,
  3986,
  3957,
  5416,
  3279,
  5619,
  5046,
  5245,
  4973,
  4611,
  1477,
  4352,
  2175,
  4579,
  3826,
  5255,
  840,
  3389,
  5595,
  5340,
  1826,
  3441,
  5497,
  287,
  576,
  5883,
  4894,
  2965,
  2831,
  5305,
  5821,
  619,
  388,
  3477,
  5536,
  1920,
  5365,
  2490,
  4247,
  2143,
  1109,
  1028,
  5823,
  4769,
  3435,
  5496,
  4594,
  3399,
  131,
  4932,
  1460,
  593,
  2935,
  3358,
  3633,
  4438,
  5361,
  3903,
  2420,
  5468,
  5527,
  2484,
  2872,
  3000,
  3865,
  2501,
  5429,
  1716,
  5133,
  5397,
  3858,
  2116,
  2841,
  4231,
  1561,
  4346,
  3171,
  2839,
  3353,
  5047,
  4887,
  401,
  5523,
  1159,
  482,
  2026,
  1005,
  1762,
  4066,
  1648,
  5370,
  2833,
  181,
  2394,
  5150,
  2910,
  1565,
  5388,
  598,
  4591,
  4393,
  3359,
  1434,
  5007,
  5930,
  4992,
  5596,
  350,
  5437,
  923,
  2069,
  798,
  1812,
  5320,
  4691,
  3212,
  3921,
  143,
  3772,
  257,
  3472,
  3151,
  22

In [78]:
len(df.train) + len(df.val.indices) + len(df.test.indices)

19808

In [79]:
df.train.full_dataset

Unnamed: 0,path,target
0,/home/prog3/notebooks/intership/ex3/content/st...,0
1,/home/prog3/notebooks/intership/ex3/content/st...,0
2,/home/prog3/notebooks/intership/ex3/content/st...,0
3,/home/prog3/notebooks/intership/ex3/content/st...,0
4,/home/prog3/notebooks/intership/ex3/content/st...,0
...,...,...
19803,/home/prog3/notebooks/intership/ex3/content/st...,1
19804,/home/prog3/notebooks/intership/ex3/content/st...,1
19805,/home/prog3/notebooks/intership/ex3/content/st...,1
19806,/home/prog3/notebooks/intership/ex3/content/st...,1


## Вывод

...

# Блок для проверки

Поместите сюда весь необходимый код для тестирования вашей модели на новых данных. Убедитесь что

- Импортируются все библиотеки и классы
- Подгружабтся веса с внешних ресурсов
- Происходит рассчет метрик
...

In [None]:
# path_to_test = "/content/test"

# Put your code here