# Подготовка датасета

In [2]:
import pandas as pd
from pathlib import Path
from PIL import Image
from sklearn.model_selection import train_test_split
import shutil

## Скачиваем архив с изображениями

In [1]:
# Download the image archive; -c resumes a partially downloaded all.zip
# instead of starting over (the log below shows one such resume).
!wget -c https://forecasting.iszf.irk.ru/data/12000/all.zip

--2025-06-24 15:13:44--  https://forecasting.iszf.irk.ru/data/12000/all.zip
Resolving forecasting.iszf.irk.ru (forecasting.iszf.irk.ru)... 84.237.21.36
Connecting to forecasting.iszf.irk.ru (forecasting.iszf.irk.ru)|84.237.21.36|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2094376138 (2.0G) [application/zip]
Saving to: ‘all.zip’


2025-06-24 15:17:38 (4.46 MB/s) - Connection closed at byte 1082134933. Retrying.

--2025-06-24 15:17:39--  (try: 2)  https://forecasting.iszf.irk.ru/data/12000/all.zip
Connecting to forecasting.iszf.irk.ru (forecasting.iszf.irk.ru)|84.237.21.36|:443... connected.
HTTP request sent, awaiting response... 206 Partial Content
Length: 2094376138 (2.0G), 1012241205 (965M) remaining [application/zip]
Saving to: ‘all.zip’


2025-06-24 15:20:26 (5.81 MB/s) - ‘all.zip’ saved [2094376138/2094376138]



In [3]:
# Create the target directory (no error if it already exists), then unpack
# the archive into it (-qq: suppress output, -d: extraction directory).
!mkdir -p data/12000Mhz
!unzip -qqd data/12000Mhz all.zip

## Работа с dataset_12000.csv, где находится информация об изображениях

In [6]:
# Load the metadata table with one row per image; the cells below use its
# 'Name' and 'Category' columns.
data = pd.read_csv('dataset_12000.csv')

def fix_name(filename: str) -> str:
    """Convert a `Name` entry from the CSV into a PNG file name.

    Every ':' in `filename` is replaced with '-' and the '.png' extension
    is appended to the result.
    """
    dashed = '-'.join(filename.split(':'))
    return f'{dashed}.png'

# Rewrite every entry of the 'Name' column with fix_name, element by element.
data['Name'] = data['Name'].map(fix_name)

In [7]:
# Preview the first rows of the table, including the rewritten 'Name' values.
data.head()

Unnamed: 0,Name,Date,Time,URL,Category,Probability,Color
0,srh_20240410T003117_12200_I.fit.png,2024-04-10,--:--:--,https://ftp.rao.istp.ac.ru/SRH/SRH1224/cleanMa...,Ok,1,red
1,srh_20240410T022837_12200_I.fit.png,2024-04-10,--:--:--,https://ftp.rao.istp.ac.ru/SRH/SRH1224/cleanMa...,Ok,1,red
2,srh_20240410T034342_12200_I.fit.png,2024-04-10,--:--:--,https://ftp.rao.istp.ac.ru/SRH/SRH1224/cleanMa...,Ok,1,red
3,srh_20240410T041302_12200_I.fit.png,2024-04-10,--:--:--,https://ftp.rao.istp.ac.ru/SRH/SRH1224/cleanMa...,Bad,1,green
4,srh_20240410T051253_12200_I.fit.png,2024-04-10,--:--:--,https://ftp.rao.istp.ac.ru/SRH/SRH1224/cleanMa...,Ok,1,red


## Проверка на поврежденные изображения

In [8]:
# Directory the archive was unpacked into; file names from the CSV are
# resolved relative to it.
data_dir = Path('data/12000Mhz')


def is_image_corrupted(filename: str) -> bool:
    """Tell whether the image `filename` under `data_dir` is unusable.

    Returns True when building the path, opening the file, or Pillow's
    `verify()` raises any exception, and False otherwise. A file that is
    missing from `data_dir` is therefore also reported as corrupted.
    """
    try:
        with Image.open(data_dir / filename) as picture:
            picture.verify()
    except Exception:
        # Deliberately broad: any failure means the image cannot be used.
        return True
    else:
        return False


# Flag every row, then split the table into unusable and usable images.
data['Corrupted'] = data['Name'].apply(is_image_corrupted)

corrupted, clean = data.loc[data['Corrupted']], data.loc[~data['Corrupted']]

## Подсчет поврежденных изображений по категориям

In [10]:
# Count the corrupted rows per label by summing a boolean mask.
num_bad = int((corrupted['Category'] == 'Bad').sum())
num_ok = int((corrupted['Category'] == 'Ok').sum())
print(f'Повреждено {num_bad} файлов Bad и {num_ok} файлов Ok')

Повреждено 376 файлов Bad и 6 файлов Ok


## Балансировка классов

In [12]:
# File names of the verified (non-corrupted) images for each label, in CSV order.
ok_images = clean.loc[clean['Category'] == 'Ok', 'Name'].tolist()
bad_images = clean.loc[clean['Category'] == 'Bad', 'Name'].tolist()

# Undersample to the size of the smaller class so both labels are equally
# represented.
class_size = min(len(ok_images), len(bad_images))
print(f'Размеры классов Bad и Ok - {class_size}')

# Draw the subset at random with a fixed seed instead of taking the first
# `class_size` rows: the CSV appears to be ordered by observation time (see
# data.head()), so slicing would keep only the earliest images of the larger
# class. The fixed seed keeps the selection identical between runs.
X = (
    pd.Series(ok_images, dtype=object).sample(n=class_size, random_state=42).tolist()
    + pd.Series(bad_images, dtype=object).sample(n=class_size, random_state=42).tolist()
)
# Labels aligned with X: first the 'Ok' block, then the 'Bad' block.
Y = ['Ok'] * class_size + ['Bad'] * class_size

Размеры классов Bed и Ok - 3003


## Разделение на train/val/test

In [13]:
# Fixed seed so that train/val/test membership is identical on every run;
# without it each execution produces a different split and the reported
# metrics cannot be reproduced.
SPLIT_SEED = 42

# First hold out 40% of the data, then halve that part into validation and
# test, giving 60/20/20 overall. Both steps are stratified by label so every
# subset keeps the Ok/Bad balance.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y, test_size=0.4, stratify=Y, random_state=SPLIT_SEED
)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp, test_size=0.5, stratify=Y_temp, random_state=SPLIT_SEED
)

# Split name -> (file names, labels); consumed when the files are moved
# into per-split folders.
splits = {
    'train': (X_train, Y_train),
    'val': (X_val, Y_val),
    'test': (X_test, Y_test)
}

## Распределение изображений по папкам

In [14]:
# Lay the images out as <data_dir>/<split>/<class>/<file name>.
for split_name, (file_names, labels) in splits.items():
    # Make sure both class folders exist for this split.
    for class_name in ('Ok', 'Bad'):
        (data_dir / split_name / class_name).mkdir(parents=True, exist_ok=True)

    for file_name, class_name in zip(file_names, labels):
        source = data_dir / file_name
        if not source.exists():
            # File is not present in data_dir: skip it without raising.
            continue
        target = data_dir / split_name / class_name / file_name
        shutil.move(str(source), str(target))

## Удаление оставшихся файлов

In [15]:
# Regular files still sitting directly in data_dir were not moved into any
# split folder; delete them. Sub-directories are left untouched.
leftovers = [entry for entry in data_dir.iterdir() if entry.is_file()]
for entry in leftovers:
    entry.unlink()

# Обучение модели

In [12]:
# Change into the project directory.
# NOTE(review): absolute path (looks like a Colab workspace) — adjust when
# running elsewhere.
%cd /content/srh-classifier/

/content/srh-classifier


In [13]:
# Run the project's example script; the recorded output below is a training
# log (per-epoch train/val loss and accuracy), cut off during epoch 5.
!python example.py

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100% 20.5M/20.5M [00:00<00:00, 225MB/s]
Epoch 1/50
----------
train: 100% 113/113 [00:38<00:00,  2.96it/s]
train Loss: 0.5941 Acc: 0.6761
val: 100% 38/38 [00:13<00:00,  2.92it/s]
val Loss: 0.8731 Acc: 0.4729
Epoch 2/50
----------
train: 100% 113/113 [00:36<00:00,  3.08it/s]
train Loss: 0.5563 Acc: 0.7002
val: 100% 38/38 [00:12<00:00,  2.93it/s]
val Loss: 0.8854 Acc: 0.4246
Epoch 3/50
----------
train: 100% 113/113 [00:36<00:00,  3.06it/s]
train Loss: 0.5631 Acc: 0.6983
val: 100% 38/38 [00:12<00:00,  2.94it/s]
val Loss: 0.8888 Acc: 0.4238
Epoch 4/50
----------
train: 100% 113/113 [00:36<00:00,  3.08it/s]
train Loss: 0.5464 Acc: 0.7019
val: 100% 38/38 [00:12<00:00,  2.94it/s]
val Loss: 0.9487 Acc: 0.5071
Epoch 5/50
----------
train: 100% 113/113 [00:35<00:00,  3.14it/s]
train Loss: 0.5414 Acc: 0.7172
val: 100% 38/38 [00:13<

In [16]:
# Same command as the training cell, but the recorded output shows an
# evaluation pass and a figure.
# NOTE(review): presumably example.py was edited between runs — confirm.
!python example.py

Evaluating: 100% 38/38 [00:13<00:00,  2.91it/s]
Figure(1000x1000)


In [17]:
# Same command again; this time the recorded output shows a .fit file being
# downloaded.
# NOTE(review): presumably example.py was edited between runs — confirm.
!python example.py

[0;39mDownloading https://ftp.rao.istp.ac.ru/SRH/SRH0306/cleanMaps/20210712/srh_I_2021-07-12T01:59:42_3100.fit[0m
