### Алгоритм создания MVP-проекта   
#### 1 часть: подготовка к работе   
1.1. Установим необходимые библиотеки

In [1]:
%%bash
pip install numpy scipy scikit-image matplotlib



In [2]:
pip install pyTelegramBotAPI



In [3]:
pip install SpeechRecognition



In [4]:
pip install pydub



In [5]:
pip install pydub opencv-python-headless opuslib




1.2. Скачиваем выбранную CCS-модель

In [6]:
import torch
precision = 'fp32'
ssd_model = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd', model_math=precision)
utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_ssd_processing_utils')

Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Using cache found in /root/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


In [7]:
ssd_model.to('cuda')
ssd_model.eval()

SSD300(
  (feature_extractor): ResNet(
    (feature_extractor): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (4): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplac

1.3. Монтируем Google-disk, на котором будем хранить вспомогательные материалы

In [8]:
from google.colab import drive
drive.mount('/content/drive')
torch.save(ssd_model.state_dict(), 'ssd_model.pth')
#torch.save(utils, 'ssd_utils.pth')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


1.4. Подгружаем необходимые библиотеки

In [9]:
import os
import time
import json
import webbrowser
from tqdm import tqdm
import subprocess
from pydub import AudioSegment
import telebot
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import speech_recognition as sr
import torchvision.transforms as T


#### 2 часть: реализация логики, которая будет обрабатывать данные, полученные от модели

In [22]:
def filter_foto(uri):
    inputs = [utils.prepare_input(uri)]
    tensor = utils.prepare_tensor(inputs, precision == 'fp16')
    with torch.no_grad():
        detections_batch = ssd_model(tensor)
    results_per_input = utils.decode_results(detections_batch)
    best_results_per_input = [utils.pick_best(results, 0.20) for results in results_per_input]
    classes_to_labels = utils.get_coco_object_dictionary()

    for image_idx in range(len(best_results_per_input)):
        fig, ax = plt.subplots(1)
        image = inputs[image_idx] / 2 + 0.5
        ax.imshow(image)
        bboxes, classes, confidences = best_results_per_input[image_idx]
        for idx in range(len(bboxes)):
            left, bot, right, top = bboxes[idx]
            x, y, w, h = [val * 300 for val in [left, bot, right - left, top - bot]]
            rect = patches.Rectangle((x, y), w, h, linewidth=1, edgecolor='r', facecolor='none')
            ax.add_patch(rect)
            ax.text(x, y, "{} {:.0f}%".format(classes_to_labels[classes[idx] - 1], confidences[idx]*100), bbox=dict(facecolor='white', alpha=0.5))

    fig.canvas.draw()
    processed_image = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
    processed_image = processed_image.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    ax.clear()
    plt.close(fig)

    return processed_image



In [23]:
def process_video(input_video_path, output_video_path):
    cap = cv2.VideoCapture(input_video_path)
    frame_count = 0
    frames = []

    os.makedirs(output_video_path, exist_ok=True)

    # Reuse figure and axes
    fig, ax = plt.subplots(1)

    for frame_num in tqdm(range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)))):
        ret, frame = cap.read()

        if not ret:
            break

        frame_filename = os.path.join(output_video_path, f"frame_{frame_count}.jpg")
        output_filename = os.path.join(output_video_path, f"output_frame_{frame_count}.jpg")

        cv2.imwrite(frame_filename, frame)
        processed_image = filter_foto(frame_filename)

        if processed_image is not None:
            frames.append(processed_image)

        frame_count += 1

    cap.release()

    height, width, layers = frames[0].shape
    video = cv2.VideoWriter(output_video_path + "/video.mp4", cv2.VideoWriter_fourcc(*"mp4v"), 1, (width, height))

    for frame in frames:
        video.write(frame)

    cv2.destroyAllWindows()
    video.release()

    return frame_count

In [None]:
input_video_path = "/content/drive/MyDrive/Colab Notebooks/dl_project/babycats.mp4"
output_video_path = "/content/drive/MyDrive/Colab Notebooks/dl_project/output_frames"
total_frames = process_video(input_video_path, output_video_path)

print(f"Total frames processed: {total_frames}")

  0%|          | 0/10739 [00:00<?, ?it/s]

Downloading COCO annotations.
Downloading finished.


 30%|██▉       | 3193/10739 [24:49<58:40,  2.14it/s]    


KeyboardInterrupt: ignored

In [17]:
uri = "/content/drive/MyDrive/Colab Notebooks/dl_project/pic.jpg"
output_uri = "/content/drive/MyDrive/Colab Notebooks/dl_project/filtered_image.jpg"
filter_foto(uri, ssd_model)

RuntimeError: ignored