In [35]:
# Standard library imports
import os
import time
from pathlib import Path, PurePath

# Third-party library imports
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
import librosa
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.io import gfile
from scipy.fftpack import dct
from scipy.spatial.distance import cosine, euclidean
import moviepy.video.io.VideoFileClip as mp
from sklearn.metrics.pairwise import cosine_similarity

# PyTorch imports
import torch
import torch.nn.functional as F
import torchvision.transforms as transforms

# Segmentation Models PyTorch import
import segmentation_models_pytorch as smp

In [32]:
test_video = r'C:\Users\Vladimir\PycharmProjects\ML\Kaggle\yappy\test_dataset_test_data_yappy\test_data_yappy\test_dataset\49577a11-51b9-490a-b1f0-df17335219de.mp4'
test_video2 = r'C:\Users\Vladimir\PycharmProjects\ML\Kaggle\yappy\test_dataset_test_data_yappy\test_data_yappy\test_dataset\da9783ba-ceac-47ed-9d8f-30b614e938dd.mp4'
print(test_video)
print(test_video2)

C:\Users\Vladimir\PycharmProjects\ML\Kaggle\yappy\test_dataset_test_data_yappy\test_data_yappy\test_dataset\49577a11-51b9-490a-b1f0-df17335219de.mp4
C:\Users\Vladimir\PycharmProjects\ML\Kaggle\yappy\test_dataset_test_data_yappy\test_data_yappy\test_dataset\da9783ba-ceac-47ed-9d8f-30b614e938dd.mp4


In [46]:
class ResNetEncoder():
    def __init__(self):
        self.model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

    # 1. Извлечение кадров из видео
    def extract_frames(self, video_path, max_frames=10):
        cap = cv2.VideoCapture(video_path)
        frames = []
        success, frame = cap.read()
        count = 0
        while success and count < max_frames:
            frames.append(frame)
            success, frame = cap.read()
            count += 1
        cap.release()
        return frames

    # 2. Извлечение гистограммы цветов
    def extract_color_histogram(self, frame):
        hsv_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
        hist = cv2.calcHist([hsv_frame], [0, 1, 2], None, [50, 60, 70], [0, 180, 0, 256, 0, 256])
        hist = cv2.normalize(hist, hist).flatten()
        return hist

    # 3. Извлечение признаков с помощью предобученной модели ResNet50
    def extract_deep_features(self, frame):
        frame_resized = cv2.resize(frame, (224, 224))
        frame_preprocessed = preprocess_input(np.expand_dims(frame_resized, axis=0))
        features = self.model.predict(frame_preprocessed, verbose=0)
        return features.flatten()

    # 4. Комбинирование всех признаков (гистограмма цветов, ResNet, ORB)
    def extract_combined_features(self, frame):
        color_hist = self.extract_color_histogram(frame)
        deep_features = self.extract_deep_features(frame)

        # Объединение всех признаков в один вектор
        combined_features = np.hstack((color_hist, deep_features))
        return combined_features

    # 5. Извлечение кадров из видео и получение средних признаков
    def encode(self, video_path, max_frames=10):
        frames = self.extract_frames(video_path, max_frames)
        combined_features_list = [self.extract_combined_features(frame) for frame in frames]

        # Усредняем признаки по кадрам
        average_features = np.mean(combined_features_list, axis=0)
        return average_features

    def get_distance(self, emb1, emb2):
        similarity = 1 - cosine(emb1, emb2)
        return similarity

In [47]:
ResNetEnc = ResNetEncoder()

In [48]:
emb1 = ResNetEnc.encode(test_video)
emb2 = ResNetEnc.encode(test_video2)
ResNetEnc.get_distance(emb1, emb2)

0.9795281887054443

In [72]:
class InceptionEncoder():
    def __init__(self):
        self.model = tf.keras.applications.InceptionV3(weights='imagenet', include_top=False, pooling='avg')

    def frames_extract(self, video_file, start_off=0, sampling_fps=1, max_frame_cnt=60):
        """Extracts frames from input video."""
        if not os.path.exists(video_file):
            return None

        vcap = cv2.VideoCapture(video_file)
        fps = vcap.get(cv2.CAP_PROP_FPS)
        frame_interval = int(fps / sampling_fps) if sampling_fps > 0 and sampling_fps < fps else 1
        frame_list, cnt = [], 0

        success, im = vcap.read()
        while success and cnt < max_frame_cnt:
            if cnt % frame_interval == 0:
                frame_list.append(im)
            cnt += 1
            success, im = vcap.read()

        return np.array(frame_list) if frame_list else None

    def feature_from_single_image_file(self, image_file):
        """Extract feature vector from a single image file."""
        if not gfile.exists(image_file):
            print(f"File does not exist: {image_file}")
            return None

        image_data = gfile.GFile(image_file, 'rb').read()
        image_tensor = tf.image.decode_jpeg(image_data, channels=3)
        image_tensor = tf.image.resize(image_tensor, (299, 299))  # Inception-v3 input size

        image_tensor = tf.expand_dims(image_tensor, axis=0)
        image_tensor = image_tensor / 255.0

        print(self.model.signatures['serving_default'](image_tensor))
        feature_tensor = self.model.signatures['serving_default'](image_tensor)['pool_3']
        return np.squeeze(feature_tensor.numpy())

    def feature_from_single_video_file(self, video_file, start_off=0, sampling_fps=1, max_frame_cnt=60, padding=True):
        """Extract feature vectors from a video file."""
        if not gfile.exists(video_file):
            print(f"File does not exist: {video_file}")
            return None

        frames = self.frames_extract(video_file, start_off, sampling_fps, max_frame_cnt)
        if frames is None:
            return None

        features = []
        for frame in frames:
            frame_tensor = tf.convert_to_tensor(frame, dtype=tf.uint8)
            frame_tensor = tf.image.resize(frame_tensor, (299, 299))  # Inception-v3 input size
            frame_tensor = tf.expand_dims(frame_tensor, axis=0)
            frame_tensor = frame_tensor / 255.0

            # Получаем эмбеддинги напрямую через вызов модели
            feature_tensor = self.model(frame_tensor)
            features.append(np.squeeze(feature_tensor.numpy()))

        # Padding if necessary
        if padding and max_frame_cnt > len(features):
            zero_feat = np.zeros([2048], dtype=np.float32)
            features.extend([zero_feat] * (max_frame_cnt - len(features)))

        return np.array(features)

    def encode(self, video_path):
        video_features = self.feature_from_single_video_file(video_path, sampling_fps=1, max_frame_cnt=60, padding=True)
        mean_features = np.mean(video_features, axis=0)
        return mean_features

    def get_distance(self, emb1, emb2):
        similarity = cosine_similarity([emb1], [emb2])[0][0]
        return similarity

    def get_evc_distance(self, emb1, emb2):
        similarity = euclidean(emb1, emb2)
        return similarity

In [73]:
InceptionEnc = InceptionEncoder()
InceptionEnc.encode(test_video)

array([0.00658599, 0.00709436, 0.00497256, ..., 0.01663911, 0.01624956,
       0.00550472], dtype=float32)

In [74]:
emb1 = InceptionEnc.encode(test_video)
emb2 = InceptionEnc.encode(test_video2)
InceptionEnc.get_evc_distance(emb1, emb2)

0.18608391284942627

In [57]:
class AudioEncoder():
    def encode(self, video_path):
        video = mp.VideoFileClip(video_path)
        if video.audio is None:
            print(video_path)
            return np.zeros((128,))
        video.audio.write_audiofile("audio1.wav")
        y1, sr1 = librosa.load("audio1.wav", sr=None)
        S1 = librosa.feature.melspectrogram(y=y1, sr=sr1)
        mean_spectrum1 = np.mean(S1, axis=1)
        return mean_spectrum1

    def get_distance(self, spect1, spect2):
        similarity = 1 - cosine(spect1, spect2)
        return similarity

In [59]:
AudioEnc = AudioEncoder()
emb1 = AudioEnc.encode(test_video)
emb2 = AudioEnc.encode(test_video2)
print(emb1.shape)
AudioEnc.get_distance(emb1, emb2)

MoviePy - Writing audio in audio1.wav


                                                        

MoviePy - Done.




MoviePy - Writing audio in audio1.wav


                                                        

MoviePy - Done.
(128,)




0.7156458497047424

In [29]:
class UnetEncoder():
    def __init__(self):
        self.transform = transforms.Compose([transforms.Resize((256, 256))])
        unet_model = smp.Unet(
            encoder_name="resnet34",
            encoder_weights="imagenet",
            in_channels=3,
            classes=1
        )
        self.encoder = unet_model.encoder

    def frames_extract(self, video_file, sampling_fps=1, max_frame_cnt=60):
        """Извлекает кадры из видео с использованием OpenCV."""
        vcap = cv2.VideoCapture(video_file)
        fps = vcap.get(cv2.CAP_PROP_FPS)
        frame_interval = int(fps / sampling_fps)
        frames = []

        success, frame = vcap.read()
        count = 0

        while success and count < max_frame_cnt:
            if count % frame_interval == 0:
                frames.append(frame)
            success, frame = vcap.read()
            count += 1

        vcap.release()
        return frames

    def encode(self, video_path):
        frames = self.frames_extract(video_path)
        features = []

        for frame in frames:
            frame_tensor = self.transform(torch.Tensor(frame).permute(2, 0, 1)).unsqueeze(0)
            with torch.no_grad():
                feature = self.encoder(frame_tensor)[-1]  # Берем последний слой энкодера
            features.append(feature.squeeze(0))

        # Усредняем признаки по всем кадрам
        return torch.mean(torch.stack(features), dim=0)

    def get_distance(self, feature1, feature2):
        cosine_sim = F.cosine_similarity(feature1.view(1, -1), feature2.view(1, -1))
        return 1 - cosine_sim.item()

In [30]:
UnetEnc = UnetEncoder()
emb1 = UnetEnc.encode(test_video)
emb2 = UnetEnc.encode(test_video2)
UnetEnc.get_distance(emb1, emb2)

Downloading: "https://download.pytorch.org/models/resnet34-333f7ec4.pth" to C:\Users\Vladimir/.cache\torch\hub\checkpoints\resnet34-333f7ec4.pth
100%|██████████| 83.3M/83.3M [00:18<00:00, 4.72MB/s]


0.2974700927734375

In [67]:
class LengthEncoder:
    def encode(self, video_file):
        cap = cv2.VideoCapture(video_file)
        fps = cap.get(cv2.CAP_PROP_FPS)  # Количество кадров в секунду
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))  # Общее количество кадров
        cap.release()
        if fps > 0:  # Если fps корректный, считаем длину
            video_length = frame_count / fps  # Длина видео в секундах
        else:
            video_length = 0  # Если fps не найден, длина видео = 0
        return video_length

    # Функция для расчета длины видео и их соотношения
    def get_distance(self, length_1, length_2):
        length_diff = abs(length_1 - length_2)
        max_length = max(length_1, length_2)

        # Избегаем деления на 0
        if max_length > 0:
            length_ratio = length_diff / max_length
        else:
            length_ratio = 0

        return length_ratio

In [68]:
LengthEnc = LengthEncoder()
emb1 = LengthEnc.encode(test_video)
emb2 = LengthEnc.encode(test_video2)
LengthEnc.get_distance(emb1, emb2)

0.0

In [33]:
test = pd.read_csv('test_dataset_test_data_yappy/test_data_yappy/test.csv')
# test = pd.read_csv('test_clean.csv')
test.head()

Unnamed: 0,created,uuid,link
0,2024-06-01 01:37:57,49577a11-51b9-490a-b1f0-df17335219de,https://s3.ritm.media/yappy-db-duplicates/4957...
1,2024-06-01 04:42:10,4e1f7fad-5008-4216-9849-550a00f1e35f,https://s3.ritm.media/yappy-db-duplicates/4e1f...
2,2024-06-01 08:44:48,337fdbe6-2bc7-4bc7-931e-d94ada927ede,https://s3.ritm.media/yappy-db-duplicates/337f...
3,2024-06-01 10:11:48,35138a88-0249-405e-91b4-8a36b1e2e730,https://s3.ritm.media/yappy-db-duplicates/3513...
4,2024-06-01 12:23:29,322f4312-3d46-401b-8cd9-80a0d06347ed,https://s3.ritm.media/yappy-db-duplicates/322f...


In [36]:
path_to_test_data = r'C:\Users\Vladimir\PycharmProjects\ML\Kaggle\yappy\test_dataset_test_data_yappy\test_data_yappy\test_dataset'

test['video_path'] = test['uuid'].apply(lambda uuid: f"{path_to_test_data}\\{uuid}.mp4")
tqdm.pandas()
test['embeddings'] = test['video_path'].progress_apply(InceptionEnc.encode)

100%|██████████| 1000/1000 [10:55<00:00,  1.53it/s]


In [101]:
n = test.shape[0]
test['is_duplicate'] = [False] * n
test['duplicate_for'] = [np.nan] * n

In [43]:
test.head()

Unnamed: 0,created,uuid,link,video_path,embeddings,is_duplicate,duplicate_for
0,2024-06-01 01:37:57,49577a11-51b9-490a-b1f0-df17335219de,https://s3.ritm.media/yappy-db-duplicates/4957...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.0065859933, 0.0070943553, 0.0049725627, 0.0...",False,
1,2024-06-01 04:42:10,4e1f7fad-5008-4216-9849-550a00f1e35f,https://s3.ritm.media/yappy-db-duplicates/4e1f...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.013478812, 0.01082532, 0.027198287, 0.00505...",False,
2,2024-06-01 08:44:48,337fdbe6-2bc7-4bc7-931e-d94ada927ede,https://s3.ritm.media/yappy-db-duplicates/337f...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.003774149, 0.004182019, 0.0034305027, 0.014...",False,
3,2024-06-01 10:11:48,35138a88-0249-405e-91b4-8a36b1e2e730,https://s3.ritm.media/yappy-db-duplicates/3513...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.0074166446, 0.010031568, 0.02124418, 0.0043...",False,
4,2024-06-01 12:23:29,322f4312-3d46-401b-8cd9-80a0d06347ed,https://s3.ritm.media/yappy-db-duplicates/322f...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.0141136665, 0.03179764, 0.034023765, 0.0206...",False,


In [None]:
ResNetEnc

In [50]:
test['UnetEnc_embeddings'] = test['video_path'].progress_apply(UnetEnc.encode)

100%|██████████| 1000/1000 [03:41<00:00,  4.51it/s]


In [None]:
test['AudioEnc_embeddings'] = test['video_path'].apply(AudioEnc.encode)

In [61]:
test['length'] = test['video_path'].progress_apply(LengthEnc.encode)

100%|██████████| 1000/1000 [00:09<00:00, 100.31it/s]


In [63]:
test.to_csv('test_embeddings.csv', index=False)

In [64]:
from sklearn.neighbors import NearestNeighbors

embeddings_matrix = np.stack(test['embeddings'].values)

knn = NearestNeighbors(n_neighbors=5, metric='cosine')  # Можно выбрать метрику (например, 'euclidean')
knn.fit(embeddings_matrix)

In [76]:
import pickle

filename = 'models/linear_model.pickle'

with open(filename, 'rb') as file:
    lr = pickle.load(file)

In [102]:
test['created'] = pd.to_datetime(test['created'])

In [104]:
i = 0
for index, row1 in test.iterrows():
    if index < 5: continue

    distances, indices = knn.kneighbors([row1['embeddings']])
    # Получение информации о самых похожих видео
    nearest_videos = test.iloc[indices[0]]
    uuids = nearest_videos['uuid']
    nearest_distances = distances[0]

    cur_metrics = nearest_videos[['created', 'uuid', 'embeddings', 'AudioEnc_embeddings', 'UnetEnc_embeddings', 'length']]

    first_row = cur_metrics.iloc[0]
    
    # Создаем список для хранения новых строк
    new_rows = []

    # Проходим по всем строкам, начиная со второй
    for _, row in cur_metrics.iloc[1:].iterrows():
        new_row = {
            'created': row['created'],
            'uuid': row['uuid'],
            'Inception_cosin': InceptionEnc.get_distance(first_row['embeddings'], row['embeddings']),
            'Inception_eucld': InceptionEnc.get_evc_distance(first_row['embeddings'], row['embeddings']),
            'audio': AudioEnc.get_distance(first_row['AudioEnc_embeddings'], row['AudioEnc_embeddings']),
            'UNet': UnetEnc.get_distance(first_row['UnetEnc_embeddings'], row['UnetEnc_embeddings']),
            'length_ratio': LengthEnc.get_distance(first_row['length'], row['length'])
        }
        new_rows.append(new_row)

    # Создаем новый DataFrame из списка новых строк
    new_df = pd.DataFrame(new_rows)
    preds = lr.predict(new_df.iloc[:, 2:])
    preds_proba = lr.predict_proba(new_df.iloc[:, 2:])[:, 1]
    new_df['is_dub'] = preds
    new_df['is_dub_proba'] = preds_proba
    # print(new_df)
    
    dubs = new_df[new_df['is_dub'] == 1].sort_values('is_dub_proba', ascending=False)
    
    for i, r in dubs.iterrows():
        if r['created'] > row1['created']:
            test.loc[test['uuid'] == r['uuid'], 'is_duplicate'] = True
            test.loc[test['uuid'] == r['uuid'], 'duplicate_for'] = row1['uuid']
                    
    if index % 50 == 0:
        print(index)

50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950


In [105]:
test.head()

Unnamed: 0,created,uuid,link,video_path,embeddings,is_duplicate,duplicate_for,UnetEnc_embeddings,AudioEnc_embeddings,length
0,2024-06-01 01:37:57,49577a11-51b9-490a-b1f0-df17335219de,https://s3.ritm.media/yappy-db-duplicates/4957...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.0065859933, 0.0070943553, 0.0049725627, 0.0...",False,,"[[[tensor(0.), tensor(0.), tensor(0.), tensor(...","[0.00041812917, 0.000780223, 0.0034573867, 0.0...",16.0
1,2024-06-01 04:42:10,4e1f7fad-5008-4216-9849-550a00f1e35f,https://s3.ritm.media/yappy-db-duplicates/4e1f...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.013478812, 0.01082532, 0.027198287, 0.00505...",False,,"[[[tensor(0.), tensor(0.), tensor(0.2252), ten...","[9.353134, 19.390526, 7.0610957, 5.4561687, 4....",56.25
2,2024-06-01 08:44:48,337fdbe6-2bc7-4bc7-931e-d94ada927ede,https://s3.ritm.media/yappy-db-duplicates/337f...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.003774149, 0.004182019, 0.0034305027, 0.014...",False,,"[[[tensor(0.), tensor(0.), tensor(0.), tensor(...","[6.0818267, 48.76453, 40.668804, 23.965372, 37...",18.966667
3,2024-06-01 10:11:48,35138a88-0249-405e-91b4-8a36b1e2e730,https://s3.ritm.media/yappy-db-duplicates/3513...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.0074166446, 0.010031568, 0.02124418, 0.0043...",False,,"[[[tensor(0.), tensor(0.), tensor(0.2406), ten...","[110.40445, 477.3407, 260.7453, 82.90226, 48.8...",7.833333
4,2024-06-01 12:23:29,322f4312-3d46-401b-8cd9-80a0d06347ed,https://s3.ritm.media/yappy-db-duplicates/322f...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.0141136665, 0.03179764, 0.034023765, 0.0206...",False,,"[[[tensor(0.), tensor(0.0939), tensor(0.0783),...","[90.45213, 65.43523, 21.771788, 19.703815, 23....",23.666667


In [106]:
test[test['is_duplicate']]

Unnamed: 0,created,uuid,link,video_path,embeddings,is_duplicate,duplicate_for,UnetEnc_embeddings,AudioEnc_embeddings,length
42,2024-06-05 01:53:00,fd52cbe5-012f-4451-abad-1c86c8279e8c,https://s3.ritm.media/yappy-db-duplicates/fd52...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.0040277336, 0.013641495, 0.007854927, 0.025...",True,4b005e08-6890-4c4b-b0cf-95dc3e1aabeb,"[[[tensor(0.), tensor(0.0393), tensor(0.2136),...","[0.020753913, 0.031563073, 0.039239094, 0.8021...",25.000000
50,2024-06-05 19:13:01,6f6f5da8-f997-491d-8fe0-64d262b2ee4c,https://s3.ritm.media/yappy-db-duplicates/6f6f...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.028689208, 0.0125948135, 0.005007297, 0.003...",True,22ee80a3-d9ef-48d4-83d7-9c97cc7030c2,"[[[tensor(0.), tensor(0.), tensor(1.4099), ten...","[11.986315, 42.080883, 21.232847, 14.662971, 2...",9.500000
60,2024-06-06 10:04:01,cbf3948e-e8a0-4e61-858d-72b7ff2d2d43,https://s3.ritm.media/yappy-db-duplicates/cbf3...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.0054139197, 0.009174943, 0.015418697, 0.002...",True,045265e5-0d4c-4372-b960-3087f685eb97,"[[[tensor(0.), tensor(0.), tensor(0.), tensor(...","[0.48990253, 5.3505015, 36.0851, 57.74213, 24....",13.833333
63,2024-06-06 13:46:37,89fca4ee-4678-482b-8d6d-907dbc057151,https://s3.ritm.media/yappy-db-duplicates/89fc...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.00235023, 0.0231305, 0.005213861, 0.0035420...",True,2cfd6af3-7df6-4afa-8c3b-c17236c83c03,"[[[tensor(0.), tensor(0.), tensor(0.1531), ten...","[419.2322, 527.33276, 226.11122, 109.44811, 11...",12.300000
79,2024-06-07 22:14:03,51b077bc-fac3-4cdb-bda4-06cd60d53af7,https://s3.ritm.media/yappy-db-duplicates/51b0...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.0065238466, 0.002718225, 0.0021864201, 0.01...",True,0d62849b-d3b9-47af-a285-2f638bc9ac13,"[[[tensor(0.6974), tensor(1.7682), tensor(1.68...","[0.66644806, 1.9188691, 0.5836652, 0.13023834,...",50.433333
...,...,...,...,...,...,...,...,...,...,...
993,2024-09-07 06:04:07,7e67cde3-d548-4e7b-b3e1-90c83a4632ed,https://s3.ritm.media/yappy-db-duplicates/7e67...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.008075179, 0.016512446, 0.035428133, 0.0115...",True,4b06ed60-3d36-4537-b351-be9dd28755b7,"[[[tensor(1.1528), tensor(1.6799), tensor(1.23...","[21.94578, 72.97068, 41.614143, 19.956268, 14....",5.160000
995,2024-09-07 17:10:54,933149f9-e660-4377-95c8-f8dd329db24e,https://s3.ritm.media/yappy-db-duplicates/9331...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.015765151, 0.007593849, 0.03509754, 0.01167...",True,06e0d485-d4e1-4ba7-8eb2-e2f1fcfca416,"[[[tensor(0.), tensor(0.), tensor(0.), tensor(...","[157.58481, 179.05833, 24.721344, 20.241692, 1...",7.533333
996,2024-09-08 05:18:24,9f707190-3b32-48bf-a5f4-ceec6eedb847,https://s3.ritm.media/yappy-db-duplicates/9f70...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.042089663, 0.0043040505, 0.023805328, 0.012...",True,56cfa0d5-a050-4443-9a80-1403bcad6839,"[[[tensor(0.0161), tensor(0.), tensor(0.), ten...","[42.9752, 88.70443, 26.27396, 23.015589, 28.37...",27.727728
998,2024-09-11 04:42:01,ef7e175e-2391-45a0-b69f-33837668bb79,https://s3.ritm.media/yappy-db-duplicates/ef7e...,C:\Users\Vladimir\PycharmProjects\ML\Kaggle\ya...,"[0.006182716, 0.019054323, 0.032745354, 0.0131...",True,0574335c-9884-40de-a514-97b3fd3d72df,"[[[tensor(0.), tensor(0.), tensor(0.), tensor(...","[113.20377, 311.62283, 177.75804, 70.84638, 29...",19.500000


In [107]:
sub = test[['created', 'uuid', 'link', 'is_duplicate', 'duplicate_for']]
sub.head()

Unnamed: 0,created,uuid,link,is_duplicate,duplicate_for
0,2024-06-01 01:37:57,49577a11-51b9-490a-b1f0-df17335219de,https://s3.ritm.media/yappy-db-duplicates/4957...,False,
1,2024-06-01 04:42:10,4e1f7fad-5008-4216-9849-550a00f1e35f,https://s3.ritm.media/yappy-db-duplicates/4e1f...,False,
2,2024-06-01 08:44:48,337fdbe6-2bc7-4bc7-931e-d94ada927ede,https://s3.ritm.media/yappy-db-duplicates/337f...,False,
3,2024-06-01 10:11:48,35138a88-0249-405e-91b4-8a36b1e2e730,https://s3.ritm.media/yappy-db-duplicates/3513...,False,
4,2024-06-01 12:23:29,322f4312-3d46-401b-8cd9-80a0d06347ed,https://s3.ritm.media/yappy-db-duplicates/322f...,False,


In [108]:
sub.to_csv('submission.csv', index=False)