In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Source workbook holding the text pairs to label. Later cells read the
# 'text1' and 'text2' columns (and 'id1'/'id2' for the final summary).
# NOTE(review): the path looks like a Colab Google Drive mount, but no mount
# call is visible in this notebook — confirm the drive is mounted beforehand.
file_path = '/content/drive/MyDrive/Datasets/dataset_final.xlsx'
df = pd.read_excel(file_path)

In [None]:
# Load the sentence-embedding model once at notebook level; prepare_embeddings
# reads this global `model` when it encodes the texts.
# NOTE(review): judging by its name this is a multilingual paraphrase model —
# confirm it suits the language of the dataset.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

In [None]:
def prepare_embeddings(df, encoder=None, batch_size=512):
    """Encode every distinct text from the pair columns exactly once.

    Args:
        df: DataFrame with 'text1' and 'text2' columns.
        encoder: Object exposing ``encode(texts, batch_size=...,
            show_progress_bar=...)``. Defaults to the notebook-level ``model``.
        batch_size: Batch size forwarded to ``encoder.encode``.

    Returns:
        dict mapping each distinct text to its embedding, i.e. the matching
        item of the sequence returned by ``encoder.encode``. Empty when the
        frame contains no texts.
    """
    if encoder is None:
        encoder = model  # fall back to the notebook-level model

    # Deduplicate across both columns so a text that appears in many pairs
    # is encoded only once.
    texts = pd.concat([df['text1'], df['text2']]).unique().tolist()
    if not texts:
        # Nothing to encode; skip the model call entirely.
        return {}

    embeddings = encoder.encode(texts, batch_size=batch_size, show_progress_bar=True)
    return dict(zip(texts, embeddings))

In [None]:
def calculate_labels(df, threshold=0.8, embeddings=None):
    """Score each text pair by cosine similarity and flag likely duplicates.

    Args:
        df: DataFrame with 'text1' and 'text2' columns. It is not modified.
        threshold: A pair gets label 1 when its similarity is strictly
            greater than this value, otherwise 0.
        embeddings: Optional mapping of text -> embedding vector. When
            omitted it is built with ``prepare_embeddings(df)``; pass it
            explicitly to reuse embeddings, e.g. while comparing thresholds.

    Returns:
        A new DataFrame with the columns of ``df`` plus a float 'similarity'
        column and an integer 'label' column.

    Raises:
        KeyError: if a text in ``df`` has no entry in ``embeddings``.
    """
    if len(df) == 0:
        # No pairs to score: return the frame with empty result columns.
        similarities = np.empty(0, dtype=float)
    else:
        if embeddings is None:
            embeddings = prepare_embeddings(df)

        # One matrix per column, row i holding the embedding of pair i.
        left = np.stack([embeddings[text] for text in df['text1']]).astype(float)
        right = np.stack([embeddings[text] for text in df['text2']]).astype(float)

        left_norms = np.linalg.norm(left, axis=1)
        right_norms = np.linalg.norm(right, axis=1)
        # A zero vector has no direction; dividing by 1 instead of 0 yields a
        # similarity of 0 for it rather than NaN.
        left_norms[left_norms == 0.0] = 1.0
        right_norms[right_norms == 0.0] = 1.0

        # Row-wise cosine similarity for all pairs in a single pass.
        similarities = (left * right).sum(axis=1) / (left_norms * right_norms)

    labels = (similarities > threshold).astype(int)

    # assign() builds a new frame, so the caller's DataFrame stays untouched.
    return df.assign(similarity=similarities, label=labels)

In [None]:
# Similarity cut-off: pairs scoring strictly above it are labelled as duplicates.
THRESHOLD = 0.8
# Label the frame loaded from the workbook (`df`). `final_df` is first defined
# here, so it cannot also be the input on a fresh kernel.
final_df = calculate_labels(df, threshold=THRESHOLD)

In [None]:
# Histogram of the pairwise similarities with the decision threshold drawn
# as a dashed vertical line.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(final_df['similarity'], bins=50, alpha=0.7)
ax.axvline(THRESHOLD, color='red', linestyle='--', label=f'Порог ({THRESHOLD})')
ax.set_title('Распределение косинусного сходства')
ax.set_xlabel('Сходство')
ax.set_ylabel('Количество пар')
ax.legend()
plt.show()

In [None]:
# Persist the labelled pairs as CSV; the raw similarity score is left out of
# the export, only the label is kept.
output_path = '/content/drive/MyDrive/Datasets/dataset_labeled.csv'
(
    final_df
    .drop(columns='similarity')
    .to_csv(output_path, index=False, encoding='utf-8')
)

In [None]:
# sample() raises ValueError when asked for more rows than the frame holds,
# so cap the number of example rows at the size of the frame.
n_examples = min(3, len(final_df))

# Summary: total pairs, number and share of label=1 pairs, a few example rows.
print(f'''
Результаты обработки:
- Всего пар: {len(final_df)}
- Дубликатов (label=1): {final_df["label"].sum()} ({final_df["label"].mean():.1%})
- Примеры записей:
{final_df.sample(n_examples)[['id1', 'id2', 'text1', 'label']]}
''')