# Формирование датасета пар расхождений

In [2]:
import pandas as pd
import requests
from tqdm import tqdm
import numpy as np
import uuid
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw tickets and keep only the id, the message text and the label.
# 'Id' is moved into the index right away, so only 'Body' and 'Target' are
# still columns when the rename runs.
# NOTE(review): because 'Id' lives in the index, rows produced by iterrows()
# have no "id" field, so _prepare_item falls back to a random UUID -- confirm
# this is intended.
column_names = {'Body': 'description', 'Target': 'class'}
data = (
    pd.read_csv('data/india_clf_6.csv')[['Id', 'Body', 'Target']]
    .set_index('Id')
    .rename(columns=column_names)
)
# There is no subject in the source file; use a constant placeholder.
data['subject'] = 'no subject'
data

Unnamed: 0_level_0,description,class,subject
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
67850,Gmail otp not recieved,Application Related | Email OTP not received,no subject
67854,Otp not received in my gmail,Application Related | Email OTP not received,no subject
68090,Otp not send,Application Related | Email OTP not received,no subject
68093,Otp not recieved my gmail,Application Related | Email OTP not received,no subject
68098,Why not receiveing otp from gmai,Application Related | Email OTP not received,no subject
...,...,...,...
75138,Withdraw,EMI Payment Related | Withdrawal issue,no subject
79158,Withdrawl amount is reflecting 0,EMI Payment Related | General query,no subject
79339,Y error how to apply,Others | General query,no subject
72395,Y it has been rejected,Rejected | Policy Reject,no subject


Это датасет, очищенный от Incomplete query

### Предобработка

In [4]:
# Drop rows with a missing target. The explicit copy makes `data` an
# independent frame, so the in-place relabelling below does not raise
# pandas' SettingWithCopyWarning.
data = data[~data['class'].isna()].copy()

# Keep only frequent classes: every class with fewer than 30 samples is
# merged into the catch-all "Others" label (classes with exactly 30 stay).
vc = data["class"].value_counts()
rare_classes = vc[vc < 30].index
data.loc[data["class"].isin(rare_classes), "class"] = "Others"

# Stratified 90/10 split with a fixed seed so the split is reproducible.
train_df, test_df = train_test_split(
    data, test_size=0.1, random_state=42, stratify=data["class"]
)

Документация к API: http://83.143.66.63:27364/docs

In [5]:
from sklearn.model_selection import train_test_split
import json
import os
from datetime import datetime

# ---------------------- Helpers ----------------------
def _to_str_id(val) -> str:
    """Normalize an arbitrary id value to a string.

    Missing values (None, NaN/NA, empty or whitespace-only strings) are
    replaced by a freshly generated UUID4 string. Integers and integer-valued
    floats are rendered without a fractional part. Any exception raised while
    converting also results in a fresh UUID4 string.
    """
    try:
        missing = val is None or (isinstance(val, str) and not val.strip()) or pd.isna(val)
        if missing:
            return str(uuid.uuid4())

        if isinstance(val, (int, np.integer)):
            return str(int(val))

        if isinstance(val, (float, np.floating)):
            whole = np.isfinite(val) and float(val).is_integer()
            # 78617.0 -> "78617"; other floats use up to 15 significant digits.
            return str(int(val)) if whole else f"{val:.15g}"

        text = str(val).strip()
        return text if text else str(uuid.uuid4())
    except Exception:
        # Best effort: an unconvertible value still gets a usable id.
        return str(uuid.uuid4())

def _safe_str(v: object, default: str) -> str:
    """Return ``str(v)`` stripped of surrounding whitespace.

    ``default`` is returned instead when ``v`` is None, when the stripped
    text is empty, or when converting ``v`` to a string raises.
    """
    if v is None:
        return default
    try:
        text = str(v).strip()
    except Exception:
        return default
    return text or default

def _truncate(s: str, max_len: int) -> str:
    """Cut ``s`` down to its first ``max_len`` characters.

    Values that already fit are returned unchanged; None is passed through.
    """
    if s is None:
        return None
    if len(s) > max_len:
        return s[:max_len]
    return s

def _prepare_item(row) -> dict:
    """Build one item for the ``/upload`` payload from a DataFrame row.

    Missing or NaN fields are replaced by placeholders ("no_subject",
    "no_description", "Others"); a missing id becomes a generated UUID.
    The subject is capped at 500 characters and the description at 5000.
    A "task" entry is added only when the row has a non-NaN "task" value.
    """

    def _field(key: str, fallback: str) -> str:
        # Use the row value only when the key exists and the value is not NaN.
        usable = key in row and not pd.isna(row[key])
        return _safe_str(row[key] if usable else fallback, fallback)

    raw_id = row["id"] if "id" in row else None
    item = {
        "id": _to_str_id(raw_id),
        "subject": _field("subject", "no_subject"),
        "description": _field("description", "no_description"),
        "class_name": _field("class", "Others"),
    }

    # Limit the length of the free-text fields.
    item["subject"] = _truncate(item["subject"], 500)
    item["description"] = _truncate(item["description"], 5000)

    if "task" in row and not pd.isna(row["task"]):
        item["task"] = _safe_str(row["task"], "")
    return item

def _post_upload_items(api_url: str, headers: dict, items: list):
    """Send one batch of items to ``{api_url}/upload``.

    Returns a tuple ``(ok, uploaded_ids, error_text)``:

    * a non-200 status, a response that is not a dict with a truthy
      "success" value, or any raised exception gives ``(False, [], <text>)``;
    * on success the ids come from the response's "ids" list, or from the
      local item ids when the server sent no non-empty list.
    """
    try:
        resp = requests.post(f"{api_url}/upload", json={"items": items}, headers=headers, timeout=60)
        if resp.status_code != 200:
            return False, [], f"{resp.status_code} {resp.text}"

        data = resp.json()
        accepted = isinstance(data, dict) and data.get("success")
        if not accepted:
            return False, [], f"Unexpected response: {data}"

        server_ids = data.get("ids")
        if not (isinstance(server_ids, list) and server_ids):
            # The server did not report ids: fall back to the ids we sent.
            server_ids = [entry["id"] for entry in items]
        return True, server_ids, ""
    except Exception as e:
        return False, [], str(e)

def _upload_with_fallback(api_url: str, headers: dict, items: list, bad_ids: list) -> list:
    """Upload ``items``, bisecting the batch whenever a request fails.

    The whole batch is tried first; on failure it is split in halves that are
    retried recursively, down to single items.

    Args:
        api_url: Base URL of the API.
        headers: HTTP headers passed on to ``_post_upload_items``.
        items: Items to upload; each must have an "id" key.
        bad_ids: Output list; the id of every single item that still fails
            on its own is appended to it.

    Returns:
        The ids that were uploaded successfully.
    """
    # Nothing to send. Without this guard an empty batch whose request fails
    # would split into two empty halves and recurse without end.
    if not items:
        return []

    ok, ids, _err = _post_upload_items(api_url, headers, items)
    if ok:
        return ids

    if len(items) == 1:
        # A single item that cannot be uploaded is recorded and skipped.
        bad_ids.append(items[0]["id"])
        return []

    mid = len(items) // 2
    uploaded_ids = []
    uploaded_ids.extend(_upload_with_fallback(api_url, headers, items[:mid], bad_ids))
    uploaded_ids.extend(_upload_with_fallback(api_url, headers, items[mid:], bad_ids))
    return uploaded_ids

# ---------------------- API calls ----------------------
def get_token(api_url, username, password):
    """Request an access token from ``{api_url}/token``.

    Args:
        api_url: Base URL of the API.
        username: Login sent in the form body.
        password: Password sent in the form body.

    Returns:
        The "access_token" value from the JSON response, or None when the
        request fails, the status is not 200, or the response body does not
        contain a token. A message is printed in every failure case.
    """
    try:
        response = requests.post(
            f"{api_url}/token",
            data={
                "username": username,
                "password": password,
                "scope": "predict upload search",
            },
            headers={"Content-Type": "application/x-www-form-urlencoded"},
            # Same timeout as every other request in this file; without it
            # an unresponsive server would block forever.
            timeout=60,
        )
    except requests.exceptions.ConnectionError:
        print(f"Не удалось подключиться к API по адресу {api_url}")
        return None
    except requests.exceptions.RequestException as e:
        # Timeouts and other transport errors are reported like the rest.
        print(f"Ошибка при запросе токена: {e}")
        return None

    if response.status_code != 200:
        print(f"Ошибка аутентификации: {response.text}")
        return None

    try:
        return response.json()["access_token"]
    except (ValueError, KeyError, TypeError):
        # Body is not JSON, or is JSON without an "access_token" entry.
        print(f"API вернул неожиданный ответ при получении токена: {response.text}")
        return None

def classify_request(subject, description, token, api_url):
    """Classify a single request through ``{api_url}/predict``.

    Falsy subject/description values are replaced by the "no_subject" /
    "no_description" placeholders and a random UUID is used as the request
    id. Returns the decoded JSON response, or None (after printing the
    error) when the request or the decoding fails.
    """
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

    payload = {"id": str(uuid.uuid4())}
    payload["subject"] = _safe_str(subject or "no_subject", "no_subject")
    payload["description"] = _safe_str(description or "no_description", "no_description")

    try:
        response = requests.post(f"{api_url}/predict", json=payload, headers=headers, timeout=60)
        response.raise_for_status()
    except Exception as e:
        print(f"Ошибка при классификации: {str(e)}")
        return None

    try:
        return response.json()
    except Exception as e:
        print(f"Ошибка при классификации: {str(e)}")
        return None

def search_similar(subject, description, token, api_url, limit=10):
    """Query ``{api_url}/search`` for documents similar to the given text.

    Falsy subject/description values are replaced by placeholders, the
    request id is a random UUID and ``limit`` is sent as an int. Returns the
    decoded JSON response, or None after printing the error (and the server
    reply, when the exception carries one) on any failure.
    """
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

    payload = {"id": str(uuid.uuid4())}
    payload["subject"] = _safe_str(subject or "no_subject", "no_subject")
    payload["description"] = _safe_str(description or "no_description", "no_description")
    payload["limit"] = int(limit)

    try:
        response = requests.post(f"{api_url}/search", json=payload, headers=headers, timeout=60)
        response.raise_for_status()
        return response.json()
    except Exception as e:
        print(f"Ошибка при поиске: {str(e)}")
        # HTTP errors raised by raise_for_status carry the server response.
        if getattr(e, "response", None) is not None:
            print(f"Ответ сервера: {e.response.text}")
        return None

def clear_index(token, api_url):
    """Ask the API to clear its index via ``{api_url}/clear_index``.

    Returns True only when the server answers 200 with a truthy "success"
    field; every other outcome prints a message and returns False.
    """
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}
    try:
        response = requests.post(f"{api_url}/clear_index", headers=headers, timeout=60)
        if response.status_code != 200:
            print(f"Ошибка при очистке индекса: {response.text}")
            return False

        result = response.json()
        if not result.get("success"):
            print(f"API вернул неожиданный ответ при очистке индекса: {result}")
            return False

        print("Индекс успешно очищен")
        return True
    except Exception as e:
        print(f"Ошибка при очистке индекса: {str(e)}")
        return False

def upload_data(data, token, api_url, batch_size=50):
    """Upload every row of ``data`` to the API in batches.

    Each row is converted with ``_prepare_item``; a failing batch is retried
    in progressively smaller pieces by ``_upload_with_fallback``.

    Args:
        data: DataFrame with the rows to upload.
        token: Bearer token placed in the ``Authorization`` header.
        api_url: Base URL of the API.
        batch_size: Rows per request before any fallback splitting. Defaults
            to 50, the value that used to be hard-coded.

    Returns:
        List of the ids reported as uploaded, in upload order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

    total_rows = len(data)
    # Ceiling division: a final partial batch still counts as a batch.
    total_batches = (total_rows + batch_size - 1) // batch_size

    uploaded_ids_total: list[str] = []
    bad_ids_total: list[str] = []

    for i in tqdm(range(total_batches)):
        start_idx = i * batch_size
        end_idx = min(start_idx + batch_size, total_rows)
        batch_data = data.iloc[start_idx:end_idx]

        # Build the normalized items. Ids are made unique within one batch
        # only; ids are not compared across batches.
        items = []
        seen_ids = set()
        for _, row in batch_data.iterrows():
            item = _prepare_item(row)
            if item["id"] in seen_ids:
                item["id"] = f"{item['id']}-{uuid.uuid4()}"
            seen_ids.add(item["id"])
            items.append(item)

        # Try the whole batch; on failure the helper bisects it.
        uploaded_ids = _upload_with_fallback(api_url, headers, items, bad_ids_total)
        uploaded_ids_total.extend(uploaded_ids)

        if len(uploaded_ids) != len(items):
            print(f"Батч {i + 1}/{total_batches}: загружено {len(uploaded_ids)}/{len(items)}, "
                       f"пропущено {len(items) - len(uploaded_ids)}")

    if bad_ids_total:
        print(f"Пропущено записей: {len(bad_ids_total)}. Проблемные id (первые 50): {bad_ids_total[:50]}")

    print(f"Загружено {len(uploaded_ids_total)} записей")
    return uploaded_ids_total

def predict(data, token, api_url):
    """Request a prediction for every row of ``data``.

    One POST to ``{api_url}/predict`` is made per row. The returned list has
    one entry per row: the "class_name" of the response's "predictions"
    value (either a dict, or the first dict of a non-empty list), or None
    when that value is absent or unusable, or when the request fails. A
    summary of empty and failed answers is printed when there are any.
    """
    headers = {"Authorization": f"Bearer {token}", "Content-Type": "application/json"}

    total_rows = len(data)
    top_classes = []
    empty_cnt = 0
    err_cnt = 0

    for _, row in tqdm(data.iterrows(), total=total_rows):
        try:
            has_subject = "subject" in row and not pd.isna(row["subject"])
            has_description = "description" in row and not pd.isna(row["description"])
            payload = {
                "id": str(uuid.uuid4()),
                "subject": _safe_str(row["subject"] if has_subject else "no_subject", "no_subject"),
                "description": _safe_str(row["description"] if has_description else "no_description", "no_description"),
            }

            response = requests.post(f"{api_url}/predict", json=payload, headers=headers, timeout=60)
            response.raise_for_status()
            preds = response.json().get("predictions")

            # Accept a dict or a non-empty list whose first element is a
            # dict; anything else counts as an empty answer.
            if isinstance(preds, dict):
                best = preds
            elif isinstance(preds, list) and preds and isinstance(preds[0], dict):
                best = preds[0]
            else:
                best = None
            top = None if best is None else best.get("class_name")
        except Exception:
            # Transport errors and malformed responses are counted alike.
            err_cnt += 1
            top_classes.append(None)
            continue

        top_classes.append(top)
        if top is None:
            empty_cnt += 1

    if empty_cnt or err_cnt:
        print(f"Предупреждение: пустых/ошибочных ответов {empty_cnt + err_cnt} из {total_rows} "
                   f"(empty={empty_cnt}, errors={err_cnt})")

    return top_classes


In [6]:
# Base URL of the API; each helper appends its endpoint path (/token, /upload, /search, ...) to it.
api_url = "http://83.143.66.63:27364"

### Авторизация

In [None]:
# Credentials can be overridden through environment variables so that real
# secrets do not have to be stored in the notebook; the defaults are the
# values that were previously hard-coded.
token = get_token(
    api_url,
    os.environ.get('CLF_API_USERNAME', 'admin'),
    os.environ.get('CLF_API_PASSWORD', 'secret'),
)
# get_token returns None on failure; stop here instead of sending
# "Bearer None" with every later request.
if token is None:
    raise RuntimeError("Failed to obtain an API token; see the message printed above")

'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiJhZG1pbiIsInNjb3BlcyI6WyJwcmVkaWN0IiwidXBsb2FkIiwic2VhcmNoIl19.Q1kYV-1fvqtlHSY-ii1VGZH2Xz-zsa8EMcG1iBjpths'

### Очистка БД

In [7]:
# Clear the index before uploading the training split; the trailing ';' keeps the returned bool out of the cell output.
clear_index(token, api_url);

Индекс успешно очищен


### Загрузка БД

In [8]:
# Upload the training split only; the trailing ';' keeps the returned id list out of the cell output.
upload_data(train_df, token, api_url);

100%|██████████| 98/98 [03:48<00:00,  2.33s/it]

Загружено 4897 записей





### Формирование пар расхождений

In [18]:
# For every test item, collect the retrieved neighbours whose class differs
# from the item's own class.
rows = []

for idx, item in tqdm(test_df.iterrows(), total=len(test_df)):
    similars = search_similar(item['subject'], item['description'], token, api_url)
    # search_similar returns None when the request fails; skip such items
    # (and responses without a result list) instead of crashing the loop.
    if not isinstance(similars, dict):
        continue
    for similar in similars.get('results') or []:
        if similar['class_name'] != item['class']:
            rows.append([
                idx, similar['id'], item['description'],
                similar['description'], item['class'], similar['class_name']
            ])

# One row per distinct (texts, classes) combination; ids are ignored when
# looking for duplicates.
res = pd.DataFrame(
    rows,
    columns=["item_id", "similar_id", "item_description", "similar_description", "item_class", "similar_class"]
).drop_duplicates(['item_description', 'similar_description', 'item_class', 'similar_class'])
res

100%|██████████| 545/545 [00:30<00:00, 18.11it/s]


Unnamed: 0,item_id,similar_id,item_description,similar_description,item_class,similar_class
0,68492,befe11cc-c207-5195-8bd1-e95e77fa42b5,Limit increse,Limit increse,Others | Limit increase,Others
1,67269,6a45e924-cfd7-580d-9c0b-50dd45d4b74a,Payment done last month still limit not updated,Payment done but limit not updated,EMI Payment Related | Repayment done on / afte...,EMI Payment Related | Repayment done before du...
3,67269,4e83f567-44aa-5d60-8741-36651ad00d6e,Payment done last month still limit not updated,Payment done but limit not updated,EMI Payment Related | Repayment done on / afte...,"EMI Payment Related | Repayment done, limit no..."
11,79981,3378d2f3-d14e-522c-95a0-3ae52636a4b1,how to apply loan,Error for apply loan,Others | General query,Rejected | Policy Reject
12,79981,846b5356-e670-5da0-8b28-3503b05ce9be,how to apply loan,How to apply,Others | General query,Others | Could not find customer
...,...,...,...,...,...,...
2807,72109,0a085303-a594-5308-b3dc-9eb2940cce69,Please,Can you please?,Others,Others | Limit increase
2808,72109,a030d252-75fd-57f0-abef-2364b0339ffa,Please,Please need help,Others,Others | General query
2809,72109,4478cb39-8c3f-5644-b3f6-d9ae423f88ff,Please,Please resolve,Others,Others | Repayment done before due date
2810,72109,1a19d20d-242b-5d62-92ea-2c7cdcf46dd9,Please,Please help,Others,Fund Transfer Related | Withdrawal issue


In [19]:
# Save the mismatched pairs. to_csv writes the DataFrame index by default, so the
# row labels left after drop_duplicates end up as an unnamed first column.
res.to_csv('data/similars_errors.csv')