# Set Up

## Utilities

In [1]:
!pip install numpy==1.26.4
!mkdir -p Assets/models



In [2]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import accuracy_score, f1_score


def roc_auc_multiclass(labels, probabilities):
    """
    Compute the multi-class ROC AUC using the One-Versus-Rest approach.

    Parameters:
    - labels: (955,) array with true class indices (values from 0 to 108)
    - probabilities: (955, 109) array with predicted probabilities for each class

    Returns:
    - Macro-averaged ROC AUC score
    """
    num_classes = probabilities.shape[1]  # Should be 109 classes
    labels_one_hot = np.eye(num_classes)[labels]  # Convert labels to one-hot encoding (955, 109)

    aucs = []  # List to store AUC for each class

    for i in range(num_classes):
        # True labels for class i (binary: 1 if true class, 0 otherwise)
        y_true = labels_one_hot[:, i]  # Shape: (955,)

        # Predicted probabilities for class i
        y_score = probabilities[:, i]  # Shape: (955,)

        # **Check if class i is missing in the batch**
        num_positives = np.sum(y_true)
        num_negatives = len(y_true) - num_positives

        if num_positives == 0 or num_negatives == 0:
            aucs.append(0.5)  # If only one class is present, set AUC to 0.5 (random chance)
            continue  # Skip further computation

        # Sort by predicted score (descending order)
        sorted_indices = np.argsort(-y_score)
        y_true_sorted = y_true[sorted_indices]

        # Compute TPR and FPR
        cum_positive = np.cumsum(y_true_sorted)
        cum_negative = np.cumsum(1 - y_true_sorted)

        TPR = cum_positive / num_positives  # True positive rate
        FPR = cum_negative / num_negatives  # False positive rate

        auc = np.trapz(TPR, FPR)  # Compute AUC using trapezoidal rule
        aucs.append(auc)

    return np.mean(aucs)  # Macro-averaged AUC over all classes

## Load Data

In [30]:
ALL_DATA = {
    "FastText": {},
    "Naive Bayes": {},
    "Decision Tree": {},
    "Support Vector": {}
    }

data = pd.read_csv('Database/clean_data.csv', sep=';', encoding='utf-8')

with open("Assets/converter.json", "r", encoding="utf-8") as f:
    converter = json.load(f)

label2id = converter["label2id_reduced"]
id2label = converter["id2label_reduced"]

data['label'] = data["detailed_topic"].map(label2id)

num_labels = len(label2id)

data.head()

Unnamed: 0,detailed_topic,appeal,label
0,"содержание л/клеток, дворовых территорий",ЗАЯВЛЕНИЕ нарушение периодичности проведения ...,1
1,ПРАВИЛА ПОЛЬЗОВАНИЯ ЖИЛЫМИ ПОМЕЩЕНИЯМИ (ПЕРЕПЛ...,"\nДобрый день,Прошу рассмотреть по существу жа...",11
2,фасады,"\nИнформирую вас, что с 08.04.2024 в МЖД по ад...",12
3,"содержание л/клеток, дворовых территорий","\nКоллективная жалоба на факт нарушения ТСЖ""Ко...",1
4,подвалы,Прошу рассмотреть прилагающееся заявление о в...,14


# FastText

In [4]:
! pip install fasttext



## Prepare Data

In [5]:
from sklearn.model_selection import train_test_split


# topic_id -> sequential label index
data["fasttext_label"] = "__label__" + data["label"].astype(str) + " " + data["appeal"]

train_texts_fasttext, val_texts_fasttext = train_test_split(data["fasttext_label"], test_size=0.2, random_state=42)

# Save to .txt files (FastText needs these)
train_texts_fasttext.to_csv("temp/train_fasttext.txt", index=False, header=False)
val_texts_fasttext.to_csv("temp/valid_fasttext.txt", index=False, header=False)

## Run Training

In [6]:
import fasttext

# Train FastText model
model = fasttext.train_supervised(input="temp/train_fasttext.txt", epoch=100, lr=0.5, wordNgrams=2, verbose=2)

# Save model
model.save_model("Assets/models/fasttext_model.bin")

## Load and Evaluate

In [7]:
loaded_model = fasttext.load_model("Assets/models/fasttext_model.bin")
val_texts_fasttext.head()

Unnamed: 0,fasttext_label
794,__label__0 25.09.2024 в 07:24 я направил в уп...
199,"__label__2 Здравствуйте, прошу Вас провести п..."
1849,__label__20 По адресу ул. Орбели д. 17 лит. А....
33,__label__6 Прошу принять меры в отношении орг...
179,__label__6 Я проживаю и являюсь собственником...


In [8]:
def predict_topic_fasttext(appeal_text):
    label, confidence = model.predict(appeal_text)  # Predict topic label

    topic_label = id2label.get(int(label[0].replace("__label__", "")), "Неизвестная тема")  # Remove "__label__" prefix
    return topic_label, confidence[0]  # Return topic_id, label and confidence score

In [9]:
def get_fasttext_predictions(model, test_data, num_classes):
    """
    Get full probability distributions for all classes from FastText model.

    Parameters:
    - model: Trained FastText model
    - test_data: List of (text, true_label) tuples
    - num_classes: Total number of classes

    Returns:
    - predictions: List of tuples [(full_probs), true_label]
    """
    predictions = []

    for text, true_label in test_data:
        # Get top-k predictions (FastText does not return full distribution)
        predicted_labels, probs = model.predict(text, k=num_classes)

        # Initialize full probability vector with zeros
        full_probs = np.zeros(num_classes)

        # Fill in the probabilities for predicted classes
        for lbl, prob in zip(predicted_labels, probs):
            class_idx = int(lbl.replace("__label__", ""))  # Extract class index
            full_probs[class_idx] = prob  # Assign probability to correct index

        predictions.append((full_probs, int(true_label)))  # Store full vector + true label

    return predictions

def parse_fasttext_output(predictions):
    """
    Convert FastText probability output into log probabilities & labels.

    Parameters:
    - predictions: List of tuples [(full_probs), true_label] from FastText

    Returns:
    - logits: Numpy array of shape (N, num_classes) with log-probabilities
    - labels: Numpy array of shape (N,) with true class indices
    """
    labels = []
    logits = []

    for full_probs, true_label in predictions:
        labels.append(true_label)

        # Convert probabilities to logits (avoid log(0) using a small value)
        log_probs = np.log(np.clip(full_probs, 1e-10, 1.0))
        logits.append(log_probs)

    return np.array(logits), np.array(labels)

def compute_metrics_top_k_fasttext(predictions, num_classes, k=3):
    """
    Compute accuracy, top-K accuracy, and ROC AUC for FastText model output.

    Parameters:
    - predictions: List of tuples [(full_probs, true_label)] from FastText
    - num_classes: Total number of classes
    - k: Number of top predictions to consider for accuracy

    Returns:
    - Dictionary with accuracy, top-K accuracy, and ROC AUC
    """
    logits, labels = parse_fasttext_output(predictions)

    # Convert logits back to probabilities
    probabilities = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)

    # Standard Accuracy
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)

    # Top-K Accuracy
    top_k_predictions = np.argsort(-probabilities, axis=-1)[:, :k]
    top_k_correct = np.any(top_k_predictions == labels[:, None], axis=-1)
    top_k_acc = np.mean(top_k_correct)

    # Top-5 Accuracy
    top_5_predictions = np.argsort(-probabilities, axis=-1)[:, :5]
    top_5_correct = np.any(top_5_predictions == labels[:, None], axis=-1)
    top_5_acc = np.mean(top_5_correct)

    # ROC AUC
    auc = roc_auc_multiclass(labels, probabilities)

    f1 = f1_score(labels, predictions, average="weighted")  # Standard F1-score

    return {"eval_accuracy": acc, f"eval_top_{k}_accuracy": top_k_acc, f"eval_top_5_accuracy": top_5_acc, "eval_f1": f1, "eval_roc_auc": auc}

In [10]:
test_data = val_texts_fasttext.tolist()
for i in range(len(test_data)):
    text = test_data[i]
    label = int(text.split()[0][9:])
    appeal = " ".join(text.split()[1:])
    test_data[i] = (appeal, label)

num_classes = num_labels

predictions = get_fasttext_predictions(loaded_model, test_data, num_classes=num_classes)

# Compute metrics
metrics = compute_metrics_top_k_fasttext(predictions, num_labels, k=3)
fasttext_stat = pd.DataFrame.from_dict(metrics, orient='index')
fasttext_stat

Unnamed: 0,0
eval_accuracy,0.149738
eval_top_3_accuracy,0.26178
eval_top_5_accuracy,0.343455
eval_f1,0.102984
eval_roc_auc,0.626788


In [31]:
ALL_DATA["FastText"]["basic_stats"] = metrics
ALL_DATA

{'FastText': {'basic_stats': {'eval_accuracy': 0.14973821989528796,
   'eval_top_3_accuracy': 0.2617801047120419,
   'eval_top_5_accuracy': 0.34345549738219894,
   'eval_f1': 0.10298423933533439,
   'eval_roc_auc': 0.6267878645721024}},
 'Naive Bayes': {},
 'Decision Tree': {},
 'Support Vector': {}}

# Basic ML Models

## Prepare Data

In [14]:
! pip install scikit-multilearn

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl.metadata (6.0 kB)
Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m81.9/89.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


train_texts, val_texts, train_labels, val_labels = train_test_split(
    data["appeal"].tolist(), data["label"].tolist(), test_size=0.2, random_state=42
)

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(val_texts)

svd = TruncatedSVD(n_components=1000)  # Reduce to less-dimensional space
X_train_reduced = svd.fit_transform(X_train)
X_test_reduced = svd.transform(X_test)

X_train.shape, X_test.shape, X_train_reduced.shape, X_test_reduced.shape

((3817, 5000), (955, 5000), (3817, 1000), (955, 1000))

In [16]:
from sklearn.preprocessing import MultiLabelBinarizer

train_labels_mlb = [[i] for i in train_labels]

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_labels_mlb)
#y_test = mlb.transform(val_labels)
y_test = np.array(val_labels)

## Import Models

In [18]:
import joblib
import os

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

predictions = {}

## Train and Save Models

*Using Binary Relevance with Naïve Bayes, Decision Tree and Support Vector Machine*

In [22]:
import joblib
import os

os.makedirs("Assets/models", exist_ok=True)

from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Store model configs
model_configs = {
    "Naive Bayes": BinaryRelevance(MultinomialNB()),
    "Decision Tree": BinaryRelevance(DecisionTreeClassifier()),
    "Support Vector": BinaryRelevance(SVC(probability=True))
}

# Fit and save models
for name, clf in model_configs.items():
    if name == "Support Vector":
        clf.fit(X_train_reduced, y_train)
        joblib.dump(clf, f"Assets/models/{name.replace(' ', '_')}.joblib")
        predictions[name] = clf.predict_proba(X_test_reduced).toarray()
    else:
        clf.fit(X_train, y_train)
        joblib.dump(clf, f"Assets/models/{name.replace(' ', '_')}.joblib")
        predictions[name] = clf.predict_proba(X_test).toarray()

# Evaluate

In [23]:
def compute_metrics_top_k(eval_pred, k=3, KNN=False):
    """
    Compute accuracy, top-K accuracy, and ROC AUC for multi-class classification.

    Parameters:
    - eval_pred: Tuple (logits, labels), where:
      - logits: (N, num_classes) array of raw model outputs (before softmax)
      - labels: (N,) array of true class indices
    - k: Number of top predictions to consider for accuracy

    Returns:
    - Dictionary with accuracy, top-K accuracy, and ROC AUC
    """
    logits, labels = eval_pred  # Unpack logits (raw scores) and true labels
    probabilities = np.exp(logits) / np.exp(logits).sum(axis=-1, keepdims=True)  # Softmax

    # Top-1 (standard accuracy)
    predictions = np.argmax(logits, axis=-1)  # Get class with highest probability
    acc = accuracy_score(labels, predictions)  # Standard accuracy

    # Top-K Accuracy Calculation
    top_k_predictions = np.argsort(-probabilities, axis=-1)[:, :k]  # Get top-K predicted classes
    top_k_correct = np.any(top_k_predictions == labels[:, None], axis=-1)  # Check if true label is in top-K
    top_k_acc = np.mean(top_k_correct)  # Compute top-K accuracy

     # Top-5 Accuracy (Calculate-Ability of Classes)
    top_5_predictions = np.argsort(-probabilities, axis=-1)[:, :5]
    top_5_correct = np.any(top_5_predictions == labels[:, None], axis=-1)
    top_5_acc = np.mean(top_5_correct)

    # ROC AUC Calculation
    if not KNN:
        auc = roc_auc_multiclass(labels, probabilities)  # Compute multi-class AUC
    else:
        auc = None

    # F1-score Calculation
    f1 = f1_score(labels, predictions, average="weighted")  # Standard F1-score

    return {
        "eval_accuracy": acc,
        f"eval_top_{k}_accuracy": top_k_acc,
        "eval_top_5_accuracy": top_5_acc,
        "eval_roc_auc": auc,
        "eval_f1": f1,
    }

In [32]:
for model_name, preds in predictions.items():
    ALL_DATA[model_name]["basic_stats"] = compute_metrics_top_k((preds, y_test))

In [33]:
ALL_DATA

{'FastText': {'basic_stats': {'eval_accuracy': 0.14973821989528796,
   'eval_top_3_accuracy': 0.2617801047120419,
   'eval_top_5_accuracy': 0.34345549738219894,
   'eval_f1': 0.10298423933533439,
   'eval_roc_auc': 0.6267878645721024}},
 'Naive Bayes': {'basic_stats': {'eval_accuracy': 0.4701570680628272,
   'eval_top_3_accuracy': 0.7141361256544503,
   'eval_top_5_accuracy': 0.7539267015706806,
   'eval_roc_auc': 0.7691105568688378,
   'eval_f1': 0.3633825670909665}},
 'Decision Tree': {'basic_stats': {'eval_accuracy': 0.4649214659685864,
   'eval_top_3_accuracy': 0.5424083769633508,
   'eval_top_5_accuracy': 0.5497382198952879,
   'eval_roc_auc': 0.7087739419904361,
   'eval_f1': 0.43417829769544297}},
 'Support Vector': {'basic_stats': {'eval_accuracy': 0.6513089005235602,
   'eval_top_3_accuracy': 0.8282722513089005,
   'eval_top_5_accuracy': 0.881675392670157,
   'eval_roc_auc': 0.9063684203991289,
   'eval_f1': 0.6362458090842334}}}

# Download Outputs

In [35]:
import pickle

with open("Assets/classic_ml_outputs.pkl", "wb") as f:
    pickle.dump(ALL_DATA, f)