In [1]:
import numpy as np
import pandas as pd
from annoy import AnnoyIndex
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from tqdm import tqdm

RANDOM_SEED = 47

# Гиперпараметры
n_trees_list = [10, 15, 20, 25, 30, 40, 45, 50]
top_k_list = [3, 4, 5, 6, 7, 8, 9, 10]
metric = 'angular'

train = pd.read_csv('../train_final_processed.csv')
X = train.drop(columns=["is_canceled"])
y = train["is_canceled"]

# Разделение на train/valid
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

# Преобразования
def transform_data(X_train, X_valid, method):
    if method == "raw":
        return X_train, X_valid
    elif method == "scaled":
        scaler = StandardScaler()
        return scaler.fit_transform(X_train), scaler.transform(X_valid)
    elif method == "pca":
        pca = PCA(n_components=X_train.shape[1])
        return pca.fit_transform(X_train), pca.transform(X_valid)
    elif method == "scaled+pca":
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_valid_scaled = scaler.transform(X_valid)
        pca = PCA(n_components=X_train.shape[1])
        return pca.fit_transform(X_train_scaled), pca.transform(X_valid_scaled)

# Annoy-модель
def train_annoy_model(X_train_ann, n_trees=10, metric='angular'):
    X_np = X_train_ann.to_numpy() if isinstance(X_train_ann, pd.DataFrame) else X_train_ann
    dim = X_np.shape[1]
    annoy_index = AnnoyIndex(dim, metric)
    for i in range(len(X_np)):
        annoy_index.add_item(i, X_np[i])
    annoy_index.build(n_trees)
    return annoy_index

def predict_annoy(index, X_valid, y_train, top_k=5):
    X_np = X_valid.to_numpy() if isinstance(X_valid, pd.DataFrame) else X_valid
    preds = []
    for v in tqdm(X_np, disable=True):
        idxs = index.get_nns_by_vector(v, top_k)
        votes = [y_train.iloc[i] for i in idxs]
        pred = max(set(votes), key=votes.count)
        preds.append(pred)
    return np.array(preds)

# Основной цикл
for method in ['raw', 'scaled', 'pca', 'scaled+pca']:
    print(f"\nМетод: {method}")
    X_tr, X_val = transform_data(X_train, X_valid, method)

    best_acc = 0
    best_params = (None, None)

    for n_trees in n_trees_list:
        for top_k in top_k_list:
            # Обучение
            annoy = train_annoy_model(X_tr, n_trees=n_trees, metric=metric)

            # Предсказание
            y_pred = predict_annoy(annoy, X_val, y_train, top_k=top_k)

            # Оценка
            acc = accuracy_score(y_valid, y_pred)
            print(f"n_trees={n_trees}, top_k={top_k} => acc={acc:.4f}")

            if acc > best_acc:
                best_acc = acc
                best_params = (n_trees, top_k)

    print(f"Лучшие параметры для {method}: n_trees={best_params[0]}, top_k={best_params[1]} => acc={best_acc:.4f}")



Метод: raw
n_trees=10, top_k=3 => acc=0.7961
n_trees=10, top_k=4 => acc=0.8097
n_trees=10, top_k=5 => acc=0.7983
n_trees=10, top_k=6 => acc=0.8053
n_trees=10, top_k=7 => acc=0.7958
n_trees=10, top_k=8 => acc=0.8005
n_trees=10, top_k=9 => acc=0.7957
n_trees=10, top_k=10 => acc=0.8002
n_trees=15, top_k=3 => acc=0.7953
n_trees=15, top_k=4 => acc=0.8106
n_trees=15, top_k=5 => acc=0.7986
n_trees=15, top_k=6 => acc=0.8053
n_trees=15, top_k=7 => acc=0.7949
n_trees=15, top_k=8 => acc=0.8002
n_trees=15, top_k=9 => acc=0.7959
n_trees=15, top_k=10 => acc=0.8012
n_trees=20, top_k=3 => acc=0.7949
n_trees=20, top_k=4 => acc=0.8101
n_trees=20, top_k=5 => acc=0.7978
n_trees=20, top_k=6 => acc=0.8053
n_trees=20, top_k=7 => acc=0.7950
n_trees=20, top_k=8 => acc=0.8003
n_trees=20, top_k=9 => acc=0.7960
n_trees=20, top_k=10 => acc=0.8013
n_trees=25, top_k=3 => acc=0.7950
n_trees=25, top_k=4 => acc=0.8098
n_trees=25, top_k=5 => acc=0.7985
n_trees=25, top_k=6 => acc=0.8053
n_trees=25, top_k=7 => acc=0.7939