In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from tqdm.notebook import tqdm

import faiss

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from catboost import Pool, CatBoostClassifier

In [2]:
# Directory holding base.csv, train.csv, validation.csv and validation_answer.csv.
# The value is concatenated directly with the file names, so it must end with a
# path separator.
# NOTE(review): absolute, machine-specific Windows path — the notebook only runs
# where this folder exists; consider a path relative to the notebook instead.
DATA_PATH = "C:\\Users\\Natalia\\Desktop\\Masterskaya\\"

In [3]:
def _read_indexed_csv(csv_path):
    """Load a CSV file, using its first column as the DataFrame index."""
    return pd.read_csv(csv_path, index_col=0)


# Load the four input tables from DATA_PATH.
df_base = _read_indexed_csv(f"{DATA_PATH}base.csv")
df_train = _read_indexed_csv(f"{DATA_PATH}train.csv")
df_validation = _read_indexed_csv(f"{DATA_PATH}validation.csv")
df_validation_answer = _read_indexed_csv(f"{DATA_PATH}validation_answer.csv")

In [4]:
# Split the "Target" column off the training features.
# `pop` removes the column and returns it in one step. The guard makes the cell
# safe to re-run: once the column is gone, `targets` is simply left as it is
# (the original raised KeyError on a second execution).
if "Target" in df_train.columns:
    targets = df_train.pop("Target")

In [5]:
# Feature columns excluded from all three sets.
COLUMNS_TO_DROP = ["21", "25", "33", "44", "59", "65", "70"]

# `errors="ignore"` keeps the cell re-runnable: columns that were already
# removed by a previous execution are skipped instead of raising KeyError.
df_base = df_base.drop(columns=COLUMNS_TO_DROP, errors="ignore")
df_train = df_train.drop(columns=COLUMNS_TO_DROP, errors="ignore")
df_validation = df_validation.drop(columns=COLUMNS_TO_DROP, errors="ignore")

# Нормализация данных 

In [None]:
# Data normalisation
def normalize_with_indices(df, scaler=None):
    """Standardise the columns of `df` while keeping its index and column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table to scale.
    scaler : fitted scaler, optional
        An already fitted scaler exposing `transform` (for example a
        `StandardScaler` fitted on the base set). When given, its statistics
        are applied to `df`, so that several sets can share one coordinate
        system. When omitted, a new `StandardScaler` is fitted on `df` itself,
        which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same index and columns as `df`.
    """
    if scaler is None:
        scaled_values = StandardScaler().fit_transform(df)
    else:
        scaled_values = scaler.transform(df)

    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

In [None]:
# Standardise all three sets with the statistics of the base set, so that the
# query vectors and the indexed base vectors share one coordinate system.
# Fitting a separate scaler per set (as before) shifts and rescales each set
# differently, which distorts the L2 distances between queries and base vectors.
base_scaler = StandardScaler().fit(df_base)


def _scale_like_base(frame):
    """Apply the base-set scaling to `frame`, keeping its index and columns."""
    return pd.DataFrame(base_scaler.transform(frame), index=frame.index, columns=frame.columns)


df_base_scaled = _scale_like_base(df_base)
df_train_scaled = _scale_like_base(df_train)
df_validation_scaled = _scale_like_base(df_validation)

In [None]:
# Inverted-file index over the scaled features: a flat L2 index is passed in
# as the quantizer and the vectors are spread over `n_cells` cells.
n_cells = 10
dims = len(df_base_scaled.columns)

quantizer = faiss.IndexFlatL2(dims)
idx_l2 = faiss.IndexIVFFlat(quantizer, dims, n_cells)

In [None]:
# Convert the scaled base matrix to a C-contiguous float32 array once and use
# it both to train the index and to populate it (the original converted the
# whole matrix twice).
base_vectors = np.ascontiguousarray(df_base_scaled).astype('float32')

idx_l2.train(base_vectors)
idx_l2.add(base_vectors)

# Release the temporary array now that it has been handed to the index.
del base_vectors

In [None]:
# Map each row position (the order in which rows were added to the index)
# back to the corresponding label of the base set.
base_index = dict(enumerate(df_base_scaled.index.to_list()))

In [None]:
%%time
# Look up the nearest base vectors for every scaled training query.
k_neighbours = 100
train_queries = np.ascontiguousarray(df_train_scaled.values).astype('float32')
# `search` returns a pair; the second element (`idx`) is used below as row
# positions into `base_index`.
vecs, idx = idx_l2.search(train_queries, k_neighbours)

In [None]:
# Percentage of training queries whose target is among the retrieved neighbours.
acc = 0
for target, neighbour_rows in zip(targets.values.tolist(), idx.tolist()):
    # FAISS pads a result row with -1 when fewer than k neighbours are found;
    # `.get` maps such entries to None instead of raising KeyError.
    retrieved_ids = [base_index.get(row) for row in neighbour_rows]
    acc += int(target in retrieved_ids)

print(100 * acc / len(idx))

In [None]:
# Preview the scaled training features. The original line had an unclosed
# parenthesis and referred to `train_data_normalized`, a name that is never
# defined; `df_train_scaled` is presumably the frame that was meant.
df_train_scaled.head()

In [None]:
# Build a second FAISS index, this time over the raw (unscaled) base features.
# NOTE: this rebinds `dims`, `n_cells`, `quantizer`, `idx_l2` and `base_index`
# from the earlier experiment.
dims = df_base.shape[1]  # was `base_data_normalized.shape[1]`, a name that is never defined
n_cells = 15  # number of cells
# Convert the base matrix once and reuse it for training and for filling the index.
base_vectors = np.ascontiguousarray(df_base.values).astype('float32')

quantizer = faiss.IndexFlatL2(dims)
idx_l2 = faiss.IndexIVFFlat(quantizer, dims, n_cells)
idx_l2.train(base_vectors[:50000, :])  # the index is trained on the first 50,000 rows only
idx_l2.add(base_vectors)
base_index = {k: v for k, v in enumerate(df_base.index.to_list())}

# Release the temporary array now that it has been handed to the index.
del base_vectors

In [None]:
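# Disabled on purpose: `targets` was already split off `df_train` in an earlier
# cell, so running these lines again would fail because "Target" is gone.
# targets = df_train["Target"]
# df_train.drop("Target", axis=1, inplace=True)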

In [None]:
# First stage: retrieve the 10 nearest base vectors for every training query.
k_first_stage = 10
# FAISS `search` returns (distances, row positions) in that order. The original
# bound the first element, i.e. the distances, to
# `train_nearest_indices_first_stage`.
train_distances_first_stage, train_nearest_indices_first_stage = idx_l2.search(
    np.ascontiguousarray(df_train.values).astype('float32'), k_first_stage)
# `idx` is what the recall check in the next cell reads.
idx = train_nearest_indices_first_stage

In [None]:
# Percentage of training queries whose target is among the first-stage neighbours.
acc = 0
for target, neighbour_rows in zip(targets.values.tolist(), idx.tolist()):
    # FAISS pads a result row with -1 when fewer than k neighbours are found;
    # `.get` maps such entries to None instead of raising KeyError.
    retrieved_ids = [base_index.get(row) for row in neighbour_rows]
    acc += int(target in retrieved_ids)

print(100 * acc / len(idx))

In [None]:
# Build the dataset for the second (CatBoost) stage.
# NOTE(review): this construction needs a redesign before the model can be
# meaningful; it is left as is because the fix spans several cells:
#   * every label is the query's own index value, and the same array is also
#     stacked as the first feature column, so the label is visible to the model;
#   * the classifier cell requests loss_function='Logloss', while these labels
#     take as many distinct values as `df_train.index` has.
train_data_catboost = []
train_labels_catboost = []
# The loop variable was previously named `idx`, which overwrote the neighbour
# matrix produced by the first-stage search.
for query_id, neighbors in zip(df_train.index, train_nearest_indices_first_stage):
    train_data_catboost.extend(neighbors)
    train_labels_catboost.extend([query_id] * len(neighbors))

train_data_catboost = np.column_stack((np.array(train_labels_catboost), np.array(train_data_catboost)))
train_labels_catboost = np.array(train_labels_catboost)

train_pool_catboost = Pool(data=train_data_catboost, label=train_labels_catboost)

In [None]:
# Fit the second-stage CatBoost model on the prepared training pool.
catboost_params = dict(
    iterations=100,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    verbose=10,
)
model_catboost = CatBoostClassifier(**catboost_params)
model_catboost.fit(train_pool_catboost)

In [None]:
# Retrieve the 5 nearest base vectors for every validation query.
k_second_stage = 5
# FAISS `search` returns (distances, row positions) in that order. The original
# stored the distances under the "indices" name and discarded the row
# positions into `_`.
validation_distances, validation_nearest_indices = idx_l2.search(
    np.ascontiguousarray(df_validation.values).astype('float32'), k_second_stage)

In [None]:
# Assemble the second-stage feature matrix for the validation queries:
# column 0 repeats each query's row position k_second_stage times,
# column 1 holds the flattened per-query search results.
query_positions = np.repeat(np.arange(len(df_validation)), k_second_stage)
flat_neighbours = validation_nearest_indices.flatten()
validation_data_catboost = np.column_stack((query_positions, flat_neighbours))

In [None]:
# Score every validation row and keep the second column of the probability matrix.
validation_probabilities = model_catboost.predict_proba(validation_data_catboost)
validation_predictions = validation_probabilities[:, 1]

In [None]:
# One query id per prediction row.
# `base_index` is keyed by FAISS row position and yields base-set labels, so
# looking the validation labels up in it (as the original did) uses the wrong
# table. The feature matrix repeats every query k_second_stage times in query
# order, so the ids are repeated the same way to stay aligned with the
# predictions.
# NOTE(review): assumes 'Id' is meant to be the query id, because later cells
# group by it and join the expected answers on it — confirm.
validation_indices = df_validation.index
validation_ids = np.repeat(validation_indices.to_numpy(), k_second_stage).tolist()

In [None]:
# Collect the predictions in a frame indexed by 'Id'.
validation_results = pd.DataFrame(
    {'Id': validation_ids, 'Predicted': validation_predictions}
).set_index('Id')

In [None]:
# Within each 'Id' group, sort the predicted probabilities in descending order
# and keep the index labels of the first k_second_stage entries.
# NOTE(review): the frame built above has only the 'Id' index and the
# 'Predicted' column, so `group.index` holds the group's own 'Id' labels; each
# list therefore repeats that 'Id' instead of naming matched base items. A
# column with the candidate ids would be needed to rank real matches.
validation_results['Top5Matches'] = validation_results['Predicted'].groupby('Id').apply(
    lambda group: group.sort_values(ascending=False).index[:k_second_stage].tolist())

In [None]:
# Attach the expected answers, aligned on the 'Id' index.
# The frame loaded from validation_answer.csv is named `df_validation_answer`;
# the original referred to `df_validation_answers`, which is never defined.
validation_results['Expected'] = df_validation_answer['Expected']


In [None]:
# Accuracy@5: share of rows whose expected item appears in the row's match list.
def _is_hit(row):
    """Return 1 when the row's 'Expected' value is in its 'Top5Matches', else 0."""
    return int(row['Expected'] in row['Top5Matches'])


validation_results['Top5Accuracy'] = validation_results.apply(_is_hit, axis=1)

accuracy_at_5 = validation_results['Top5Accuracy'].mean()
print(f'Accuracy@5: {accuracy_at_5:.4f}')