In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import label_ranking_average_precision_score
from codecarbon import EmissionsTracker

In [2]:
dataset = "coveo"   # coveo, diginetica, rees46, retailrocket, yoochoose
dataset_path = f"../datasets/{dataset}"

In [3]:
train_path = os.path.join(dataset_path,f"{dataset}_processed_view_train_full.tsv")
test_path = os.path.join(dataset_path,f"{dataset}_processed_view_test.tsv")

In [4]:
def load_data(train_path, test_path):
    train_data = pd.read_csv(train_path, delimiter='\t')
    test_data = pd.read_csv(test_path, delimiter='\t')
    return train_data, test_data

def preprocess_data(train_data, test_data, k=20):
    X_train = train_data.drop(columns=['SessionId', 'ItemId', 'Time']).values.astype(np.float32)
    y_train = train_data['ItemId'].values.astype(np.float32)
    X_test = test_data.drop(columns=['SessionId', 'ItemId', 'Time']).values.astype(np.float32)
    y_test = test_data['ItemId'].values.astype(np.float32)

    # Imputar valores faltantes con KNNImputer
    imputer = KNNImputer(n_neighbors=k)
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Escalar los datos
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Reducir dimensionalidad con PCA
    pca = PCA(n_components=100)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)

    return X_train, y_train, X_test, y_test

def train_knn(X_train, y_train):
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)
    return knn

def evaluate_model(model, X_test, y_test, output_file):
    y_pred = model.predict(X_test)
    recall = recall_score(y_test, y_pred, average='macro')
    
    # Calcular Mean Reciprocal Rank (MRR)
    y_test_binary = label_binarize(y_test, classes=np.unique(y_test))
    y_pred_proba = model.predict_proba(X_test)
    mrr = label_ranking_average_precision_score(y_test_binary, y_pred_proba)

    with open(output_file, 'w') as f:
        f.write(f'Recall: {recall}\n')
        f.write(f'MRR: {mrr}\n')

In [5]:
train_data, test_data = load_data(train_path, test_path)
X_train, y_train, X_test, y_test = preprocess_data(train_data, test_data)

codecarbon_tracker = EmissionsTracker()
codecarbon_tracker.start()
knn_model = train_knn(X_train, y_train)
emissions = codecarbon_tracker.stop()
print(f"CO2 emissions: {emissions} kg")

evaluate_model(knn_model, X_test, y_test, f'./knn_sklearn_{dataset}.txt')

ValueError: Found array with 0 feature(s) (shape=(1411113, 0)) while a minimum of 1 is required by KNNImputer.