In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

from datasets import load_dataset
from mfh.multi_feature_hashing import MultiFeatureHasher, dense_X, apply_mfh
from models import train_and_evaluate_model
from concurrency import parallel_map, parallel_map_2d

NUM_HASHES = 3
DATASET_NAME = 'news_category'
K_VALS = [10, 20]
# K_VALS = [2**i for i in range(6, 15)]

In [2]:
X, y = load_dataset(DATASET_NAME)

In [3]:
vec = TfidfVectorizer(stop_words='english')

X_bow = vec.fit_transform(X)
feature_names = vec.get_feature_names_out()
N, D = X_bow.shape
print(f'Number of samples: {N}, number of features: {D}')

Number of samples: 209527, number of features: 318358


In [4]:
from concurrent.futures import ThreadPoolExecutor
from typing import Callable, List, Any

def parallel_map(func: Callable, iterable: List[Any], *args, **kwargs) -> List[Any]:
    """
    Map a function to an iterable in parallel
    :param func: Function to map
    :param iterable: Iterable to map the function to
    :param args: Positional arguments for the function
    :param kwargs: Keyword arguments for the function
    :return: List of results
    """    
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(lambda x: func(x, *args, **kwargs), iterable))
    return results

In [5]:
mfhs = np.array([np.array([MultiFeatureHasher(feature_names=feature_names,
                                              n_features=K,
                                              n_hashes=n_hashes)
                           for n_hashes in range(1, NUM_HASHES + 1)])
                 for K in K_VALS])

X_dense_bow = dense_X(X_bow, feature_names)
print(f'Average number of non-zero features per sample: {np.average([len(x) for x in X_dense_bow]):.2f}')

Average number of non-zero features per sample: 29.26


In [6]:
hashed_Xs = parallel_map_2d(apply_mfh, mfhs, X_dense_bow)

### SGD Classifier

In [7]:
baseline_accuracy = train_and_evaluate_model(X_bow, y, model='sgd')
print(f"Accuracy without dimensionality reduction ({D} features): {baseline_accuracy:.4f}")

Accuracy without dimensionality reduction (318358 features): 0.5080


In [None]:
sgd_accuracies = parallel_map_2d(train_and_evaluate_model, hashed_Xs, y, 'sgd')
for K, accuracies in zip(K_VALS, sgd_accuracies):
    for i, accuracy in enumerate(accuracies):
        print(f"Accuracy with {K} features and {i+1} hash(es): {accuracy:.4f}")

# for K, Xs_k in zip(K_VALS, Xs):
#     print(f"Training with {K} features")
#     accuracies = parallel_map(train_and_evaluate_model, Xs_k, y, 'sgd')
#     sgd_accuracies[K_VALS.index(K)] = accuracies
#     for i, accuracy in enumerate(accuracies):
#         print(f"Accuracy with {K} features and {i+1} hash(es): {accuracy:.4f}")

In [7]:
Ks = [2**i for i in range(6, 18)] # Number of components s.t. K << D
Hs = range(1, 6) # Number of hashes for multi-feature hashing

mfh_accuracies = np.zeros((len(Ks), len(Hs)))

for K in Ks:
    print(f"K: {K}")
    # Feature Hashing
    # hasher = FeatureHasher(n_features=K, input_type='pair')
    # X_hashed = hasher.transform(X_hasher_input)
    # X_train_hashed, X_test_hashed, y_train, y_test = train_test_split(X_hashed, y, test_size=0.3, random_state=42)
    # fh_accuracy = train_and_evaluate_model_sgd(X_train_hashed, X_test_hashed, y_train, y_test)
    # fh_accuracies.append(fh_accuracy)
    
    # Multi-Feature Hashing
    Xs = multi_feature_hashing(X_hasher_input, K)
    train_test_tuples = [train_test_split(X, y, test_size=0.3, random_state=42) for X in Xs]
    accuracies = train_and_evaluate_model_sgd_parallel(train_test_tuples)
    for H in Hs:
        print(f"H: {H}, Accuracy: {accuracies[H-1]:.4f}")

    mfh_accuracies[Ks.index(K), :] = accuracies
    
    
    # Cuckoo Feature Hashing
    # X_cfh = cuckoo_feature_hashing(X_hasher_input, K, 10)
    # X_train_cfh, X_test_cfh, y_train, y_test = train_test_split(X_cfh, y, test_size=0.3, random_state=42)
    # cfh_accuracy = train_and_evaluate_model_sgd(X_train_cfh, X_test_cfh, y_train, y_test)
    # cfh_accuracies.append(cfh_accuracy)
    
    
    # PCA
    # pca = PCA(n_components=K)
    # X_pca = pca.fit_transform(X_BoW)
    # X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.3, random_state=42)
    # pca_accuracy = train_and_evaluate_model_sgd(X_train_pca, X_test_pca, y_train, y_test)
    # pca_accuracies.append(pca_accuracy)
    
    # Sparse Random Projection
    # srp = SparseRandomProjection(n_components=K)
    # X_srp = srp.fit_transform(X_BoW)
    # X_train_srp, X_test_srp, y_train, y_test = train_test_split(X_srp, y, test_size=0.3, random_state=42)
    # srp_accuracy = train_and_evaluate_model_sgd(X_train_srp, X_test_srp, y_train, y_test)
    # srp_accuracies.append(srp_accuracy)
    
    # Johnson-Lindenstrauss
    # jl = GaussianRandomProjection(n_components=K)
    # X_jl = jl.fit_transform(X_BoW)
    # X_train_jl, X_test_jl, y_train, y_test = train_test_split(X_jl, y, test_size=0.3, random_state=42)
    # jl_accuracy = train_and_evaluate_model_sgd(X_train_jl, X_test_jl, y_train, y_test)
    # jl_accuracies.append(jl_accuracy)
    
    # t-SNE
    # tsne = TSNE(n_components=K)
    # X_tsne = tsne.fit_transform(X_BoW)
    # X_train_tsne, X_test_tsne, y_train, y_test = train_test_split(X_tsne, y, test_size=0.3, random_state=42)
    # tsne_accuracy = train_and_evaluate_model_sgd(X_train_tsne, X_test_tsne, y_train, y_test)
    # tsne_accuracies.append(tsne_accuracy)
    
    # Autoencoder
    # Not implemented
    
    # print(f"K: {K}, "
    #         f"Accuracy (Feature Hashing): {fh_accuracy:.4f}, "
    #         f"Multi Feature Hashing: {mfh_accuracy:.4f}, "
    #         # f"Cuckoo Feature Hashing: {cfh_accuracy:.4f}"
    #         # f"Accuracy (PCA): {pca_accuracy:.4f}, "
    #         # f"Accuracy (Sparse Random Projection): {srp_accuracy:.4f}, "
    #         # f"Accuracy (Johnson-Lindenstrauss): {jl_accuracy:.4f}, "
    #         # f"Accuracy (t-SNE): {tsne_accuracy:.4f}"
    #         )
    

# Plot accuracies vs K
# plt.plot(Ks, fh_accuracies, label='Feature Hashing')
# plt.plot(Ks, mfh_accuracies, label='Multi-Feature Hashing')
# plt.plot(Ks, cfh_accuracies, label='Cuckoo Feature Hashing')
# plt.plot(Ks, pca_accuracies, label='PCA')
# plt.plot(Ks, srp_accuracies, label='Sparse Random Projection')
# plt.plot(Ks, jl_accuracies, label='Johnson-Lindenstrauss')
# plt.plot(Ks, tsne_accuracies, label='t-SNE')
# Plot all Hs
for H in Hs:
    plt.plot(Ks, mfh_accuracies[:, H-1], label=f'Multi-Feature Hashing (H={H})')
plt.axhline(y=baseline_accuracy, color='r', linestyle='--', label='Baseline')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

K: 64


NameError: name 'multi_feature_hashing' is not defined

Deprecated

In [None]:
def multi_feature_hashing(X_hasher_input, n_features, n_hashes):
    X_hashed = np.zeros((len(X_hasher_input), n_features))
    
    for i, row in enumerate(X_hasher_input):
        row_features, row_values = zip(*row)
        feature_indices = np.searchsorted(feature_names, row_features)
        
        hs = hashes[:n_hashes][:, feature_indices] % n_features
        v = np.array(row_values) * signs[feature_indices]
        
        for h in hs:
            np.add.at(X_hashed[i], h, v)

    return X_hashed

def cuckoo_feature_hashing(X_hasher_input, n_features, max_loop):
    X_hashed = np.zeros((len(X_hasher_input), n_features))
    buckets_1 = {f: (hashes_1[f] % n_features) for f in feature_names}
    buckets_2 = {f: (hashes_2[f] % n_features) for f in feature_names}
    for i, row in enumerate(X_hasher_input):
        for f, v in row.items():
            h1, h2, sign = buckets_1[f], buckets_2[f], feature_signs[f]
            value = sign * v
            assigned = False
            for _ in range(max_loop):
                if X_hashed[i, h1] == 0:  
                    X_hashed[i, h1] = value
                    assigned = True
                    break  
                else:
                    # Swap with existing value and move it to the alternative location
                    value, X_hashed[i, h1] = X_hashed[i, h1], value
                    h1, h2 = h2, (h1 + h2) % n_features  # Rehash to the next location
            if not assigned:
                X_hashed[i, h1] += value  
            
    return csr_matrix(X_hashed)