In [1]:
%load_ext cython
import os
import json
import numpy as np
import pandas as pd
from scipy.sparse import vstack, csr_matrix, lil_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.decomposition import PCA
from sklearn.random_projection import SparseRandomProjection
from sklearn.random_projection import GaussianRandomProjection
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import mmh3


In [2]:
def _fit_and_score(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and return its accuracy on the test split."""
    clf.fit(X_train, y_train)
    return accuracy_score(y_test, clf.predict(X_test))


def train_and_evaluate_model_sgd(X_train, X_test, y_train, y_test, random_state=42):
    """Train a logistic-regression SGD classifier and return its test accuracy.

    Parameters
    ----------
    X_train, X_test : feature matrices for the training and test splits.
    y_train, y_test : labels for the training and test splits.
    random_state : seed forwarded to ``SGDClassifier``. A fixed default keeps
        accuracies comparable between runs; pass ``None`` for an unseeded
        estimator.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    sgd_clf = SGDClassifier(loss='log_loss', random_state=random_state)
    return _fit_and_score(sgd_clf, X_train, X_test, y_train, y_test)


def train_and_evaluate_model_knn(X_train, X_test, y_train, y_test):
    """Train a default ``KNeighborsClassifier`` and return its test accuracy."""
    return _fit_and_score(KNeighborsClassifier(), X_train, X_test, y_train, y_test)


def train_and_evaluate_model_svm(X_train, X_test, y_train, y_test, random_state=42):
    """Train a ``LinearSVC`` and return its test accuracy.

    ``random_state`` is forwarded to ``LinearSVC``; pass ``None`` for an
    unseeded estimator.
    """
    svm_clf = LinearSVC(random_state=random_state)
    return _fit_and_score(svm_clf, X_train, X_test, y_train, y_test)

In [3]:
# Read the JSON-lines dataset piece by piece (1000 records at a time) and
# stitch the pieces back into a single frame.
chunk_size = 1000
chunks = list(pd.read_json('News_Category_Dataset_v3.json', lines=True, chunksize=chunk_size))
dataset_df = pd.concat(chunks, ignore_index=True)

# Every column except the label is stringified per row and space-joined into
# one text field; that field is the model input and 'category' is the target.
text_columns = [col for col in dataset_df.columns if col != 'category']
dataset_df['content'] = dataset_df[text_columns].apply(
    lambda row: ' '.join(row.astype(str)),
    axis=1,
)
X = dataset_df['content']
y = dataset_df['category']

# TF-IDF bag of words (English stop words removed), stored as float32.
vectorizer = TfidfVectorizer(stop_words='english')
X_BoW = vectorizer.fit_transform(X).astype(np.float32)
feature_names = vectorizer.get_feature_names_out()
N, D = X_BoW.shape
print(f'Number of samples: {N}, number of features: {D}')

Number of samples: 209527, number of features: 318358


### SGD Classifier

In [4]:
# Reference point: fit on the full TF-IDF matrix, with no hashing or projection.
baseline_split = train_test_split(X_BoW, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = baseline_split
baseline_accuracy = train_and_evaluate_model_sgd(*baseline_split)
print(f"Accuracy without dimensionality reduction ({X_BoW.shape[1]} features): {baseline_accuracy:.4f}")

Accuracy without dimensionality reduction (318358 features): 0.5088


In [5]:
def get_hasher_input(X=None, names=None):
    """Convert a sparse matrix into per-row lists of ``(feature_name, value)`` pairs.

    Parameters
    ----------
    X : scipy sparse matrix, optional
        Matrix to convert. Defaults to the notebook-level ``X_BoW``.
    names : sequence, optional
        ``names[j]`` is the name of column ``j``. Defaults to the
        notebook-level ``feature_names``.

    Returns
    -------
    list of list of tuple
        One inner list per row of ``X``, holding a ``(name, value)`` pair for
        every stored entry of that row, in storage order. Rows without stored
        entries yield an empty list.
    """
    X = (X_BoW if X is None else X).tocsr()
    names = feature_names if names is None else names

    # Read the CSR buffers directly: row i owns the half-open slice
    # indptr[i]:indptr[i + 1] of ``indices``/``data``. This avoids building
    # two temporary one-row sparse matrices per document via ``X[i]``.
    indptr, indices, data = X.indptr, X.indices, X.data

    X_hasher_input = []
    for start, stop in zip(indptr[:-1], indptr[1:]):
        cols = indices[start:stop]
        vals = data[start:stop]
        X_hasher_input.append([(names[j], v) for j, v in zip(cols, vals)])
    return X_hasher_input

# Build the per-document (feature, weight) pair lists once; later cells reuse them.
X_hasher_input = get_hasher_input()
# Mean number of (feature, weight) pairs per document.
np.average(list(map(len, X_hasher_input)))

np.float64(29.25994263269173)

In [6]:
%%cython
import numpy as np
cimport numpy as np

# Element type of the hashed output matrix.
ctypedef np.float64_t DTYPE_t
# Element type of the precomputed hash and sign arrays.
ctypedef np.int64_t ITYPE_t

# Multi-feature hashing: every (feature, value) pair of a sample is multiplied
# by that feature's sign and added into one output bucket per hash row, the
# bucket being `hash % n_features`.
#
# Parameters
#   X_hasher_input : list with one entry per sample; each entry is a list of
#                    (feature_name, value) tuples.
#   n_features     : width of the output matrix (number of buckets).
#   hashes         : int64 array, one row per hash function and one column per
#                    entry of `feature_names`.
#   signs          : int64 array with one multiplier per entry of `feature_names`.
#   feature_names  : vocabulary; a feature's position in this list is the index
#                    used to look up `hashes` and `signs`.
#
# Returns a dense float64 array of shape (len(X_hasher_input), n_features).
# A feature name that is not in `feature_names` fails the dict lookup (KeyError).
#
# NOTE(review): the result is allocated densely, so memory grows as
# n_samples * n_features * 8 bytes - confirm this fits for the largest K used.
def multi_feature_hashing_cy(
    list X_hasher_input,
    int n_features,
    np.ndarray[ITYPE_t, ndim=2] hashes,     # Shape: (n_hashes, len(feature_names))
    np.ndarray[ITYPE_t, ndim=1] signs,      # Shape: (len(feature_names),)
    list feature_names                      # Vocabulary; list position = feature index
):
    cdef:
        int i, j, k
        int n_samples = len(X_hasher_input)
        int n_hashes = hashes.shape[0]
        np.ndarray[DTYPE_t, ndim=2] X_hashed = np.zeros((n_samples, n_features), dtype=np.float64)
        # Typed memoryviews over the arrays used inside the loops below.
        DTYPE_t[:, :] X_hashed_view = X_hashed
        ITYPE_t[:, :] hashes_view = hashes
        ITYPE_t[:] signs_view = signs
        
        list row
        tuple feature_value
        object feature
        int feat_idx
        DTYPE_t value
        ITYPE_t hash_val
        
        # name -> position in feature_names; rebuilt on every call.
        dict feature_map = {name: idx for idx, name in enumerate(feature_names)}
    
    for i in range(n_samples):
        row = X_hasher_input[i]
        
        for j in range(len(row)):
            feature_value = row[j]
            feature = feature_value[0]
            feat_idx = feature_map[feature]

            # Apply the per-feature sign once; the same signed value goes to every bucket.
            value = feature_value[1] * signs_view[feat_idx]
            
            for k in range(n_hashes):
                # Fold the k-th hash of this feature into the bucket range.
                hash_val = hashes_view[k, feat_idx] % n_features
                # Accumulate: colliding features (or hashes) add up in the same bucket.
                X_hashed_view[i, hash_val] += value
    
    return X_hashed

In [7]:
# Seed NumPy's global RNG so the sign vector below is the same on every run.
np.random.seed(0)

# One multiplier, drawn from {-1, +1}, per vocabulary entry.
signs = np.random.choice([-1, 1], size=len(feature_names)).astype(np.int64)

# hashes[seed, j] is the unsigned MurmurHash3 of feature j's UTF-8 bytes under
# that seed; five seeds (0..4) are precomputed.
hashes = np.array(
    [
        [mmh3.hash(name.encode('utf-8'), seed, signed=False) for name in feature_names]
        for seed in range(5)
    ],
    dtype=np.int64,
)

def multi_feature_hashing_new(X_hasher_input, n_features, n_hashes):
    """Hash every sample into ``n_features`` buckets using ``n_hashes`` hash rows.

    Thin wrapper around ``multi_feature_hashing_cy`` that supplies the
    notebook-level ``hashes``, ``signs`` and ``feature_names``.

    Parameters
    ----------
    X_hasher_input : list of per-sample lists of ``(feature_name, value)`` pairs.
    n_features : number of output buckets.
    n_hashes : how many of the precomputed hash rows to use.

    Returns
    -------
    Whatever ``multi_feature_hashing_cy`` returns for these arguments.

    Raises
    ------
    ValueError
        If ``n_hashes`` is not between 1 and the number of rows in ``hashes``.
        Without this check, ``hashes[:n_hashes]`` would silently use fewer
        hash rows than requested (or drop rows for a negative value).
    """
    available = hashes.shape[0]
    if not 1 <= n_hashes <= available:
        raise ValueError(
            f"n_hashes must be between 1 and {available} "
            f"(the number of precomputed hash seeds), got {n_hashes}"
        )
    return multi_feature_hashing_cy(
        X_hasher_input,
        n_features,
        hashes[:n_hashes],
        signs,
        list(feature_names)
    )

In [None]:
Ks = [2**i for i in range(6, 18)]  # Target dimensionalities: powers of two from 2**6 to 2**17
Hs = range(1, 4)  # Number of hash functions per feature for multi-feature hashing

# mfh_accuracies[k_idx, h_idx] holds the test accuracy for K = Ks[k_idx], H = Hs[h_idx].
mfh_accuracies = np.zeros((len(Ks), len(Hs)))

# Only multi-feature hashing is evaluated in this cell.
# NOTE(review): multi_feature_hashing_cy allocates a dense (n_samples, K) float64
# array, so memory use grows linearly with K - confirm the largest K fits in RAM.
for k_idx, K in enumerate(Ks):
    print(f"K: {K}")
    for h_idx, H in enumerate(Hs):
        X_mfh = multi_feature_hashing_new(X_hasher_input, K, H)
        # Same test_size and random_state as the baseline split.
        X_train_mfh, X_test_mfh, y_train, y_test = train_test_split(X_mfh, y, test_size=0.3, random_state=42)
        mfh_accuracy = train_and_evaluate_model_sgd(X_train_mfh, X_test_mfh, y_train, y_test)
        mfh_accuracies[k_idx, h_idx] = mfh_accuracy
        print(f"H: {H}, Accuracy: {mfh_accuracy:.4f}")

# Accuracy vs. K, one curve per H. Columns are addressed by position so the
# plot stays correct if Hs no longer starts at 1.
for h_idx, H in enumerate(Hs):
    plt.plot(Ks, mfh_accuracies[:, h_idx], label=f'Multi-Feature Hashing (H={H})')
plt.axhline(y=baseline_accuracy, color='r', linestyle='--', label='Baseline')
plt.xscale('log', base=2)  # Ks are powers of two; a linear axis would squash the small-K points
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.title('SGD accuracy vs. hashed dimensionality')
plt.legend()
plt.show()

K: 64
H: 1, Accuracy: 0.1951
H: 2, Accuracy: 0.2015
H: 3, Accuracy: 0.2019
K: 128
H: 1, Accuracy: 0.2329
H: 2, Accuracy: 0.2431
H: 3, Accuracy: 0.2497
K: 256
H: 1, Accuracy: 0.2909


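Deprecated: earlier pure-Python hashing implementations, kept for reference only. The experiments above use `multi_feature_hashing_new` instead.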

In [None]:
def multi_feature_hashing(X_hasher_input, n_features, n_hashes):
    """Pure-NumPy multi-feature hashing (superseded by the Cython version).

    For every sample, each ``(feature_name, value)`` pair is multiplied by the
    feature's entry in the notebook-level ``signs`` and added into one bucket
    per hash row, the bucket being ``hashes[k, feature] % n_features``.

    Parameters
    ----------
    X_hasher_input : list of per-sample sequences of ``(feature_name, value)`` pairs.
    n_features : number of output buckets (columns of the result).
    n_hashes : number of leading rows of ``hashes`` to use.

    Returns
    -------
    numpy.ndarray
        Dense array of shape ``(len(X_hasher_input), n_features)``. Samples
        with no pairs stay all-zero.

    Notes
    -----
    Feature positions are found with ``np.searchsorted`` on ``feature_names``.
    NOTE(review): this assumes ``feature_names`` is sorted and contains every
    feature that appears in the input - confirm against the caller.
    """
    X_hashed = np.zeros((len(X_hasher_input), n_features))
    active_hashes = hashes[:n_hashes]  # loop-invariant: the hash rows actually used

    for i, row in enumerate(X_hasher_input):
        # An empty sample has nothing to hash; zip(*row) would fail to unpack.
        if len(row) == 0:
            continue

        row_features, row_values = zip(*row)
        feature_indices = np.searchsorted(feature_names, row_features)

        hs = active_hashes[:, feature_indices] % n_features
        v = np.array(row_values) * signs[feature_indices]

        for h in hs:
            # np.add.at accumulates correctly when several features share a bucket.
            np.add.at(X_hashed[i], h, v)

    return X_hashed

def cuckoo_feature_hashing(X_hasher_input, n_features, max_loop):
    """Hash each sample into ``n_features`` buckets with a cuckoo-style eviction loop.

    Every feature has two candidate buckets. A signed value is written to its
    first candidate if that bucket currently holds 0; otherwise it takes the
    bucket, and the evicted value is retried at the next candidate, for at
    most ``max_loop`` attempts. A value still unplaced after ``max_loop``
    attempts is added onto the bucket reached last.

    Parameters
    ----------
    X_hasher_input : list of samples. Each sample is either a mapping
        ``{feature_name: value}`` or a sequence of ``(feature_name, value)``
        pairs.
    n_features : number of output buckets (columns of the result).
    max_loop : maximum number of placement attempts per feature.

    Returns
    -------
    scipy.sparse.csr_matrix of shape ``(len(X_hasher_input), n_features)``.

    Notes
    -----
    NOTE(review): ``hashes_1``, ``hashes_2`` and ``feature_signs`` are read as
    per-feature-name lookups but are not defined in the visible notebook; they
    must exist before this function is called.
    """
    X_hashed = np.zeros((len(X_hasher_input), n_features))
    buckets_1 = {f: (hashes_1[f] % n_features) for f in feature_names}
    buckets_2 = {f: (hashes_2[f] % n_features) for f in feature_names}
    for i, row in enumerate(X_hasher_input):
        # Accept dict rows as well as the (name, value) pair lists that
        # get_hasher_input produces; calling .items() on a list would fail.
        pairs = row.items() if hasattr(row, 'items') else row
        for f, v in pairs:
            h1, h2, sign = buckets_1[f], buckets_2[f], feature_signs[f]
            value = sign * v
            assigned = False
            for _ in range(max_loop):
                if X_hashed[i, h1] == 0:
                    X_hashed[i, h1] = value
                    assigned = True
                    break
                else:
                    # Swap with existing value and move it to the alternative location
                    value, X_hashed[i, h1] = X_hashed[i, h1], value
                    h1, h2 = h2, (h1 + h2) % n_features  # Rehash to the next location
            if not assigned:
                # Out of attempts: fold the leftover value into the last bucket tried.
                X_hashed[i, h1] += value

    return csr_matrix(X_hashed)