In [None]:
import torch
from datasets.car_evaluation import CarEvaluation
from binhd.embeddings import CategoricalEncoder
from module.record_encoder import RecordEncoder
from binhd.classifiers import BinHD
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import torchhd



In [101]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using {} device".format(device))

Using cpu device


In [102]:
dataset = CarEvaluation()
dimension = 10000
num_levels = 100
batch_size = 1000
low = 0
high = num_levels
oper = "bind"

In [103]:
from sklearn.calibration import LabelEncoder


X = dataset.features[dataset.categorical_features]
num_categories_per_feature = [X[col].nunique() for col in X.columns]

y = dataset.targets
le = LabelEncoder()
y_encoded = torch.tensor(le.fit_transform(y))

categorical_encoder = CategoricalEncoder(dimension)
X = categorical_encoder.fit_transform(X)

  y = column_or_1d(y, warn=True)


In [104]:
record_encoder = RecordEncoder(
    out_features=dimension,
    size=X.shape[1], 
    levels=num_levels,
    low=low,
    high=high
)

In [105]:
from module.record_encoder import NGramEncoder


ngram_encoder = NGramEncoder(
    out_features=dimension,
    levels=num_levels,
    low=low,
    high=high
)


In [106]:
y_encoded = torch.tensor(y_encoded).to(device)

print("Distribuição das classes:")
print(torch.bincount(y_encoded))

Distribuição das classes:
tensor([ 384,   69, 1210,   65])


  y_encoded = torch.tensor(y_encoded).to(device)


In [107]:
with torch.no_grad():
    print(X.dtypes)

    samples = torch.tensor(X.values).to(device)
    labels = torch.tensor(y_encoded).to(device)

    # CategoricalEncoder
    X_categorical_encoder = categorical_encoder(samples.clone())

    # RecordEncoder
    X_record_encoder = record_encoder(samples.clone())

    # NGramEncoder
    X_ngram_encoder = ngram_encoder(samples.clone(), oper=oper)


  labels = torch.tensor(y_encoded).to(device)


buying      int32
maint       int32
doors       int32
persons     int32
lug_boot    int32
safety      int32
dtype: object


In [108]:
model = BinHD(dimension, dataset.num_classes)

In [109]:
X_categorical_encoder_fit = torchhd.multibundle(X_categorical_encoder) 

X_train_categorical, X_test_categorical, y_train_categorical, y_test_categorical = train_test_split(X_categorical_encoder_fit, labels, test_size=0.3, random_state = 0) 
X_train_ngram, X_test_ngram, y_train_ngram, y_test_ngram = train_test_split(X_ngram_encoder, labels, test_size=0.3, random_state = 0) 
X_train_record, X_test_record, y_train_record, y_test_record = train_test_split(X_record_encoder, labels, test_size=0.3, random_state = 0) 


In [110]:
with torch.no_grad():

    model.fit(X_train_categorical,y_train_categorical)
    predictions = model.predict(X_test_categorical.to(torch.int8))  
    acc = accuracy_score(predictions, y_test_categorical)
    print("BinHD Categorical Encoder: Accuracy = ", acc)

    model.fit(X_train_record,y_train_record)
    predictions = model.predict(X_test_record.to(torch.int8))  
    acc = accuracy_score(predictions, y_test_record)
    print("BinHD Record Encoder: Accuracy = ", acc)

    model.fit(X_train_ngram,y_train_ngram)
    predictions = model.predict(X_test_ngram.to(torch.int8))  
    acc = accuracy_score(predictions, y_test_ngram)
    print("BinHD Ngram Encoder: Accuracy = ", acc)


BinHD Categorical Encoder: Accuracy =  0.4123314065510597
BinHD Record Encoder: Accuracy =  0.31021194605009633
BinHD Ngram Encoder: Accuracy =  0.26204238921001927
