In [1]:
from neucube import Reservoir
from neucube.encoder import Probability
from neucube.validation import Pipeline
from neucube.sampler import SpikeCount
import torch

from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import download
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


In [2]:
device = "cuda" if torch.cuda.is_available else "cpu"

In [3]:
# Initialize the Porter Stemmer
download('punkt')
stemmer = PorterStemmer()

def preprocess_and_stem(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stems = [stemmer.stem(token) for token in tokens]
    preprocessed_text = ' '.join(stems)
    return preprocessed_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aleks\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
vectorizer = TfidfVectorizer(strip_accents="ascii", lowercase=True, preprocessor=preprocess_and_stem)
cats = ['comp.graphics','sci.med']
newsgroups_train = fetch_20newsgroups(subset='test', categories=cats)
vectors = vectorizer.fit_transform(newsgroups_train.data)
X = torch.FloatTensor(vectors.toarray())

In [5]:
X.shape

torch.Size([785, 16998])

In [6]:
encoder = Probability()
X = encoder.encode_dataset(X)
y = newsgroups_train.target

In [7]:
X.to(device)

tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0., 

In [8]:
kf = KFold(n_splits=4, shuffle=True, random_state=123)
y_total, pred_total = [],[]

for train_index, test_index in tqdm(kf.split(X)):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]

  res = Reservoir(inputs=X.shape[2])
  sam =  SpikeCount()
  clf = LogisticRegression(solver='liblinear')
  pipe = Pipeline(res, sam, clf)
  
  pipe.fit(X_train, y_train)
  pred = pipe.predict(X_test)

  y_total.extend(y_test)
  pred_total.extend(pred)

print(accuracy(y_total, pred_total))
print(confusion_matrix(y_total, pred_total))

4it [8:02:37, 7239.31s/it]

0.8878980891719745
[[343  46]
 [ 42 354]]



