In [13]:
from neucube import Reservoir
from neucube.encoder import Probability
from neucube.validation import Pipeline
from neucube.sampler import SpikeCount
import torch

from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from xgboost import XGBClassifier
from tqdm import tqdm

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import download
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import TruncatedSVD

In [14]:
device = "cuda" if torch.cuda.is_available else "cpu"

In [15]:
# Initialize the Porter Stemmer
download('punkt')
stemmer = PorterStemmer()
stop_words = ["a", "an", "and", "are","as","at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not","of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]
def preprocess_and_stem(text):
    text = text.lower()
    tokens = word_tokenize(text)
    stems = [stemmer.stem(token) for token in tokens]
    preprocessed_text = ' '.join(stems)
    return preprocessed_text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aleks\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
vectorizer = TfidfVectorizer(strip_accents="ascii", lowercase=True, preprocessor=preprocess_and_stem, stop_words=stop_words)
cats = ['comp.graphics','sci.med']   #, 'rec.motorcycles']  #, 'talk.politics.guns', 'soc.religion.christian']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape



(1178, 19973)

In [17]:
n_components = 800  # Adjust the number of components as needed
svd = TruncatedSVD(n_components=n_components, random_state=123)
X = svd.fit_transform(vectors)

In [18]:
X = torch.FloatTensor(X)
X.shape

torch.Size([1178, 200])

In [19]:
encoder = Probability(iterations=500)
X = encoder.encode_dataset(X)
y = newsgroups_train.target
y = torch.FloatTensor(y)

In [20]:
X.to(device)
y.to(device)

tensor([0., 1., 0.,  ..., 1., 0., 0.], device='cuda:0')

In [None]:
kf = KFold(n_splits=4, shuffle=True, random_state=123)
y_total, pred_total = [],[]

for train_index, test_index in tqdm(kf.split(X)):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y[train_index], y[test_index]

  res = Reservoir(inputs=X.shape[2])
  sam =  SpikeCount()
  #clf = LogisticRegression(solver='liblinear')
  #clf = RandomForestClassifier()
  #clf = XGBClassifier()
  #clf = MultinomialNB()
  clf = SVC(kernel='linear')
  pipe = Pipeline(res, sam, clf)
  
  pipe.fit(X_train, y_train, train=False)
  pred = pipe.predict(X_test)

  y_total.extend(y_test)
  pred_total.extend(pred)

print(accuracy(y_total, pred_total))
print(confusion_matrix(y_total, pred_total))