In [1]:
import torch
import pandas as pd
import re
import spacy
from torchtext.data.utils import get_tokenizer
import torch.nn as nn
import torch.nn.functional as F
import random
import torch.optim as optim
import torchtext
import time
from tqdm import tqdm
from models import MLPFeatureExtractor

In [2]:
import site
import os
os.environ['SP_DIR'] = '/opt/conda/lib/python3.11/site-packages'

accuracies = []

# nlp library of Pytorch
from torchtext import data

import warnings as wrn
wrn.filterwarnings('ignore')


SEED = 2021

torch.manual_seed(SEED)
torch.backends.cuda.deterministic = True

In [3]:
# train_data = pd.read_csv('data/dbpedia/train.csv', header=None, usecols=[0,2])
# train_data.columns = ['label', 'text']
# valid_data = pd.read_csv('data/dbpedia/valid.csv', header=None, usecols=[0,2])
# valid_data.columns = ['label', 'text']
# test_data = pd.read_csv('data/dbpedia/test.csv', header=None, usecols=[0,2])
# test_data.columns = ['label', 'text']

In [4]:
# def clean_text(text):
#     return re.sub(r'[^A-Za-z0-9]+', ' ', str(text))

In [5]:
# train_data['text'] = train_data['text'].apply(clean_text)
# valid_data['text'] = valid_data['text'].apply(clean_text)
# test_data['text'] = test_data['text'].apply(clean_text)

# train_data.to_csv('data/dbpedia/train_clean.csv', index=False, header=False)
# valid_data.to_csv('data/dbpedia/valid_clean.csv', index=False, header=False)
# test_data.to_csv('data/dbpedia/test_clean.csv', index=False, header=False)

In [6]:
cleaned_train_file = 'data/ag_news/train_clean.csv'
cleaned_valid_file = 'data/ag_news/valid_clean.csv'
cleaned_test_file = 'data/ag_news/test_clean.csv'

In [7]:
spacy_en = spacy.load('en_core_web_sm')

MAX_LEN=80

def spacy_tokenizer(text):
    tokens=[tok.text for tok in spacy_en.tokenizer(text)]
    return tokens[:MAX_LEN]

LABEL = data.LabelField()
TEXT = data.Field(tokenize=spacy_tokenizer, batch_first=True, include_lengths=True)
fields = [("label", LABEL), ("text", TEXT)]

training_data = data.TabularDataset(path=cleaned_train_file, format="csv", fields=fields, skip_header=True)
validation_data = data.TabularDataset(path=cleaned_valid_file, format="csv", fields=fields, skip_header=True)
test_data = data.TabularDataset(path=cleaned_test_file, format="csv", fields=fields, skip_header=True)

print(vars(training_data.examples[0]))

train_data,valid_data = training_data, validation_data

TEXT.build_vocab(train_data, 
                 max_size=40000, 
                 min_freq=5)

LABEL.build_vocab(train_data)
# Count the number of instances per class
label_counts = {LABEL.vocab.itos[i]: LABEL.vocab.freqs[LABEL.vocab.itos[i]] for i in range(len(LABEL.vocab))}
print("Number of instances per class:", label_counts)


print("Size of text vocab:",len(TEXT.vocab))

print("Size of label vocab:",len(LABEL.vocab))

TEXT.vocab.freqs.most_common(10)

# Creating GPU variable
device = torch.device("cuda")
#device = torch.device('cuda')
print(f'Using device: {device}')


BATCH_SIZE=32
print("Batch size initialized")
# Count the number of instances per class
label_counts = {LABEL.vocab.itos[i]: LABEL.vocab.freqs[LABEL.vocab.itos[i]] for i in range(len(LABEL.vocab))}
print("Number of instances per class:", label_counts)

{'label': '4', 'text': ['Apple', 'yesterday', 'released', 'a', '12', '7', 'MB', 'security', 'update', 'that', 'consists', 'of', 'several', 'revised', 'components', 'including', 'Apache', 'AppKit', 'HIToolbox', 'Kerberos', 'Postfix', 'PSNormalizer', 'Safari', 'and', 'Terminal']}
Number of instances per class: {'2': 30000, '3': 30000, '4': 30000, '1': 29999}
Size of text vocab: 27723
Size of label vocab: 4
Using device: cuda
Batch size initialized
Number of instances per class: {'2': 30000, '3': 30000, '4': 30000, '1': 29999}


In [8]:
train_iterator,validation_iterator = data.BucketIterator.splits(
    (train_data,valid_data),
    batch_size = BATCH_SIZE,
    # Sort key is how to sort the samples
    sort_key = lambda x:len(x.text),
    sort_within_batch = True,
    device = device
)

test_iterator = data.BucketIterator(
    test_data,
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    device=device
)

In [9]:
# for i in range(10):  # Print 5 samples
#     text, label = train_dataset[i]  # Index directly
#     print(f"Sample {i + 1}:")
#     print(f"Text: {text}")
#     print(f"Label: {label}")
#     print("-" * 40)

In [10]:
def multi_class_accuracy(preds, y):
    _, predicted = torch.max(preds, 1)
    correct = (predicted == y).float()
    acc = correct.sum() / len(correct)
    return acc

def train(model,iterator,optimizer,criterion):
    
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    model.train()
    
    for batch in iterator:
        
        # cleaning the cache of optimizer
        optimizer.zero_grad()
        
        text,text_lengths = batch.text
        #print("Text Length:", text_lengths[0].item())
        global BATCH_SIZE
        #BATCH_SIZE=text_lengths[0].item()
        #print("Sent Length:", BATCH_SIZE)
        #print("Iterator Batch Size:", batch.batch_size)
        batch.batch_size=BATCH_SIZE
        #print("Iterator Batch Size:", batch.batch_size)
        iterator = data.BucketIterator(
            train_data,
            batch_size=BATCH_SIZE,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
            device=device
        )
        
        # forward propagation and squeezing
        predictions = model(text).squeeze()
        
        # computing loss / backward propagation
        loss = criterion(predictions, batch.label)
        #loss = criterion(predictions,batch.type)
        loss.backward()
        
        # accuracy
        acc = multi_class_accuracy(predictions,batch.label)
        
        # updating params
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    # It'll return the means of loss and accuracy
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model,iterator,criterion):
    
    epoch_loss = 0.0
    epoch_acc = 0.0
    
    # deactivate the dropouts
    model.eval()
    
    # Sets require_grad flat False
    with torch.no_grad():
        for batch in tqdm(iterator, desc="Evaluating"):
            text,text_lengths = batch.text
            
            predictions = model(text).squeeze()

            #print(predictions.shape)
              
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = multi_class_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [11]:
SIZE_OF_VOCAB = len(TEXT.vocab)
EMBEDDING_DIM = 100
NUM_HIDDEN_NODES = 100
NUM_OUTPUT_NODES = len(LABEL.vocab)
NUM_LAYERS = 1
BIDIRECTION = False
DROPOUT = 0.2
BIT_WIDTH = 4  # This is unused in the current model but can be integrated later
#class_priors = [0.25, 0.25, 0.25, 0.25]
# Initialize model
model = MLPFeatureExtractor(SIZE_OF_VOCAB, NUM_HIDDEN_NODES, NUM_OUTPUT_NODES)

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [12]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [13]:
# Dynamically generate the filename
best_model_path = f"MLPFeatureExtractor_ag_news_spacy.pth"

# # Initialize variables to track the best loss
# best_valid_loss = float('inf')

# num_epochs=100
# # Training Loop
# for epoch in range(1,num_epochs+1):
    
#     print("======================================================")
#     print("Epoch: %d" %epoch)
#     print("======================================================")
    
#     start_time = time.time()
    
#     train_loss,train_acc = train(model,train_iterator,optimizer,criterion)
    
#     valid_loss,valid_acc = evaluate(model,validation_iterator,criterion)
    
#     end_time = time.time()
#     epoch_duration = end_time - start_time
#     # Showing statistics
#     print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
#     print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
#     print(f'\tTime taken for epoch: {epoch_duration:.2f} seconds\n')
#     print()


# torch.save(model.state_dict(), best_model_path)
# print(f"New best model saved at epoch {epoch+1} with validation loss {valid_loss:.4f}")
#Load the best model after training
print(f"Loading the best model from {best_model_path}")
model.load_state_dict(torch.load(best_model_path))

Loading the best model from MLPFeatureExtractor_ag_news_spacy.pth


<All keys matched successfully>

In [14]:
# test_loss, test_acc = evaluate(model,test_iterator,criterion)
# print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")

In [15]:
def extract_features(model, iterator, device):
    model.eval()
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for batch in iterator:
            text,text_lengths = batch.text
            labels = batch.label
            features = model(text)
            all_features.append(features.cpu())
            all_labels.append(labels.cpu())
    
    all_features = torch.cat(all_features, dim=0).numpy()
    all_labels = torch.cat(all_labels, dim=0).numpy()
    return all_features, all_labels

In [16]:
train_features, train_labels = extract_features(model, train_iterator, device)
valid_features, valid_labels = extract_features(model, validation_iterator, device)
test_features, test_labels = extract_features(model, test_iterator, device)

In [17]:
from sklearn.preprocessing import MinMaxScaler

# Normalize features
scaler = MinMaxScaler()
train_features = scaler.fit_transform(train_features)
valid_features = scaler.transform(valid_features)
test_features = scaler.transform(test_features)

In [18]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score

nb_model = MultinomialNB()
nb_model.fit(train_features, train_labels)

valid_preds = nb_model.predict(valid_features)
valid_accuracy = accuracy_score(valid_labels, valid_preds)
print(f"Validation Accuracy with Naive Bayes: {valid_accuracy:.4f}")

test_preds = nb_model.predict(test_features)
test_accuracy = accuracy_score(test_labels, test_preds)
print(f"Test Accuracy with Naive Bayes: {test_accuracy:.4f}")

Validation Accuracy with Naive Bayes: 0.9816
Test Accuracy with Naive Bayes: 0.8680


In [19]:
from brevitas.graph.quantize import preprocess_for_quantize
from ptq_common import quantize_model, apply_bias_correction, apply_act_equalization

In [20]:
pre_model = preprocess_for_quantize(
            model,
            equalize_iters=20,
            equalize_merge_bias=True,
            merge_bn=True,
            channel_splitting_ratio=0.0,
            channel_splitting_split_input=False)

In [21]:
dtype = getattr(torch, 'float')
print(device)
quant_model = quantize_model(
        pre_model.to(device),
        dtype=dtype,
        device=device,
        backend='layerwise',
        scale_factor_type='float_scale',
        bias_bit_width=32,
        weight_bit_width=BIT_WIDTH,
        weight_narrow_range=False,
        weight_param_method='stats',
        weight_quant_granularity='per_tensor',
        weight_quant_type='sym',
        layerwise_first_last_bit_width=BIT_WIDTH,
        act_bit_width=BIT_WIDTH,
        act_param_method='stats',
        act_quant_percentile=99.9,
        act_quant_type='sym',
        quant_format='int',
        layerwise_first_last_mantissa_bit_width=4,
        layerwise_first_last_exponent_bit_width=3,
        weight_mantissa_bit_width=4,
        weight_exponent_bit_width=3,
        act_mantissa_bit_width=4,
        act_exponent_bit_width=3).to(device) 

cuda


In [26]:
quant_model_path = f"MLPNBQuantResults_ag_news/ModelParameter_Disc_3bit_0.25_0.25_0.25_0.25.pth"
print(f"Loading the best model from {quant_model_path}")
quant_model.load_state_dict(torch.load(quant_model_path), strict=False)

Loading the best model from MLPNBQuantResults_ag_news/ModelParameter_Disc_3bit_0.25_0.25_0.25_0.25.pth


_IncompatibleKeys(missing_keys=[], unexpected_keys=['fc1.weight_orig', 'fc2.weight_orig'])

In [27]:
test_loss, test_acc = evaluate(quant_model,test_iterator,criterion)
print(f"Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")

Evaluating: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 238/238 [00:00<00:00, 249.17it/s]

Test Loss: 2.7274, Test Acc: 0.8673





In [28]:
test_features, test_labels = extract_features(quant_model, test_iterator, device)

In [29]:
test_features, test_labels = extract_features(quant_model, test_iterator, device)
test_preds = nb_model.predict(test_features)
final_test_accuracy = accuracy_score(test_labels, test_preds)
print(final_test_accuracy)

0.8605079615738913
