# Hugging Face Model Evaluation Notebook

#### Evaluating the locally saved model

In [16]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import import_ipynb
from Utils import extract_chunk
import json
import os

# Read the settings stored in BERT_best_model_parameters.json
with open('BERT_best_model_parameters.json', 'r') as file:
    best_hyperparameters = json.loads(file.read())

# Both values are passed to extract_chunk when the sample document is prepared
CHUNK_SIZE, SENTENCE_ALIGNMENT = (
    best_hyperparameters["CHUNK_SIZE"],
    best_hyperparameters["SENTENCE_ALIGNMENT"],
)

# Ordered sequence of class names; position i is the label for class index i.
# This was previously a set literal, whose iteration order for strings varies
# between Python processes (hash randomisation), so the index -> name mapping
# built with enumerate() could change from one kernel start to the next.
# NOTE(review): the order below follows the original literal — confirm it
# matches the label indices the model was trained with.
CATEGORY_MAP = (
    'Philosophy',
    'Geography',
    'Social Sciences',
    'Politics',
)

# Restore the tokenizer and classifier from the local checkpoint directory
model_path = './distilbert_text_classification_multiclass_512'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)

# Use the GPU when torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
model.eval()  # Evaluation mode for inference

# Label lookups in both directions, indexed by position in CATEGORY_MAP.
# NOTE(review): these indices must agree with the ones used during training — confirm.
category_to_int = dict((name, position) for position, name in enumerate(CATEGORY_MAP))
int_to_category = dict((position, name) for name, position in category_to_int.items())

# Function to predict category
def predict_category(text):
    """Classify ``text`` and return the name of its highest-scoring category.

    The text is tokenized with the module-level ``tokenizer`` (padded and
    truncated to 512 tokens), run through the module-level ``model`` on
    ``device``, and the argmax over the logits is translated to a category
    name through ``int_to_category``.

    Args:
        text: The text to classify. ``.item()`` is called on the argmax, so
            the model output must contain exactly one row.

    Returns:
        The ``int_to_category`` entry for the predicted class index.
    """
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Inputs have to live on the same device as the model
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    # Inference only, so skip gradient tracking
    with torch.no_grad():
        class_scores = model(**on_device).logits
    best_index = class_scores.argmax(dim=1).item()

    return int_to_category[best_index]

# Smoke test: classify one document from the Documents folder
DOCUMENTS_DIR = "Documents"  # Folder containing text documents
doc_id = "000115"  # Replace with an actual DocID from your dataset
doc_path = os.path.join(DOCUMENTS_DIR, f"{doc_id}.txt")

# Fail fast when the document is missing. Previously the read was skipped in
# that case, so the lines below either raised NameError on a fresh kernel or
# silently classified a stale `sample_text` left over from an earlier run.
if not os.path.isfile(doc_path):
    raise FileNotFoundError(f"Test document not found: {doc_path}")

with open(doc_path, "r", encoding="utf-8") as file:
    sample_text = file.read()

# extract_chunk comes from the Utils notebook; it is given the raw text plus
# the two settings loaded from the hyperparameter JSON.
sample_text = extract_chunk(sample_text, CHUNK_SIZE, SENTENCE_ALIGNMENT)
predicted_label = predict_category(sample_text)
print(f"Predicted Category: {predicted_label}")


Predicted Category: Philosophy


#### Evaluating the uploaded Hugging Face model

In [17]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import import_ipynb
from Utils import extract_chunk
import json
import os

# Parse BERT_best_model_parameters.json into a dict of settings
with open('BERT_best_model_parameters.json', 'r') as file:
    best_hyperparameters = json.loads(file.read())

# The two settings below are handed to extract_chunk for the sample document
CHUNK_SIZE, SENTENCE_ALIGNMENT = (
    best_hyperparameters["CHUNK_SIZE"],
    best_hyperparameters["SENTENCE_ALIGNMENT"],
)

# Ordered sequence of class names; position i is the label for class index i.
# This was previously a set literal, whose iteration order for strings varies
# between Python processes (hash randomisation), so the index -> name mapping
# built with enumerate() could change from one kernel start to the next.
# NOTE(review): the order below follows the original literal — confirm it
# matches the label indices the model was trained with.
CATEGORY_MAP = (
    'Philosophy',
    'Geography',
    'Social Sciences',
    'Politics',
)

# Fetch the tokenizer and classifier published under this Hugging Face model id
model_name = 'gaurinm30/distilbert_text_classification_multiclass_512'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# Use the GPU when torch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
model.eval()  # Evaluation mode for inference

# Label lookups in both directions, indexed by position in CATEGORY_MAP.
# NOTE(review): these indices must agree with the ones used during training — confirm.
category_to_int = dict((name, position) for position, name in enumerate(CATEGORY_MAP))
int_to_category = dict((position, name) for name, position in category_to_int.items())

# Function to predict category
def predict_category(text):
    """Return the category name the loaded model predicts for ``text``.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and
    ``int_to_category``: the text is tokenized (padded and truncated to 512
    tokens), moved to ``device``, scored by ``model``, and the index of the
    largest logit is looked up in ``int_to_category``.

    Args:
        text: The text to classify. ``.item()`` is called on the argmax, so
            the model output must contain exactly one row.

    Returns:
        The ``int_to_category`` entry for the predicted class index.
    """
    encoded = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

    # Model and inputs must share a device
    model_inputs = dict(
        (field, tensor.to(device)) for field, tensor in encoded.items()
    )

    # Gradient tracking is unnecessary for prediction
    with torch.no_grad():
        result = model(**model_inputs)
        winner = result.logits.argmax(dim=1)

    return int_to_category[winner.item()]

# Smoke test: classify one document from the Documents folder
DOCUMENTS_DIR = "Documents"  # Folder containing text documents
doc_id = "000115"  # Replace with an actual DocID from your dataset
doc_path = os.path.join(DOCUMENTS_DIR, f"{doc_id}.txt")

# Fail fast when the document is missing. Previously the read was skipped in
# that case, so the lines below either raised NameError on a fresh kernel or
# silently reused a `sample_text` left behind by an earlier cell run (which
# would already have been chunked once).
if not os.path.isfile(doc_path):
    raise FileNotFoundError(f"Test document not found: {doc_path}")

with open(doc_path, "r", encoding="utf-8") as file:
    sample_text = file.read()

# extract_chunk comes from the Utils notebook; it is given the raw text plus
# the two settings loaded from the hyperparameter JSON.
sample_text = extract_chunk(sample_text, CHUNK_SIZE, SENTENCE_ALIGNMENT)
predicted_label = predict_category(sample_text)
print(f"Predicted Category: {predicted_label}")


Predicted Category: Philosophy
