In [1]:
! pip install transformers
! pip install torch
! pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import pandas as pd

import torch
import importlib  
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
import os
import numpy as np
import pandas as pd
import torch
import time

import warnings

# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", FutureWarning)

# Single configuration object read by Encoder, DataLoad, ModelBert and
# TestPredictor further down in this cell.
config = {
    # Batch size handed to the inference DataLoader.
    "dataload": {"batch_size": 12},
    # Category name -> integer label index.
    "classes": {
        "Data": 0,
        "Documentation": 1,
        "Mission and Instruments": 2,
        "image": 3,
        "Software and tools": 4,
        "Training and Education": 5,
    },
    # Keyword lists the Encoder matches (case-sensitively, as whole
    # whitespace-separated tokens) when it shortens long texts.
    "encoder": {
        "image_keyword": [
            "image", "images", "photojournal", "view", "Images", "Related",
        ],
        # Some entries repeat (e.g. "version", "method", "code"); repeats do
        # not matter for the membership tests these lists are used in.
        "software_keyword": [
            "software", "tools", "SDK", "API", "toolkit",
            "library", "application", "code", "class", "package",
            "interface", "annotation", "index", "tree", "constructor",
            "method", "version", "model", "attributes", "methods",
            "seed", "version", "command", "log", "undo",
            "fit", "method", "disk", "settings", "save",
            "programming", "object", "code", "script", "language",
            "scripting", "embedding", "function", "C++", "Fortran",
            "bug", "installation", "syntax", "interface", "architecture",
            "prompt", "parameter", "graphics", "comment", "wrapper",
            "terminal", "plot", "show", "save", "clear",
            "default", "input", "output", "query", "component",
            "add", "remove", "framework", "instruction", "pan",
            "view", "browse", "button", "coordinates", "position",
            "zoom", "dropdown", "search", "git", "github",
        ],
        "mission_keyword": [
            "mission", "instruments", "trajectory", "meter", "spacecraft", "project",
            "objectives", "measurement", "aircraft", "camera", "campaign", "flight",
            "radar", "test", "mobility", "curiosity", "suspension", "investigate",
        ],
        "training_keyword": ["training", "education"],
    },
    # Class names are looked up dynamically on the module named "module_name";
    # "model_type" is the identifier passed to from_pretrained().
    "model_parameters": {
        "model": "AutoModelForSequenceClassification",
        "tokenizer": "AutoTokenizer",
        "model_type": "Rajashreee/nasa-document-classifier",
        "module_name": "transformers",
        "num_labels": 6,
        "device": "cpu",
        "optimizer": "AdamW",
        "scheduler": "get_linear_schedule_with_warmup",
    },
}


class Encoder:
    """
    Shortens long texts to a keyword-centred excerpt.

    Texts with at most ``MAX_WORDS`` whitespace-separated words are passed
    through unchanged. Longer texts are reduced to a character slice that
    starts shortly before the first configured keyword found in the text, or
    to a slice around the middle of the text when no keyword occurs.
    """

    # Texts with more words than this are sliced instead of passed through.
    MAX_WORDS = 400
    # Number of characters kept in front of the earliest keyword occurrence.
    LEAD_CHARS = 50
    # Maximum length (in characters) of a keyword-anchored slice.
    SLICE_CHARS = 1000
    # Half-width (in characters) of the fallback slice around the text middle.
    FALLBACK_HALF_WIDTH = 512

    def __init__(self, config, data):
        """
        Initializes an Encoder object.

        Args:
            config (dict): Configuration with an "encoder" sub-dictionary holding
                the lists "image_keyword", "software_keyword", "mission_keyword"
                and "training_keyword".
            data: The data to be encoded. ``encoder()`` iterates it with
                ``iterrows()`` and reads the "text" and "class" fields of each row.
        """
        encoder_cfg = config["encoder"]
        self.image_keyword = encoder_cfg["image_keyword"]
        self.software_keyword = encoder_cfg["software_keyword"]
        self.mission_keyword = encoder_cfg["mission_keyword"]
        self.training_keyword = encoder_cfg["training_keyword"]
        self.data = data
        self.encoded_data = pd.DataFrame()

    @classmethod
    def from_dict(cls, cfg: dict, data):
        """
        Creates an Encoder object from a configuration dictionary and data.

        Args:
            cfg (dict): Configuration dictionary (see ``__init__``).
            data: The data to be encoded (see ``__init__``).

        Returns:
            Encoder: An instance of the Encoder class.
        """
        return cls(cfg, data)

    def generate_text_slice(self, word_positions, text_whole):
        """
        Cuts a slice out of ``text_whole`` anchored at the earliest keyword.

        Args:
            word_positions (dict): Maps each found keyword to a non-empty list
                of ``(start, end)`` character offsets.
            text_whole (str): The text to slice.

        Returns:
            str: Up to ``SLICE_CHARS`` characters starting ``LEAD_CHARS``
            characters before the earliest keyword (clamped to the text start).

        Raises:
            ValueError: If ``word_positions`` (or one of its lists) is empty.
        """
        # Tuples compare by their first element first, so this is the
        # occurrence with the smallest start offset.
        min_position = min(min(positions) for positions in word_positions.values())
        start_index = max(min_position[0] - self.LEAD_CHARS, 0)
        # Slicing clamps at the end of the string by itself; capping the end at
        # len(text) - 1 (as done previously) silently dropped the last character.
        return text_whole[start_index:start_index + self.SLICE_CHARS]

    def extract_text(self, text):
        """
        Extracts a slice of ``text`` based on where configured keywords occur.

        Keywords are matched case-sensitively against whole whitespace-separated
        tokens.

        Args:
            text (str): The input text to extract the slice from.

        Returns:
            str: A slice anchored at the first keyword, or - when the text
            contains no keyword - the characters within ``FALLBACK_HALF_WIDTH``
            of the middle of the text.
        """
        keywords = set(
            self.image_keyword
            + self.software_keyword
            + self.mission_keyword
            + self.training_keyword
        )
        word_positions = {}
        cursor = 0
        for word in text.split():
            # Search from the end of the previous token: only whitespace lies
            # between two tokens, so the first hit is exactly this token and
            # can never be a match inside the previous one.
            start = text.find(word, cursor)
            cursor = start + len(word)
            if word in keywords:
                word_positions.setdefault(word, []).append((start, cursor))

        if not word_positions:
            mid = len(text) // 2
            # Clamp at 0: a negative start would be interpreted as an offset
            # from the end of the string and return only the tail of the text.
            start_pos = max(mid - self.FALLBACK_HALF_WIDTH, 0)
            end_pos = mid + self.FALLBACK_HALF_WIDTH  # in terms of characters
            return text[start_pos:end_pos]
        return self.generate_text_slice(word_positions, text)

    def encoder(self):
        """
        Encodes every row of ``self.data``.

        Returns:
            pandas.DataFrame: A frame with the columns "text" (shortened where
            necessary, see ``text_encoder``) and "class" (copied unchanged).
            The frame is also stored in ``self.encoded_data``.
        """
        text_list, class_list = [], []
        for _, row in self.data.iterrows():
            text_list.append(self.text_encoder(row["text"]))
            class_list.append(row["class"])
        # Build a fresh frame so repeated calls never collide with columns
        # left over from an earlier call.
        self.encoded_data = pd.DataFrame({"text": text_list, "class": class_list})
        return self.encoded_data

    def text_encoder(self, text_blob):
        """
        Encodes a single text.

        Args:
            text_blob (str): The text to encode.

        Returns:
            str: ``text_blob`` itself when it has at most ``MAX_WORDS`` words,
            otherwise the slice produced by ``extract_text``.
        """
        if len(text_blob.split()) <= self.MAX_WORDS:
            return text_blob
        return self.extract_text(text_blob)




class DataLoad:
    """Wraps tokenized inference inputs in a sequential torch DataLoader."""

    def __init__(self, batch_size=10):
        """
        Initializes a new instance of the class.

        Args:
            batch_size (int): The batch size for data loading. Default is 10.

        Attributes:
            batch_size (int): The batch size for data loading.
            inference_dataset: The inference dataset; ``None`` until
                ``dataset()`` has been called.
        """
        self.inference_dataset = None
        self.batch_size = batch_size

    @classmethod
    def from_dict(cls, cfg: dict):
        """
        Creates a DataLoad object from a dictionary.

        Args:
            cfg (dict): Configuration dictionary; the batch size is read from
                ``cfg["dataload"]["batch_size"]``.

        Returns:
            DataLoad: An instance of the DataLoad class. When the configuration
            carries no batch size, the constructor default is used.
        """
        batch_size = (cfg.get("dataload") or {}).get("batch_size")
        if batch_size is None:
            # Do not forward None: it would override the constructor default
            # and end up as DataLoader(batch_size=None).
            return cls()
        return cls(batch_size=batch_size)

    def dataset(self, input_ids, attention_masks):
        """
        Stores ``input_ids`` and ``attention_masks`` as a TensorDataset.

        Args:
            input_ids (torch.Tensor): Input ids of the inference samples.
            attention_masks (torch.Tensor): Attention masks of the inference
                samples; must have the same first dimension as ``input_ids``
                (a TensorDataset requirement).

        Returns:
            None
        """
        self.inference_dataset = TensorDataset(input_ids, attention_masks)

    def dataloader(self):
        """
        Creates the data loader for inference.

        Returns:
            DataLoader: Yields ``(input_ids, attention_masks)`` batches in the
            original sample order.

        Raises:
            ValueError: If ``dataset()`` has not been called yet.
        """
        if self.inference_dataset is None:
            raise ValueError(
                "No inference dataset available; call dataset(input_ids, attention_masks) first."
            )
        # Sequential sampling keeps predictions aligned with the input order.
        inference_sampler = SequentialSampler(self.inference_dataset)
        return DataLoader(
            self.inference_dataset,
            sampler=inference_sampler,
            batch_size=self.batch_size,
        )



class ModelBert:
    """
    Loads a pre-trained sequence-classification model and its tokenizer.

    The model class, tokenizer class, providing module and checkpoint
    identifier are all read from ``config["model_parameters"]``.
    """

    def __init__(self, config, num_labels=6, device="cpu"):
        """
        Args:
            config (dict): Configuration with a "model_parameters" sub-dictionary
                holding "module_name", "model", "tokenizer", "model_type" and
                "num_labels".
            num_labels (int): Number of labels, stored on the instance.
            device (str): Name of the torch device the model is moved to.
        """
        self.config = config["model_parameters"]
        self.num_labels = num_labels
        self.device = torch.device(device)
        self.model = None
        self.tokenizer = None
        self.state_dict = None

    @classmethod
    def from_dict(cls, cfg: dict):
        """
        Creates a ModelBert object from a dictionary.

        Args:
            cfg (dict): Configuration dictionary; "num_labels" and "device" are
                read from ``cfg["model_parameters"]``.

        Returns:
            ModelBert: An instance of the ModelBert class.
        """
        model_parameters = cfg.get("model_parameters")
        return cls(
            cfg,
            num_labels=model_parameters.get("num_labels"),
            device=model_parameters.get("device"),
        )

    def make_model(self):
        """
        Instantiates the pre-trained model and tokenizer named in the config.

        The tokenizer is stored in ``self.tokenizer`` and the model (moved to
        ``self.device``) in ``self.model``.

        Returns:
            The loaded model.
        """
        # Dynamically import the module that provides the model classes.
        transformers = importlib.import_module(self.config["module_name"])
        model_class = getattr(transformers, self.config["model"])
        # The tokenizer class has its own config entry; resolving it from the
        # "model" entry would load a second copy of the model instead.
        tokenizer_class = getattr(transformers, self.config["tokenizer"])
        # Load the tokenizer and model
        self.tokenizer = tokenizer_class.from_pretrained(self.config["model_type"])
        self.model = model_class.from_pretrained(
            self.config["model_type"], num_labels=self.config["num_labels"]
        ).to(self.device)
        return self.model


class TestPredictor:
    """
    Tokenizes texts and turns model outputs into category names.
    """

    # Maximum number of tokens per text handed to the model.
    MAX_TOKENS = 500

    def __init__(self, config):
        """
        Args:
            config (dict): Configuration with "classes" (category name ->
                label index) and "model_parameters" ("device", "module_name",
                "tokenizer", "model_type").
        """
        self.config = config
        model_parameters = config["model_parameters"]
        self.device = torch.device(model_parameters["device"])
        self.dataframe = pd.DataFrame()  # columns=['text','class']
        self.classes = config["classes"]
        transformers = importlib.import_module(model_parameters["module_name"])
        # Resolve the tokenizer class from the "tokenizer" entry; using the
        # "model" entry here would load the full model in place of a tokenizer.
        tokenizer_class = getattr(transformers, model_parameters["tokenizer"])
        # Loaded once and reused by both tokenize_* methods.
        self.tokenizer = tokenizer_class.from_pretrained(model_parameters["model_type"])
        self.pdf_lists = []
        self.image_lists = []

    @classmethod
    def from_dict(cls, cfg: dict):
        """
        Creates a TestPredictor object from a dictionary.

        Args:
            cfg (dict): Configuration dictionary (see ``__init__``).

        Returns:
            TestPredictor: An instance of the TestPredictor class.
        """
        return cls(cfg)

    def convert_labels_to_class(self, value):
        """
        Converts a label value to its corresponding class/category.

        Parameters:
            value (int): The label value to be converted.

        Returns:
            str: The category whose label equals ``value``, or ``None`` when no
            category matches.
        """
        for category, val in self.classes.items():
            if val == value:
                return category
        return None

    def process_test_data(self, urls):
        """
        Builds encoded test data for the given URLs.

        NOTE(review): this method relies on a ``Preprocessor`` class that is
        not defined in the visible part of this notebook; it must be provided
        elsewhere before this method can run. Its ``preprocessed_features()``
        result is assumed to be a 3-tuple whose first element has a "soup"
        column - confirm against the Preprocessor implementation.

        Parameters:
            urls (list): The URLs of the test data.

        Returns:
            tuple: ``(encoded_data, pdf_lists, image_lists)`` where
            ``encoded_data`` is the DataFrame produced by ``Encoder.encoder()``.
        """
        self.dataframe["links"] = urls
        self.dataframe["class"] = [3 for i in urls]  # any random class
        processor = Preprocessor.from_dict(self.config, self.dataframe)
        (
            self.dataframe,
            self.pdf_lists,
            self.image_lists,
        ) = processor.preprocessed_features()
        self.dataframe["text"] = self.dataframe["soup"]
        encoder = Encoder.from_dict(self.config, self.dataframe)
        encoded_data = encoder.encoder()
        return encoded_data, self.pdf_lists, self.image_lists

    def tokenize_test_data(self, encoded_data):
        """
        Tokenizes the "text" column of the encoded test data.

        Every text is truncated/padded to ``MAX_TOKENS`` tokens so that all
        rows share one shape.

        Parameters:
            encoded_data (DataFrame): The encoded test data with a "text" column.

        Returns:
            Tuple[Tensor, Tensor]: The input IDs and attention masks, one row
            per text.
        """
        sentences = encoded_data["text"].tolist()
        encoded = self.tokenizer(
            sentences,
            add_special_tokens=True,
            truncation=True,
            max_length=self.MAX_TOKENS,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
        )
        return encoded["input_ids"], encoded["attention_mask"]

    def tokenize_text_data(self, text_blob):
        """
        Tokenizes a single text.

        Parameters:
            text_blob (str): The text to tokenize.

        Returns:
            Tuple[Tensor, Tensor]: The input IDs and attention mask of the
            text, truncated to at most ``MAX_TOKENS`` tokens.
        """
        inputs = self.tokenizer(
            text_blob,
            truncation=True,
            max_length=self.MAX_TOKENS,
            return_tensors="pt",
        )
        return inputs["input_ids"], inputs["attention_mask"]

    def predict_test_data(self, inference_dataloader, loaded_model):
        """
        Predicts the category of every sample yielded by the dataloader.

        Parameters:
            inference_dataloader: Iterable of ``(input_ids, attention_mask)``
                batches.
            loaded_model: The model used for prediction; its output must expose
                ``logits``.

        Returns:
            list: One category name per sample, in dataloader order.
        """
        loaded_model.eval()
        preds_position = []
        for batch in inference_dataloader:
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask = (t.to(self.device) for t in batch)
            with torch.no_grad():
                # Forward pass, calculate logit predictions.
                result = loaded_model(
                    b_input_ids,
                    token_type_ids=None,
                    attention_mask=b_input_mask,
                    return_dict=True,
                )
            # Take the argmax of the raw logits: a sigmoid is monotonic, so it
            # cannot change the winner, but it can saturate large logits to
            # the same value and thereby create artificial ties.
            preds_position.extend(torch.argmax(result.logits, dim=1).tolist())
        # Respective categories for all the samples
        return [self.convert_labels_to_class(position) for position in preds_position]
    

def batch_predictions(text_lists):
    """
    Predicts a category for every text in ``text_lists``.

    Uses the module-level ``config`` for the encoder, tokenizer, data loader
    and model.

    Args:
        text_lists (list): The texts to classify.

    Returns:
        dict: Maps each input text to its predicted category name. Identical
        texts share one entry. Empty input yields an empty dict.
    """
    text_lists = list(text_lists)
    # Defined up front so every return path has a result (previously the
    # name was unbound whenever there was nothing to predict).
    prediction = {}
    if not text_lists:
        return prediction

    dataframe = pd.DataFrame(
        {
            "class": [3] * len(text_lists),  # placeholder; the Encoder only copies it
            "text": text_lists,
        }
    )
    encoded_data = Encoder.from_dict(config, dataframe).encoder()
    if len(encoded_data) == 0:
        return prediction

    predictor = TestPredictor.from_dict(config)
    input_ids, attention_masks = predictor.tokenize_test_data(encoded_data)
    loader = DataLoad.from_dict(config)
    loader.dataset(input_ids, attention_masks)
    inference_dataloader = loader.dataloader()
    loaded_model = ModelBert.from_dict(config).make_model()
    categories = predictor.predict_test_data(inference_dataloader, loaded_model)
    # Predictions come back in input order, so they pair up positionally.
    prediction.update(zip(text_lists, categories))
    return prediction

def single_prediction(text_blob: str):
    """
    Predicts the category of a single text.

    Uses the module-level ``config`` for the encoder, tokenizer and model.

    Args:
        text_blob (str): The text to classify.

    Returns:
        str: The predicted category name (``None`` if the predicted label has
        no entry in ``config["classes"]``).
    """
    encoder = Encoder.from_dict(config, pd.DataFrame())
    # Long texts are shortened first; the shortened text - not the raw
    # input - is what must be tokenized.
    encoded_text = encoder.text_encoder(text_blob)
    predictor = TestPredictor.from_dict(config)
    input_ids, attention_masks = predictor.tokenize_text_data(encoded_text)
    model = ModelBert.from_dict(config)
    loaded_model = model.make_model()
    loaded_model.eval()
    with torch.no_grad():
        outputs = loaded_model(
            input_ids=input_ids.to(model.device),
            attention_mask=attention_masks.to(model.device),
        )
    # One text in, one row of logits out: take the index of its largest logit.
    label = int(torch.argmax(outputs.logits, dim=-1).item())
    return predictor.convert_labels_to_class(label)


In [7]:
# Classify two sample texts in one call and show the text -> category mapping.
sample_texts = [
    "This data talks about software and tools needed for developing a package for data visualization",
    "Direct- and Remote-sensing Instruments Direct-sensing instruments, also called contact science instruments, register characteristics of phenomena in their immediate vicinity.The heavy ion counter that flew on Galileo was a direct sensing instrument. It registered the characteristics of ions in the spacecraft's vicinity that actually entered the instrument.",
]
result = batch_predictions(sample_texts)
print(result)

{'This data talks about software and tools needed for developing a package for data visualization': 'Documentation', "Direct- and Remote-sensing Instruments Direct-sensing instruments, also called contact science instruments, register characteristics of phenomena in their immediate vicinity.The heavy ion counter that flew on Galileo was a direct sensing instrument. It registered the characteristics of ions in the spacecraft's vicinity that actually entered the instrument.": 'Mission and Instruments'}


In [8]:
# Classify one sample text and show its predicted category.
sample_text = "Most instruments only receive and process existing light, particles, or other phenomena, and they are said to be passive. Typical of this type would be an imaging instrument viewing a planet that is illuminated by Sunlight, or a magnetometer measuring existing magnetic fields."
result = single_prediction(sample_text)
print(result)

Mission and Instruments
