In [2]:
import json
# Relative path of the dataset directory; train.json is read from it further down.
db_path = "../dataset"

In [29]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this notebook relies on, in a fixed order.
for _resource in (
    "stopwords",
    "punkt",
    "punkt_tab",
    "wordnet",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(_resource)

# English stop words as a set for fast membership checks.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


# Train

In [3]:
# Load the training split. The encoding is pinned to UTF-8 so decoding does not
# depend on the platform's locale default.
with open(f"{db_path}/train.json", "r", encoding="utf-8") as fp:
    train = json.load(fp)

In [None]:
# Show the first training example as a (premise, hypothesis, label) tuple.
tuple(train[field]['0'] for field in ('premise', 'hypothesis', 'label'))

('Pluto rotates once on its axis every 6.39 Earth days;',
 'Earth rotates on its axis once times in one day.',
 'neutral')

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer, LancasterStemmer
from nltk import pos_tag
import numpy as np

class Config:
    """Preprocessing switches and shared NLTK helpers read by ``Embedder``.

    Every setting is a class attribute, so either the class itself or an
    instance of it can be handed to ``Embedder``.
    """

    # Stem each token with ``stemmer`` when enabled.
    stem = False
    stemmer = PorterStemmer()

    # Lemmatize each token with ``lemmatizer`` when enabled.
    lemmatize = False
    lemmatizer = WordNetLemmatizer()

    # Split the sentence into word tokens; otherwise the whole sentence is
    # treated as a single token.
    tokenize = True

    # When enabled, every substring matching ``regex`` is deleted from each
    # token, then empty tokens and stop words are dropped.
    clean_text = True
    regex = r"[^a-z0-9\-\s]"

    # Part-of-speech tagging of the final token list.
    pos = False
    # staticmethod keeps the tagger a plain function even when it is looked up
    # on a Config *instance*; a bare function stored here would be bound as a
    # method and receive the instance as its first argument.
    pos_tagger = staticmethod(pos_tag)

    # Lower-case tokens during cleaning.
    lower = True

    # The right-hand side still resolves to the module-level NLTK ``stopwords``
    # corpus reader, because the class attribute of the same name is not bound
    # until this assignment completes.
    stopwords = set(stopwords.words('english'))


class Embedder():
    """Turn a raw sentence into a list of processed tokens.

    The pipeline is driven by a config object exposing the attributes
    ``tokenize``, ``lemmatize``, ``stem``, ``clean_text``, ``regex``, ``lower``,
    ``pos``, ``pos_tagger``, ``lemmatizer``, ``stemmer`` and ``stopwords``.
    """

    def __init__(self, cfg: "Config") -> None:
        """Store the config and cache the helpers it carries.

        Args:
            cfg: Configuration class or instance providing the attributes
                listed in the class docstring.
        """
        self.cfg = cfg

        self.lemmatizer = cfg.lemmatizer
        self.stemmer = cfg.stemmer
        self.stop_words = cfg.stopwords

    def embed(self, sentence: str) -> list[str]:
        """Tokenize and normalize ``sentence`` according to the config.

        Steps run in this order, each only when its switch is on:
        tokenize -> lemmatize -> stem -> clean (lower-case, regex strip,
        drop empty tokens and stop words) -> POS-tag.

        Args:
            sentence: Raw input text.

        Returns:
            The processed tokens. When ``cfg.pos`` is on, the value is whatever
            ``cfg.pos_tagger`` returns for the token list instead.
        """
        if self.cfg.tokenize:
            tokens = nltk.word_tokenize(sentence)
        else:
            # Without tokenization the whole sentence is a single token.
            tokens = [sentence]

        if self.cfg.lemmatize:
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

        if self.cfg.stem:
            tokens = [self.stemmer.stem(token) for token in tokens]

        if self.cfg.clean_text:
            # The default pattern only whitelists lower-case letters. When
            # lower-casing is off, match case-insensitively so upper-case
            # letters are kept instead of being stripped from every token.
            flags = 0 if self.cfg.lower else re.IGNORECASE

            cleaned = []
            for token in tokens:
                if self.cfg.lower:
                    token = token.lower()
                token = re.sub(self.cfg.regex, '', token, flags=flags)
                # Compare case-insensitively so capitalised stop words are
                # also removed when lower-casing is off.
                if token and token.lower() not in self.stop_words:
                    cleaned.append(token)
            tokens = cleaned

        if self.cfg.pos:
            tokens = self.cfg.pos_tagger(tokens)

        return tokens

    def embed_data(self, json):
        """Print and return the number of entries under ``json['label']``.

        Args:
            json: Mapping with a ``'label'`` entry that supports ``len``. The
                parameter name shadows the ``json`` module inside this method.

        Returns:
            The number of labels.
        """
        # A distinct local name is required: assigning to ``len`` would make it
        # a local variable and the call on the right-hand side would raise
        # UnboundLocalError.
        n_examples = len(json['label'])
        print(n_examples)
        return n_examples


import torch
import torch.nn as nn
from torch.utils.data import Dataset,  DataLoader


class MedicalDataset(Dataset):
    """Dataset over a DataFrame yielding ``(embedding, label)`` tensor pairs.

    Reads the ``clean_text`` and ``condition_label`` columns of ``df`` and looks
    items of ``clean_text`` up in ``embed_model.wv``.
    """

    def __init__(self, df, embed_model):
        """Keep the frame and embedding model; record the longest ``clean_text``."""
        self.df = df
        self.embed_model = embed_model

        # Largest per-row length found in the 'clean_text' column.
        text_lengths = df['clean_text'].str.len()
        self.max_length = text_lengths.max()

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the stacked vectors and zero-based label for row ``index``.

        Items of ``clean_text`` that are missing from ``embed_model.wv`` are
        skipped. The stored label is shifted down by one.
        """
        row_text = self.df["clean_text"].iloc[index]

        vectors = []
        for word in row_text:
            if word in self.embed_model.wv:
                vectors.append(self.embed_model.wv[word])
        stacked = np.array(vectors)

        raw_label = self.df["condition_label"].iloc[index]
        zero_based = int(raw_label) - 1

        return (
            torch.tensor(stacked, dtype=torch.float32),
            torch.tensor(zero_based, dtype=torch.long),
        )

        



# Build an embedder from the Config class itself (not an instance), then show the
# first training premise next to the tokens produced for it.
embedder = Embedder(Config)
train['premise']['0'], embedder.embed(train['premise']['0'])

('Pluto rotates once on its axis every 6.39 Earth days;',
 ['pluto', 'rotates', 'axis', 'every', '639', 'earth', 'days'])

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

class inferenceDataset(Dataset)