In [2]:

import nltk
nltk.download('stopwords')
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
from transformers import AdamW
from torch.optim import lr_scheduler
from tqdm import tqdm
import re
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/guest1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cpu


In [4]:



REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    # Replace numbers with optional decimal part with a space
    text = re.sub(r'\d+(\.\d+)?', ' ', text)

    # Replace multiple newlines with a single newline
    text = re.sub(r'\n+', '\n', text)

    # Replace any non-word characters (except whitespace, periods, commas, and dashes) with a space
    text = re.sub(r'[^\w\s.,-]', ' ', text)

    # Remove spaces before punctuation marks (., -)
    text = re.sub(r'\s+([.,-])', r'\1', text)

    # Replace multiple spaces with a single space and trim leading/trailing spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # Replace underscores and spaces with a single space
    text = re.sub(r'[_\s]+', ' ', text)

    # text = re.sub(r'[#+\*]', ' ', text)

    # Replace dashes and spaces with a single space
    text = re.sub(r'[-\s]+', ' ', text)

    text = re.sub(r'[\.\,]+', ' ', text )

    text = re.sub(r'\b\w{1,2}\b', ' ', text)

    text = text.lower() # lowercase text

    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.

    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing.

    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text

    # Remove extra spaces that may be left behind
    cleaned_text = re.sub(r'\s+', ' ', text).strip()

    return cleaned_text



def chunk_text(document_text):

    l_total = []
    l_parcial = []
    if len(document_text.split())//150 >0:
        n = len(document_text.split())//150
    else:
        n = 1
    for w in range(n):
        if w == 0:
            l_parcial = document_text.split()[:200]
            l_total.append(" ".join(l_parcial))
        else:
            l_parcial = document_text.split()[w*150:w*150 + 200]
            l_total.append(" ".join(l_parcial))
    return l_total


In [5]:

# Load data
text_dir = 'data/train_texts'
texts = []
labels = []
ct = []

train_df = pd.read_csv('processed_train_data.csv', sep='|')
label_encoder = LabelEncoder()
train_df['target_label'] = label_encoder.fit_transform(train_df['target'])


# Iterate through files and prepare data
for file in os.listdir(text_dir):
    pdf_name = file.replace('.txt', '.pdf')
    if not train_df.loc[train_df['file_name'] == pdf_name].empty:
        label = train_df.loc[train_df['file_name'] == pdf_name, 'target_label'].values[0]
        with open(os.path.join(text_dir, file), 'r') as f:
            cleaned_text = clean_text(f.read())
            ct.append([file, cleaned_text])
            chunks = chunk_text(cleaned_text)
            # print(chunks)
            # print(len(chunks))
            for chunk in chunks:
                texts.append(chunk)
                labels.append(label)

print(len(texts))
print(len(labels))



3078
3078


In [15]:

# Get the classes (original labels)
classes = label_encoder.classes_
print(classes)
# Create label2id and id2label mappings
label2id = {label: idx for idx, label in enumerate(classes)}
id2label = {idx: label for idx, label in enumerate(classes)}

# Print the mappings
print("label2id:", label2id)
print("id2label:", id2label)


['cable' 'fuses' 'lighting' 'others']
label2id: {'cable': 0, 'fuses': 1, 'lighting': 2, 'others': 3}
id2label: {0: 'cable', 1: 'fuses', 2: 'lighting', 3: 'others'}


In [7]:
print(len(ct))
print(len(os.listdir('data/train_texts')))

919
1138


In [8]:
len(labels)

3078

In [9]:

print(labels[35])

3


In [10]:
for label,text in zip(labels[:3],texts[:3]):
  print(f'{label} - {text}')


1 - littelfuse inc specifications subject change without notice revised axial lead cartridge fuses time lag series series slo blo fuse indicating option agency approvals agency agency file number ampere range cartridge form anbk nbk nbk axial leaded form anbk nbk nbk afeatures electrical characteristics series available cartridge axial lead form wide range lead forming dimension packaging options accordance csa nmx standard rohs compliant lead free tripped fuse indicating option add suffix part number fuses available board washable additional sealing process add suffix part number sleeved fuse option available contact littelfuse additional information ampere rating opening time hours minimum hour maximum seconds minimum seconds maximumrohs pbps description littelfuse series slo blo fuses available size cartridge axial lead form offer tripped fuse indicating option offer features designed meet rigorous telecom industry requirements series product ordered tripped fuse indicating option s

In [11]:
from torch.utils.data import DataLoader, Dataset
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Define Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

        print(f"Dataset initialized with {len(self.texts)} texts and {len(self.labels)} labels.")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Debug print statements
        if idx >= len(self.texts):
            raise IndexError(f"Index {idx} is out of bounds for length {len(self.texts)}")

        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Convert the label to a tensor
        label_tensor = torch.tensor(label, dtype=torch.long)

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': label_tensor
        }

# Initialize tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

# Create Dataset and DataLoader
dataset = TextDataset(texts, labels, tokenizer, max_length=512)

# Access a single sample
sample = dataset[20]  # Accessing the first sample
input_ids = sample['input_ids']
attention_mask = sample['attention_mask']
curr_labels = sample['label']

# Print details about the single sample
# print(f"Input IDs: {input_ids}")
print(f"Shape of input IDs: {input_ids.shape}")
print(f"Decoded Input IDs: {tokenizer.decode(input_ids)}")

# print(f"Attention Mask: {attention_mask}")
print(f"Shape of Attention Mask: {attention_mask.shape}")

print(f"Label: {curr_labels}")
print(f"Shape of Label: {curr_labels.shape}")

# Create DataLoader
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dataset initialized with 3078 texts and 3078 labels.
Shape of input IDs: torch.Size([512])
Decoded Input IDs: [CLS] options jumper cable optional series mjclc jumper cable mjclc jumper cable mjclc jumper cable mjclc inch jumper cable mjclc inch jumper cable carries line voltage power two units run larger spacing corner turn units required jumpers may plugged together create longer jumper every jumper jumper connection calculate foot fixture apply proper voltage drop female connector male connector female connector male connector telescopic cable female connector male connector female connector male connector telescopic cable female connector male connector female connector male connector telescopic cable female connector male connector female connector male connector telescopic cablemcv asymmetric symmetric mounting mcve extrusion mcv asymmetric symmetric mcfmc fixed mount bracket drawing detailspage mcv stwh stclr marklighting com serv acuity brands lighting inc rights reserved reserv

In [12]:

s = 'phoenixcontact'

for f, t in ct:
    if s in t:
        print(f)


others_397.txt
others_1033.txt
others_542.txt
others_1036.txt
others_280.txt
others_108.txt
others_730.txt
others_447.txt
others_319.txt
others_131.txt
others_938.txt
others_1058.txt
others_667.txt
others_275.txt
others_513.txt
others_499.txt
others_260.txt
others_248.txt
others_311.txt
others_1063.txt
others_475.txt
others_104.txt
others_996.txt
others_203.txt
others_777.txt
others_993.txt
others_32.txt
others_762.txt
others_601.txt


In [13]:
def train(model, dataloader, optimizer, scheduler, num_epochs):
    # Move the model to the device
    model.to(device)  # Add this line

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        # Print average loss for the epoch
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

    print("Training complete.")
    model_save_path = 'bert_model_checkpoint_512_max_len_text_chunked.pth'
    torch.save(model.state_dict(), model_save_path)
    print('checkpoint saved!')



In [14]:
from transformers import AdamW, get_linear_schedule_with_warmup


num_epochs = 1

optimizer = AdamW(model.parameters(), lr=3e-5)


# Scheduler with warm-up
total_steps = len(dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)


train(model, dataloader, optimizer, scheduler, num_epochs = 1)

Epoch 1/1: 100%|██████████| 193/193 [1:44:37<00:00, 32.52s/it]


Epoch 1/1 - Average Loss: 0.3035
Training complete.
checkpoint saved!
