In [27]:
## import the libs ##
import os
import re
import pandas as pd 
import numpy as np
import sys, pathlib, fitz
import requests 

from nltk.corpus import stopwords
from tqdm import tqdm
from pandas import DataFrame
from loguru import logger


# import torch specs
import torch
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset
from torch import cuda
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix


# !pip install accelerate -U
# !pip install sentencepiece

In [28]:
# download pdf using url 
def download_pdf_from_url(file_url: str) -> None:
    
    # default vals
    save_path = "tuts_file"
    file_id = "temp"

    response = requests.get(file_url)   # get request
    if response.status_code == 200:
        # The content of the PDF file is in response.content
        # save the file in a pdf 
        full_path = os.path.join(save_path, file_id + ".pdf")

        with open(full_path, "wb") as pdf_file:
            pdf_file.write(response.content)
        print(f"PDF-ID {file_id} downloaded successfully.")
    else:
        print(f"Failed to download PDF-ID {file_id}. Status code: {response.status_code}")

    return full_path


# extract text function 
def extract_text_and_save(pdf_path):

    # extract text and save in .txt file
    with fitz.open(pdf_path) as doc:  # open document
        text = chr(12).join([page.get_text() for page in doc])
    logger.info(f"All text extracted, {pdf_path},  saving them....")

    # write as a binary file to support non-ASCII characters
    save_text_path = pdf_path.split(".")[0] + ".txt"
    pathlib.Path(save_text_path).write_bytes(text.encode())

    return save_text_path


# clean text function 
def clean_text(text_file):
    if text_file.endswith('.txt'):

        # Read the content of each text file
        with open(text_file, 'r') as file:
            text_content = file.read()

        sw = stopwords.words('english')

        text_content = text_content.replace("\n", " ").replace("•", "").lower()
        text = re.sub(r"[^a-zA-Z?.!,¿]+", " ", text_content)  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        text = re.sub(r"http\S+", "", text)  # Removing URLs 
        html = re.compile(r'<.*?>') 
        text = html.sub(r'', text)  # Removing html tags
        
        punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_'
        for p in punctuations:
            text = text.replace(p, '')  # Removing punctuations
        text = [word.lower() for word in text.split() if word.lower() not in sw]
        text = " ".join(text)  # removing stopwords        
        return {"cleaned_text": text, "status": True}


## If you have to check by passing PDF, run the below cell

In [29]:
# loading via pdf 

pdf_path = r"tuts_file/P-14.pdf"

extracted_text_path = extract_text_and_save(pdf_path)
cleaned_text = clean_text(extracted_text_path)['cleaned_text']

print(cleaned_text)

[32m2023-12-02 17:43:25.638[0m | [1mINFO    [0m | [36m__main__[0m:[36mextract_text_and_save[0m:[36m29[0m - [1mAll text extracted, tuts_file/P-14.pdf,  saving them....[0m


copyright , everlight rights reserved release date dec issue dle rev wwweverlightcom lamp sugd features choice various viewing angles available tape reel reliable robust pb free product remain within rohs compliant version description series specially designed applications requiring higher brightness led lamps available different colors, intensities applications tv set monitor telephone computer ver release date approved datasheet lamp sugd copyright , everlight rights reserved release date dec issue dle rev wwweverlightcom device selection guide chip materials emitted color resin color ingan brilliant green green diffused absolute maximum ratings ta parameter symbol rating unit continuous forward current peak forward current duty khz ifp reverse voltage vr v power dissipation pd mw operating temperature topr storage temperature tstg soldering temperature tsol sec electro optical characteristics ta parameter symbol min typ max unit condition luminous intensity iv mcd viewing angle deg 

## If you have to check by passsing URL, run the below cell

In [30]:
url = "https://www.sstlighting.com/CatalogPages/SR50-RC-Undercabinet-Spread.pdf"

pdf_path = download_pdf_from_url(url)
extracted_text_path = extract_text_and_save(pdf_path)
cleaned_text = clean_text(extracted_text_path)['cleaned_text']

print(cleaned_text)



[32m2023-12-02 17:43:40.578[0m | [1mINFO    [0m | [36m__main__[0m:[36mextract_text_and_save[0m:[36m29[0m - [1mAll text extracted, tuts_file/temp.pdf,  saving them....[0m


PDF-ID temp downloaded successfully.
wall cove v dimming smart lighting compatible external driver wwwsstlightingcom quail crest place, lawrence, ks cabinet luminaire product name input power factor power consumption dimming material weight size mounting lifetime warranty optics operating range beam angle equivalence sunstar rc vdc w v dimming anodized extruded aluminum, glass lens lb kg diameter, length recessed mount ceiling, surface mount, mounted conduit , hrs ten year limited warranty milled aluminum primary optic, clear glass lens aluminum reflector, optional translucent diffuse c c traditional w halogen, recessed building material without rough kit damp location ip product led module size housing current housing color beam angle color temperature diffuser sr rc sr x module chips rc recessed sv silver k diffused x module chips bk black k k k cct tuning rgb rgb sr x rc sr x rc e product led module size input current power consumed lumen output fixture efficacy sr x w lm lm w sr x 

## You can now load the fine-tuned model and pass the input into the tokenizer and check the results 
- make sure you have all the deps installed [ torch, cuda, torchvision, cuda etc]

- before you run this cell, make sure to download the weights and model file from gdrive [ Link -> https://drive.google.com/drive/folders/13pUf1mRJHguTQPud3cQeflce5eHKBciL?usp=sharing ]

- You can either load it via your local downloaded files from gdrive or You can load directly from HF Hub [ Recommended way ]
- Checkout the Space here - https://huggingface.co/luci007/LightingData-Bert-Finetuned/tree/main

In [31]:
## loading the model from HF hub ## 

device = torch.cuda.is_available()
print("Device --> ", device)
tokenizer_1 = BertTokenizer.from_pretrained('luci007/LightingData-Bert-Finetuned')
model_1 = BertForSequenceClassification.from_pretrained("luci007/LightingData-Bert-Finetuned").to(device)

# the text which you got extracted and cleaned from the above cell
text = cleaned_text

inputs = tokenizer_1(text, padding = True, truncation = True, return_tensors='pt').to('cuda')
outputs = model_1(**inputs)
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
output_class = predictions.argmax().item()

print("Pred class from model -> ", output_class)



  return torch._C._cuda_getDeviceCount() > 0


Device -->  False


: 