In [3]:
from zipfile import ZipFile
with ZipFile("archive (7).zip", 'r') as zip:
    zip.extractall()

In [4]:
import pandas as pd

# Load dataset
df = pd.read_csv("company-document-text.csv")

df

Unnamed: 0,text,label,word_count
0,order id 10718 shipping details ship name k...,ShippingOrder,120
1,invoice order id 10707 customer id arout ord...,invoice,66
2,order id 10448 shipping details ship name r...,ShippingOrder,96
3,invoice order id 11068 customer id queen ord...,invoice,68
4,order id 10656 shipping details ship name g...,ShippingOrder,109
...,...,...,...
2671,order id 10326 shipping details ship name b...,ShippingOrder,111
2672,purchase orders order id order date customer n...,purchase Order,39
2673,invoice order id 10460 customer id folko ord...,invoice,59
2674,stock report for 2018-01 category meat poult...,report,46


In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# اعمل نسخة list/set من كلمات ال stopwords
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = re.sub("[^a-zA-Z]", " ", text)  # إزالة الرموز
    text = " ".join(
        [stemmer.stem(word) for word in text.split() if word.lower() not in stop_words]
    )
    return text

In [6]:
def clean_text(text):
    text = re.sub("[^a-zA-Z]", " ", text)  # إزالة الرموز
    text = " ".join(
        [stemmer.stem(word) for word in text.split() if word.lower() not in stop_words]
    )
    return text

In [7]:
df['clean_text']= df['text'].apply(clean_text)

In [8]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])


In [9]:
from datasets import Dataset
dataset = Dataset.from_pandas(df[['clean_text', 'label']])

In [10]:
dataset

Dataset({
    features: ['clean_text', 'label'],
    num_rows: 2676
})

In [11]:
split_dataset=dataset.train_test_split(test_size=0.2,seed=42)
train_texts=split_dataset['train']['clean_text']
test_texts=split_dataset['test']['clean_text']

In [12]:
from transformers import AutoTokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [13]:
def tokenize_function(examples):
    return tokenizer(examples["clean_text"], padding="max_length", truncation=True,max_length=128)

In [14]:
train_dataset = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 2676/2676 [00:00<00:00, 11149.98 examples/s]


In [15]:
test_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 2676/2676 [00:00<00:00, 13021.87 examples/s]


In [16]:
# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [17]:
train_dataset[0]

{'label': tensor(0),
 'input_ids': tensor([  101,  2344,  8909,  2911,  6987,  2911,  2171,  1047,  9152, 25394,
          2818, 29032,  2911,  4769,  5003, 12083,  9050, 16344,  2911, 25022,
          3775, 16426,  2911,  2555,  2530,  9944,  2361,  2911, 10690,  3642,
          2911,  4175,  3089,  2446,  2072,  7661,  6987,  7661,  8909, 12849,
          2368,  7661,  2171,  1047,  9152, 25394,  2818, 29032, 12666,  2063,
          6987, 12666,  2063,  2171, 16660,  6895,  4830,  6767, 12798,  2911,
          4842,  6987,  2911,  4842,  8909,  2911,  4842,  2171,  7349,  2121,
          2911,  2344,  6987,  2344,  3058,  2911,  3058,  4031,  4031, 10861,
          6499,  2158,  5403,  3995,  2474,  9220,  2050, 24110,  3775,  3775,
          3131,  3976,  2561,  4031,  6643,  2615, 24221, 24110,  3775,  3775,
          3131,  3976,  2561,  4031,  1999, 17802,  2094,  9033,  3363, 24110,
          3775,  3775,  3131,  3976,  2561,  4031, 16985,  2102,  8740, 10514,
         26775, 24

In [19]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load pre-trained BERT model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

# Define metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.003723,1.0,1.0,1.0,1.0
2,No log,0.00167,1.0,1.0,1.0,1.0
3,0.053300,0.00136,1.0,1.0,1.0,1.0


TrainOutput(global_step=504, training_loss=0.052934169314712995, metrics={'train_runtime': 208.1014, 'train_samples_per_second': 38.577, 'train_steps_per_second': 2.422, 'total_flos': 528073370652672.0, 'train_loss': 0.052934169314712995, 'epoch': 3.0})

In [26]:

model_save_path = "model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)



('model/tokenizer_config.json',
 'model/special_tokens_map.json',
 'model/vocab.txt',
 'model/added_tokens.json',
 'model/tokenizer.json')

In [30]:
import joblib
import os

folder = "label_enocder"
os.makedirs(folder, exist_ok=True) 

label_encoder_path = os.path.join(folder, "label_encoder.pkl")

# حفظ LabelEncoder
joblib.dump(label_encoder, label_encoder_path)
print("LabelEncoder saved successfully!")


LabelEncoder saved successfully!


In [36]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_name = "model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict(text):
    text = clean_text(text)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    with torch.no_grad():
        inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        output = model(**inputs)
        prediction = torch.argmax(output.logits, dim=-1).item()
        return label_encoder.inverse_transform([prediction])[0]

In [37]:
input="invoice order id  10707 customer id  arout order date  2017-10-16 customer details  contact name  thomas hardy address  120 hanover sq. city  london postal code  wa1 1dp country  uk phone   171  555-7788 fax   171  555-6750 product details  product id product name quantity unit price 55 pâté chinois 21 24 0 57 ravioli angelo 40 19 5 70 outback lager 28 15 0 totalprice 1704 0 page 1"
print(predict(input))

invoice


In [38]:
# Example usage
input_text = "order id  10345 shipping details  ship name  quick-stop ship address  taucherstraße 10 ship city  cunewalde ship region  western europe ship postal code  1307 ship country  germany customer details  customer id  quick customer name  quick-stop employee details  employee name  andrew fuller shipper details  shipper id  2 shipper name  united package order details  order date  2016-11-04 shipped date  2016-11-11 products  -------------------------------------------------------------------------------------------------- product  northwoods cranberry sauce quantity  70 unit price  32 0 total  2240 0 -------------------------------------------------------------------------------------------------- product  teatime chocolate biscuits quantity  80 unit price  7 3 total  584 0 -------------------------------------------------------------------------------------------------- product  singaporean hokkien fried mee quantity  9 unit price  11 2 total  100 8 total price  total price  2924 8"
print(predict(input_text))

ShippingOrder
