In [123]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [124]:
# Path to the labelled CSV that is read below and then split into train and
# test portions. NOTE(review): despite the "TEST" in the name, this points at
# training_dataset.csv; the name is kept because later cells reference it.
XGBOOST_TEST_DATASET_CSV = 'data/cash_flow_statements/training_dataset.csv'
# Minimum top-class probability (0-1) a prediction needs to be accepted.
CONFIDENCE_THRESHOLD = 0.75
# Label assigned to rows whose top-class probability is below the threshold.
FALLBACK_LABEL = "Other" 

In [125]:
# Read the labelled examples: free-text descriptions and their class names.
dataset = pd.read_csv(XGBOOST_TEST_DATASET_CSV)
texts, labels = dataset['Description'], dataset['Class']

# Hold out 30% of the rows for evaluation. Stratifying on the label keeps the
# class proportions equal in both portions; the fixed seed makes the split
# reproducible.
texts_train, texts_test, labels_train, labels_test = train_test_split(
    texts,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)

# Map class names to integer ids. The encoder learns the classes from the
# training portion only and is then applied unchanged to both portions.
le = LabelEncoder().fit(labels_train)
labels_train_encoded = le.transform(labels_train)
labels_test_encoded = le.transform(labels_test)



In [126]:
# Turn each description into a sparse TF-IDF feature vector.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # use single words and adjacent word pairs as terms
    max_features=100,    # cap the vocabulary at the 100 highest-frequency terms
    min_df=1,            # a term only needs to occur in one document to be kept
)

# The vocabulary and IDF weights are learned from the training texts only;
# the held-out texts are projected onto that same vocabulary.
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

In [127]:
# Train a gradient-boosted tree classifier on the TF-IDF features, using the
# integer-encoded class ids as targets.
model = xgb.XGBClassifier(
    n_jobs=-1,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X_train, labels_train_encoded)

# Score the held-out portion: predicted class ids plus per-class probabilities.
predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)

# Highest class probability per row, used as the confidence of the prediction.
max_confidence = np.max(probabilities, axis=1)

# Rows whose confidence is below the threshold are not trusted.
is_confident = max_confidence >= CONFIDENCE_THRESHOLD

# Encoded predictions with -1 marking the rows that fall back.
predictions_with_fallback = np.where(is_confident, predictions, -1)

# Decode every class id in one vectorized call (instead of one
# inverse_transform call per row) and substitute the fallback label for the
# low-confidence rows. astype(str) keeps the result a string array.
predictions_original = np.where(
    is_confident,
    le.inverse_transform(predictions),
    FALLBACK_LABEL,
).astype(str)

In [128]:
# Evaluate the thresholded predictions (class name, or FALLBACK_LABEL for
# low-confidence rows) against the true class names of the held-out portion.
accuracy = accuracy_score(labels_test, predictions_original)
print(f"Accuracy: {accuracy*100}%")
print("\nClassification Report:")
# Report on the same thresholded predictions that the accuracy above and the
# table below use, so all three describe one predictor. Rows are restricted to
# the known classes; a fallback prediction counts as a miss for its true class.
print(classification_report(labels_test, predictions_original, labels=le.classes_))

# One percentage column per class; column idx of `probabilities` corresponds
# to le.classes_[idx].
confidence_columns = {
    f'Confidence_{class_name}': probabilities[:, idx] * 100
    for idx, class_name in enumerate(le.classes_)
}

# Per-row view of the held-out portion: text, true label, final prediction,
# and confidences expressed in percent.
test_set_dataframe = pd.DataFrame({
    'Description': texts_test,
    'Label': labels_test,
    'Prediction': predictions_original,
    'Max_Confidence': max_confidence * 100,
    **confidence_columns
})

# Flag rows where the final prediction matches the label, and rows that fell
# back to FALLBACK_LABEL.
test_set_dataframe['Correct'] = test_set_dataframe['Label'] == test_set_dataframe['Prediction']
test_set_dataframe['Is_Unknown'] = test_set_dataframe['Prediction'] == FALLBACK_LABEL

test_set_dataframe

Accuracy: 100.0%

Classification Report:
                precision    recall  f1-score   support

   Electricity       1.00      1.00      1.00       750
       Heating       1.00      1.00      1.00       750
Waste Disposal       1.00      1.00      1.00       750
         Water       1.00      1.00      1.00       750

      accuracy                           1.00      3000
     macro avg       1.00      1.00      1.00      3000
  weighted avg       1.00      1.00      1.00      3000



Unnamed: 0,Description,Label,Prediction,Max_Confidence,Confidence_Electricity,Confidence_Heating,Confidence_Waste Disposal,Confidence_Water,Correct,Is_Unknown
2533,Scrap Metal Hauling - Pallet Disposal Monthly Fee,Waste Disposal,Waste Disposal,99.807381,0.136786,0.045295,99.807381,0.010538,True,False
7800,Water Charges - Laboratory Annual Fee,Water,Water,99.946541,0.036715,0.013162,0.003579,99.946541,True,False
9122,Kwh Usage - Data Center (December),Electricity,Electricity,99.671814,99.671814,0.218123,0.059315,0.050749,True,False
9374,Grid Charge - Production Line 3 (Q3),Electricity,Electricity,99.664215,99.664215,0.223173,0.060688,0.051924,True,False
7233,Scrap Metal Hauling - Industrial Debris Contai...,Waste Disposal,Waste Disposal,99.807381,0.136786,0.045295,99.807381,0.010538,True,False
...,...,...,...,...,...,...,...,...,...,...
2676,Scrap Metal Hauling - Pallet Disposal Bi-Weekl...,Waste Disposal,Waste Disposal,99.807381,0.136786,0.045295,99.807381,0.010538,True,False
660,Natural Gas Supply - Warehouse Gas Meter Reading,Heating,Heating,99.825462,0.147809,99.825462,0.014408,0.012328,True,False
3842,Power Consumption - North Warehouse (Q3),Electricity,Electricity,99.639389,99.639389,0.239673,0.065175,0.055763,True,False
8196,Waste Disposal - Recyclable Materials Monthly Fee,Waste Disposal,Waste Disposal,99.944862,0.038239,0.013708,99.944862,0.003189,True,False


In [130]:
# Classify the transactions in test.xlsx with the fitted vectorizer and model,
# then compare each prediction with the workbook's 'Class' column.
#
# NOTE(review): this cell rebinds X_test, predictions, probabilities,
# max_confidence and predictions_original, which the evaluation cells above
# also define. Re-running those cells after this one will mix the two data
# sets; re-run the notebook top to bottom instead.
test = pd.read_excel('data/cash_flow_statements/test.xlsx')
# Force text so the vectorizer never receives non-string cells.
test['Description'] = test['Description'].astype('str')
X_test = vectorizer.transform(test['Description'])

predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)

# Highest class probability per row, used as the confidence of the prediction.
max_confidence = np.max(probabilities, axis=1).astype(float)

# Rows whose confidence is below the threshold fall back to FALLBACK_LABEL.
is_uncertain = max_confidence < CONFIDENCE_THRESHOLD

# Encoded predictions with -1 marking the rows that fall back.
predictions_with_fallback = np.where(is_uncertain, -1, predictions)

# Decode every class id in one vectorized call (instead of one
# inverse_transform call per row) and substitute the fallback label.
predictions_original = np.where(
    is_uncertain,
    FALLBACK_LABEL,
    le.inverse_transform(predictions),
).astype(str)

test['Prediction'] = predictions_original
test['Correct'] = test['Prediction'] == test['Class']
test

Unnamed: 0,Transaction_ID,Date,Description,Vendor,GL_Account,Amount_EUR,Class,Prediction,Correct
0,TRX001,15/01/25,Herbicide Product Sales - Europe Region,Various Distributors,Sales of Products,289000,Other,Other,True
1,TRX002,15/01/25,Fungicide Product Sales - North America,Various Distributors,Sales of Products,350000,Other,Other,True
2,TRX003,15/01/25,Seeds Product Sales - Latin America,Various Distributors,Sales of Products,304000,Other,Other,True
3,TRX004,16/01/25,Electricity Supply - Basel Site,Swiss Grid AG,Utilities Expense,45000,Electricity,Electricity,True
4,TRX005,16/01/25,Natural Gas Supply - Manufacturing,Gazprom Energy,Utilities Expense,38000,Heating,Heating,True
...,...,...,...,...,...,...,...,...,...
143,TRX144,28/12/25,Employee Salaries - December,Payroll Department,Salaries Expense,310000,Other,Other,True
144,TRX145,30/12/25,Year-End Waste Disposal,Comprehensive Waste Services,Waste Disposal Expense,23000,Waste Disposal,Waste Disposal,True
145,TRX146,31/12/25,Royalty Income - Trait Licenses,Licensee Companies,Royalty Income,28000,Other,Other,True
146,TRX147,31/12/25,Professional Solutions Sales - Annual,Pest Management Distributors,Sales of Products,54000,Other,Other,True
