In [8]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [9]:
XGBOOST_TEST_DATASET_CSV = 'data/cash_flow_statements/training_dataset_classified.csv'
CONFIDENCE_THRESHOLD = 0.75
FALLBACK_LABEL = "Other" 

In [None]:
# Load training dataset
dataset = pd.read_csv(XGBOOST_TEST_DATASET_CSV)
texts = dataset['Description']
labels = dataset['Class']

# Split data
texts_train, texts_test, labels_train, labels_test = train_test_split(
    texts, labels, test_size=0.3, stratify=labels
)

# Encode labels
le = LabelEncoder()
labels_train_encoded = le.fit_transform(labels_train)
labels_test_encoded = le.transform(labels_test)

In [11]:
vectorizer = TfidfVectorizer(
    max_features=100,   # Keep only the 100 most important "words"/"biphrases"
    ngram_range=(1, 2), # Considers both unigrams and bigrams
    min_df=1    # unigrams/bigrams must appear at least once
)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

In [12]:
model = xgb.XGBClassifier(
    n_jobs=-1,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X_train, labels_train_encoded)


predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)

max_confidence = np.max(probabilities, axis=1)

predictions_with_fallback = np.where(
    max_confidence >= CONFIDENCE_THRESHOLD,
    predictions,
    -1
)

predictions_original = []
for pred in predictions_with_fallback:
    if pred == -1:
        predictions_original.append(FALLBACK_LABEL)
    else:
        predictions_original.append(le.inverse_transform([pred])[0])

predictions_original = np.array(predictions_original)

In [13]:
accuracy = accuracy_score(labels_test, predictions_original)
print(f"Accuracy: {accuracy*100}%")
print("\nClassification Report:")
print(classification_report(labels_test_encoded, predictions, target_names=le.classes_))


confidence_columns = {}
for idx, class_name in enumerate(le.classes_):
    confidence_columns[f'Confidence_{class_name}'] = probabilities[:, idx] * 100

test_set_dataframe = pd.DataFrame({
    'Description': texts_test,
    'Label': labels_test,
    'Prediction': predictions_original,
    'Max_Confidence': max_confidence * 100,
    **confidence_columns
})


test_set_dataframe['Correct'] = test_set_dataframe['Label'] == test_set_dataframe['Prediction']
test_set_dataframe['Is_Unknown'] = test_set_dataframe['Prediction'] == FALLBACK_LABEL


test_set_dataframe

Accuracy: 98.25242718446601%

Classification Report:
                precision    recall  f1-score   support

   Electricity       1.00      1.00      1.00       660
       Heating       0.99      1.00      0.99       660
         Other       0.96      0.96      0.96       450
Waste Disposal       0.99      1.00      0.99       660
         Water       1.00      0.98      0.99       660

      accuracy                           0.99      3090
     macro avg       0.99      0.99      0.99      3090
  weighted avg       0.99      0.99      0.99      3090



Unnamed: 0,Description,Label,Prediction,Max_Confidence,Confidence_Electricity,Confidence_Heating,Confidence_Other,Confidence_Waste Disposal,Confidence_Water,Correct,Is_Unknown
2344,Steam Generation Gas -for Manufacturing Plant,Heating,Heating,99.535492,0.051666,99.535492,0.402883,0.004971,0.004997,True,False
4927,Employee Salaries - Month End,Other,Other,95.767418,2.333370,1.448998,95.767418,0.224520,0.225691,True,True
690,Municipal Water Charge -in European Operations,Water,Water,99.878136,0.014062,0.008732,0.097715,0.001353,99.878136,True,False
3008,Monthly Water Usage -,Water,Water,99.694794,0.014036,0.008716,0.281099,0.001351,99.694794,True,False
1599,Municipal Water Charge -,Water,Water,99.590332,0.014022,0.008707,0.385588,0.001349,99.590332,True,False
...,...,...,...,...,...,...,...,...,...,...,...
9752,Gas Utility Bill -in Monthey Facility,Heating,Heating,99.918518,0.051865,99.918518,0.019611,0.004991,0.005017,True,False
9553,General Waste Collection -,Waste Disposal,Waste Disposal,99.600594,0.014309,0.008886,0.374834,99.600594,0.001384,True,False
171,Natural Gas Supply -for Year-End,Heating,Heating,99.926384,0.051869,99.926384,0.011738,0.004991,0.005017,True,False
4649,Chemical Waste Disposal -for South America,Waste Disposal,Waste Disposal,99.921570,0.014355,0.008914,0.053770,99.921570,0.001388,True,False


In [14]:
test = pd.read_excel('data/cash_flow_statements/test.xlsx')
test['Description'] = test['Description'].astype('str')
X_test = vectorizer.transform(test['Description'])

predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)

max_confidence = np.max(probabilities, axis=1).astype(float)

predictions_with_fallback = np.where(
    max_confidence < CONFIDENCE_THRESHOLD,
    -1,
    predictions
)

predictions_original = []
for pred in predictions_with_fallback:
    if pred == -1:
        predictions_original.append(FALLBACK_LABEL)
    else:
        predictions_original.append(le.inverse_transform([pred])[0])

test['Prediction'] = np.array(predictions_original)
test['Correct']=(test['Prediction']==test['Class'])
# test.loc[test['GL_Account']=='Sales of Products']['Amount_EUR'].sum()
test

Unnamed: 0,Transaction_ID,Date,Description,Vendor,GL_Account,Amount_EUR,Class,Prediction,Correct
0,TRX001,15/01/25,Herbicide Product Sales - Europe Region,Various Distributors,Sales of Products,289000,Other,Other,True
1,TRX002,15/01/25,Fungicide Product Sales - North America,Various Distributors,Sales of Products,350000,Other,Other,True
2,TRX003,15/01/25,Seeds Product Sales - Latin America,Various Distributors,Sales of Products,304000,Other,Other,True
3,TRX004,16/01/25,Electricity Supply - Basel Site,Swiss Grid AG,Utilities Expense,45000,Electricity,Electricity,True
4,TRX005,16/01/25,Natural Gas Supply - Manufacturing,Gazprom Energy,Utilities Expense,38000,Heating,Heating,True
...,...,...,...,...,...,...,...,...,...
143,TRX144,28/12/25,Employee Salaries - December,Payroll Department,Salaries Expense,310000,Other,Other,True
144,TRX145,30/12/25,Year-End Waste Disposal,Comprehensive Waste Services,Waste Disposal Expense,23000,Waste Disposal,Waste Disposal,True
145,TRX146,31/12/25,Royalty Income - Trait Licenses,Licensee Companies,Royalty Income,28000,Other,Other,True
146,TRX147,31/12/25,Professional Solutions Sales - Annual,Pest Management Distributors,Sales of Products,54000,Other,Other,True
