In [44]:
import pandas as pd

import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [45]:
dataset = pd.read_csv('data/water_eletricity_data.csv')

texts = dataset['Description']
labels = dataset['Class']

texts_train, texts_test, labels_train, labels_test = train_test_split(
    texts, labels, test_size=0.3, random_state=42, stratify=labels
)

le = LabelEncoder()
labels_train_encoded = le.fit_transform(labels_train)
labels_test_encoded = le.transform(labels_test)

In [46]:
vectorizer = TfidfVectorizer(
    max_features=100,   # Keep only the 100 most important "words"/"biphrases"
    ngram_range=(1, 2), # Considers both unigrams and bigrams
    min_df=1    # unigrams/bigrams must appear at least once
)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

In [47]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
labels_train_encoded = le.fit_transform(labels_train)
labels_test_encoded = le.transform(labels_test)

model = xgb.XGBClassifier(
    n_jobs=-1,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X_train, labels_train_encoded)

predictions = model.predict(X_test)
probabilities = model.predict_proba(X_test)

# Decode predictions back to original labels
predictions_original = le.inverse_transform(predictions)

accuracy = accuracy_score(labels_test, predictions_original)
print(f"Accuracy: {accuracy*100}%")

print("\nClassification Report:")
print(classification_report(labels_test_encoded, predictions, target_names=le.classes_))

test_set_dataframe = pd.DataFrame({
    'Description': texts_test,
    'Label': labels_test,
    'Prediction': predictions_original,
    'Confidence_electricity': probabilities[:, 0] * 100,
    'Confidence_other': probabilities[:, 1] * 100, 
    'Confidence_water': probabilities[:, 2] * 100,
})

test_set_dataframe['Correct'] = test_set_dataframe['Label'] == test_set_dataframe['Prediction']

test_set_dataframe

Accuracy: 99.59349593495935%

Classification Report:
              precision    recall  f1-score   support

 electricity       1.00      1.00      1.00        84
       other       0.99      1.00      0.99        78
       water       1.00      0.99      0.99        84

    accuracy                           1.00       246
   macro avg       1.00      1.00      1.00       246
weighted avg       1.00      1.00      1.00       246



Unnamed: 0,Description,Label,Prediction,Confidence_electricity,Confidence_other,Confidence_water,Correct
165,Grain dryer maintenance,other,other,1.359645,98.458244,0.182107,True
307,Electricity payment - Production site,electricity,electricity,99.661949,0.297730,0.040316,True
517,Water utility charges - Manufacturing,water,water,0.058759,0.107787,99.833458,True
586,Dental services,other,other,0.772789,99.123703,0.103505,True
116,Bank service charges,other,other,2.373619,97.308456,0.317917,True
...,...,...,...,...,...,...,...
790,Power bill - Production center,electricity,electricity,99.291321,0.636774,0.071913,True
387,Water supply - Office operations,water,water,0.043821,0.362913,99.593262,True
548,Monthly electricity payment - Office,electricity,electricity,99.661949,0.297730,0.040316,True
489,Energy bill - Processing facility,electricity,electricity,98.880333,1.018507,0.101156,True


In [48]:
actual_dataset = pd.read_excel('data/input_dataset_v.1.xlsx')

actual_dataset = actual_dataset[['Description']]

X_actual = vectorizer.transform(actual_dataset['Description'])

actual_dataset['predictions'] = model.predict(X_actual)
actual_dataset

Unnamed: 0,Description,predictions
0,"Invoice #45-A, Agricultural diesel",1
1,Electricity Bill - Offices,0
2,Payment for legal services,1
3,"Purchase Ammonium Nitrate 10,000tons",1
4,Electricity Bill - Irrigation Pumps,0
5,Tractor T-100 Maintenance,1
6,"Invoice #88-B, Diesel fuel",1
7,Office stationery purchase,1
