In [7]:
import pandas as pd

import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [32]:
# Labelled training data: a free-text description and a 0/1 target per row.
dataset = pd.read_csv('data/train_data.csv')

# Feature column and target column, kept as separate Series.
texts, labels = dataset['Description'], dataset['Electricity_bill']

# Hold out 30% of the rows for evaluation.
(
    texts_train,
    texts_test,
    labels_train,
    labels_test,
) = train_test_split(
    texts,
    labels,
    test_size=0.3,
    stratify=labels,   # keep the class ratio the same in both partitions
    random_state=42,   # fixed seed so the split is reproducible
)

In [33]:
# Represent each description as TF-IDF weights over word unigrams and bigrams.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),  # single words and adjacent word pairs
    min_df=1,            # no rarity cut-off: a term seen in a single document is kept
    max_features=100,    # cap the vocabulary at the 100 terms with the highest corpus frequency
)

# Vocabulary and IDF weights are learned from the training texts only;
# the held-out texts are projected onto that same fitted vocabulary.
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

In [41]:
# Gradient-boosted tree classifier trained on the TF-IDF features.
model = xgb.XGBClassifier(
    n_jobs=-1,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    random_state=42
)
model.fit(X_train, labels_train)

# Hard class predictions for the held-out split.
predictions = model.predict(X_test)

# Per-class probabilities, one column per class.
probabilities = model.predict_proba(X_test)

accuracy = accuracy_score(labels_test, predictions)
print(f"Accuracy: {accuracy*100}%")
print("Classification Report:")
print(classification_report(labels_test, predictions))

# Per-row view of the held-out split: input text, true label, predicted label
# and the class probabilities expressed as percentages.
# NOTE(review): columns 0 and 1 of `probabilities` are taken to be classes 0 and 1
# (i.e. model.classes_ == [0, 1]) — confirm if the label encoding ever changes.
test_set_dataframe = pd.DataFrame({
    'Description': texts_test,
    'Label': labels_test,
    'Prediction': predictions,
    'Confidence_Class_0': probabilities[:, 0]*100,  # Confidence for class 0
    'Confidence_Class_1': probabilities[:, 1]*100,  # Confidence for class 1
})
test_set_dataframe['Correct'] = test_set_dataframe['Label'] == test_set_dataframe['Prediction']

# Confidence of the predicted class: take the class-1 percentage where the
# prediction is 1, otherwise the class-0 percentage. Done column-wise with
# Series.where instead of a row-by-row apply(axis=1) lambda.
test_set_dataframe['Confidence'] = test_set_dataframe['Confidence_Class_1'].where(
    test_set_dataframe['Prediction'] == 1,
    test_set_dataframe['Confidence_Class_0'],
)

test_set_dataframe

Accuracy: 100.0%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       114
           1       1.00      1.00      1.00       120

    accuracy                           1.00       234
   macro avg       1.00      1.00      1.00       234
weighted avg       1.00      1.00      1.00       234



Unnamed: 0,Description,Label,Prediction,Confidence_Class_0,Confidence_Class_1,Correct,Confidence
200,Electric power bill - Main office,1,1,0.076950,99.923050,True,99.923050
34,Seed purchase - Corn hybrid B,0,0,98.302475,1.697530,True,98.302475
511,Microfilm services,0,0,98.763542,1.236459,True,98.763542
703,Stamping services,0,0,98.763542,1.236459,True,98.763542
668,Chemical analysis,0,0,98.302475,1.697530,True,98.302475
...,...,...,...,...,...,...,...
72,Fertilizer - Phosphate blend,0,0,98.302475,1.697530,True,98.302475
309,Export documentation,0,0,98.302475,1.697530,True,98.302475
456,Electricity Bill - Break room,1,1,0.711763,99.288239,True,99.288239
774,Monthly power bill - Production site,1,1,1.105237,98.894760,True,98.894760


In [23]:
# Score the real (unlabelled) input file with the already-fitted vectorizer and model.
actual_dataset = pd.read_excel('data/input_dataset_v.1.xlsx')

# Keep only the text column. The explicit .copy() makes this an independent
# frame, so adding the 'predictions' column below does not write into a
# column-subset selection of the loaded sheet (SettingWithCopyWarning).
actual_dataset = actual_dataset[['Description']].copy()

# Reuse the vocabulary/IDF weights learned on the training texts; do not refit here.
X_actual = vectorizer.transform(actual_dataset['Description'])

actual_dataset['predictions'] = model.predict(X_actual)
actual_dataset

Unnamed: 0,Description,predictions
0,"Invoice #45-A, Agricultural diesel",0
1,Electricity Bill - Offices,1
2,Payment for legal services,0
3,"Purchase Ammonium Nitrate 10,000tons",0
4,Electricity Bill - Irrigation Pumps,1
5,Tractor T-100 Maintenance,0
6,"Invoice #88-B, Diesel fuel",0
7,Office stationery purchase,0
