# Imports

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import joblib
import os
from datetime import datetime

# Load training data

In [4]:
# Read the transaction table from the CSV next to the notebook.
# The split cell below reads its 'purpose' and 'category' columns.
df = pd.read_csv(filepath_or_buffer="transaction_data_small.csv")

# Train-test split

In [5]:
# Inputs are the 'purpose' column, targets are the 'category' column.
purpose_texts = df['purpose']
category_labels = df['category']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# shuffle (and therefore the reported metrics) reproducible between runs.
# NOTE(review): no `stratify=` is passed, so per-category proportions in the
# two partitions are left to chance — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    purpose_texts,
    category_labels,
    random_state=42,
    test_size=0.2,
)

# Create and fit model

In [6]:
# Two-stage text classifier: CountVectorizer turns each text into token
# counts, which are then fed to a logistic-regression classifier.
vectorizer_step = CountVectorizer()
classifier_step = LogisticRegression(max_iter=1000)  # solver capped at 1000 iterations

model = Pipeline(
    steps=[
        ('count_vectorizer', vectorizer_step),
        ('clf', classifier_step),
    ]
)

# Fit on the training partition only; kept as the last expression so the
# cell still displays the fitted pipeline.
model.fit(X_train, y_train)

# Evaluate accuracy

In [7]:
# Predict categories for the held-out texts.
predictions = model.predict(X_test)

# Per-category precision/recall/F1 table and the overall accuracy; both names
# are reused by the save cell further down.
classification_rep = classification_report(y_true=y_test, y_pred=predictions)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

print(f'Accuracy: {accuracy:.2f}')
print(
    '\nClassification Report:\n',
    classification_rep,
)

Accuracy: 0.54

Classification Report:
                                      precision    recall  f1-score   support

               Arznei- & Heilmittel       0.70      0.25      0.37        28
           Arztbesuch / Krankenhaus       0.24      0.44      0.31        18
                       Bankgebühren       0.86      0.73      0.79        26
                      Barauszahlung       0.61      0.48      0.54        23
                      Bareinzahlung       0.91      0.87      0.89        23
                          Bausparen       0.41      0.68      0.51        22
  Bekleidung / Schuhe / Accessoires       0.18      0.25      0.21        16
     Berufsunfähigkeitsversicherung       0.41      0.45      0.43        20
 Bücher / Zeitungen / Zeitschriften       0.33      0.21      0.26        14
                       Büromaterial       0.50      0.43      0.46        14
                           Drogerie       0.50      0.44      0.47        25
   Festgeld / Tagesgeld / Sparkonto

# Save model and evaluation

In [8]:
# Persist the fitted pipeline and its evaluation summary in a new directory
# named after the current local time.
#
# The time part uses hyphens instead of colons: ':' is not a legal character
# in Windows file/directory names, so a "%H:%M:%S" pattern makes os.makedirs
# fail on that platform.
current_datetime = datetime.now().strftime("%d.%m.%Y_%H-%M-%S")

directory_path = f"model_{current_datetime}"

# Raises FileExistsError if the directory already exists, so an earlier
# export is never silently overwritten.
os.makedirs(directory_path)

joblib.dump(model, os.path.join(directory_path, 'trained_model.joblib'))

# The classification report contains non-ASCII category labels
# (e.g. "Bankgebühren"), so the encoding is pinned to UTF-8 rather than
# left to the platform's locale default.
evaluation_path = os.path.join(directory_path, 'model_evaluation.txt')
with open(evaluation_path, 'w', encoding='utf-8') as file:
    file.write(f'Accuracy: {accuracy:.2f}\n\n')
    file.write('Classification Report:\n')
    file.write(classification_rep)