# FinSort Training Notebook

This notebook demonstrates the training process for the FinSort transaction categorization model.

## Steps:
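1. Load training data
2. Clean transactions
3. Split into train/test sets and vectorize text (TF-IDF)
4. Train model (logistic regression)
5. Evaluate performance (classification report and macro F1)
6. Generate and save confusion matrix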

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Put the parent of the current working directory at the front of sys.path so
# the local `finsort` package can be imported from this notebook.
sys.path.insert(0, os.path.dirname(os.getcwd()))
from finsort.cleaner import clean_transaction

print("Imports successful!")

## 1. Load Training Data

In [None]:
# Read the training CSV; the path is relative to the notebook's working
# directory (../data/finsort_train.csv).
data_path = os.path.join('..', 'data', 'finsort_train.csv')
df = pd.read_csv(data_path)

# Short summary of what was loaded: row count, then the column names.
print(
    f"Loaded {len(df)} training samples",
    f"Columns: {df.columns.tolist()}",
    f"\nFirst few rows:",
    sep="\n",
)
# Bare last expression so the notebook renders the preview as a rich table.
df.head()

## 2. Clean Transactions

In [None]:
# Run the project's `clean_transaction` helper over every raw transaction
# string and store the result in a new 'cleaned' column next to the original.
df['cleaned'] = df['transaction'].apply(clean_transaction)

# Show the first ten raw/cleaned pairs side by side for a quick visual check.
print("Sample cleaned transactions:")
print(df.loc[:, ['transaction', 'cleaned']].head(10))

## 3. Vectorize Text

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the tag
# column and uses a fixed seed so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned'],
    df['tag'],
    stratify=df['tag'],
    test_size=0.2,
    random_state=42,
)

print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")

# TF-IDF over unigrams and bigrams, capped at 5000 features.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that fitted vectorizer so no test text influences it.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

print(f"\nVectorized feature matrix shape: {X_train_vec.shape}")
print(f"Number of features: {X_train_vec.shape[1]}")

## 4. Train Model

In [None]:
# Fit a logistic-regression classifier on the TF-IDF training matrix.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression(random_state=42, max_iter=500).fit(X_train_vec, y_train)

print("Model training complete!")
print(f"Model classes: {model.classes_}")

## 5. Generate Classification Report

In [None]:
# Predict tags for the held-out test split.
y_pred = model.predict(X_test_vec)

# Compute both metrics first, then print them in the same order as before:
# the per-class report followed by the macro-averaged F1 score.
report = classification_report(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print("Classification Report:")
print(report)
print(f"\nMacro F1 Score: {macro_f1:.4f}")

## 6. Generate and Save Confusion Matrix

In [None]:
# Build one explicit label ordering and use it for both the matrix and the
# tick labels. Without `labels=`, confusion_matrix() orders its rows/columns by
# the sorted union of the labels found in y_test and y_pred; taking the tick
# labels from y_test alone therefore stops lining up with the matrix whenever
# the model predicts a class that does not occur in the test split.
labels = sorted(set(y_test) | set(y_pred))
cm = confusion_matrix(y_test, y_pred, labels=labels)

# Create confusion matrix visualization (rows = true label, columns = predicted)
plt.figure(figsize=(10, 8))
sns.heatmap(cm,
            annot=True,
            fmt='d',  # cells hold integer counts
            cmap='Blues',
            xticklabels=labels,
            yticklabels=labels,
            cbar_kws={'label': 'Count'})
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()

# Save confusion matrix to the reports directory, creating it if needed.
# The figure is saved before plt.show() is called.
reports_dir = os.path.join('..', 'reports')
os.makedirs(reports_dir, exist_ok=True)
output_path = os.path.join(reports_dir, 'confusion_matrix.png')
plt.savefig(output_path, dpi=150, bbox_inches='tight')
print(f"Confusion matrix saved to: {output_path}")

plt.show()