## 6261-ITAI-2277-Artificial Intel Resource
John Nguyen

# Phase 3 Assignment

# 1. Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 2. Load Dataset

In [2]:
# Read the cleaned transactions CSV, report its dimensions, and preview three rows
DATA_PATH = "financial_transactions_clean.csv"
df = pd.read_csv(DATA_PATH)
print("Data loaded:", df.shape)
df.head(3)

Data loaded: (1110, 11)


Unnamed: 0,Transaction ID,Date,Currency,Sender,Receiver,Amount,Fee,Type,Net_Amount,Week,Day
0,TX001,2023-05-01,Auric,John Goldsmith,Sarah Gilded,100.0,2.0,Purchase,102.0,18,Monday
1,TX002,2023-05-02,Electra,Olivia Current,Ethan Charge,500.0,10.0,Transfer,510.0,18,Tuesday
2,TX003,2023-05-03,Zentia,Ava Zenith,Leo Zenith,200.0,5.0,Payment,205.0,18,Wednesday


# 3. Create target
Creates a simple binary “overspend” label from the numeric `Net_Amount` column: 1 when `Net_Amount` is greater than 200, otherwise 0.

In [3]:
# Binary target for the demo: 1 = overspend, 0 = normal.
# A row is flagged as overspend when its Net_Amount is strictly above the threshold.
OVERSPEND_THRESHOLD = 200
df['overspend'] = df['Net_Amount'].gt(OVERSPEND_THRESHOLD).astype(int)

# 4. Select features and target

In [4]:
# Feature matrix (X) and target vector (y).
# NOTE(review): 'overspend' is computed from Net_Amount, and in the previewed rows
# Net_Amount equals Amount + Fee. If that holds for every row, Amount and Fee fully
# determine the label (target leakage), so a near-perfect score would reflect the
# labelling rule rather than learned behaviour — confirm before relying on the metrics.
feature_cols = ['Amount', 'Fee', 'Type', 'Week', 'Day']
X = df[feature_cols]
y = df['overspend']

# 5. Split into train and test sets

In [5]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 6. Encode categorical columns and define model pipeline


In [9]:
# Column groups used by the preprocessing step.
# Text columns are one-hot encoded; numeric columns are forwarded unchanged.
categorical = ['Type', 'Day']
numeric = ['Amount', 'Fee', 'Week']

# Both groups are routed by name so the model's inputs are exactly the columns
# listed above. Any column not named in either group is dropped by the transformer
# (ColumnTransformer's default remainder) instead of silently reaching the classifier.
preprocess = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': categories not seen during fit encode as all zeros
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        # 'passthrough' forwards the numeric columns as-is (no scaling is applied)
        ('num', 'passthrough', numeric),
    ]
)

# Preprocessing and classifier chained into a single estimator
model = Pipeline(steps=[
    ('preprocess', preprocess),
    ('clf', LogisticRegression(max_iter=500))
])

# 7. Train model

In [10]:
# Fit the whole pipeline (encoding + logistic regression) on the training split only
model.fit(X=X_train, y=y_train)
print("✅ Model trained")

✅ Model trained


# 8. Evaluate model

In [11]:
# Predict on the held-out split, then compute each metric before printing it
y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"\nAccuracy: {acc:.2f}")
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", cm)


Accuracy: 1.00

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       1.00      1.00      1.00       186

    accuracy                           1.00       222
   macro avg       1.00      1.00      1.00       222
weighted avg       1.00      1.00      1.00       222


Confusion Matrix:
 [[ 36   0]
 [  0 186]]
