In [475]:
import pandas as pd
import requests
import pickle
from datetime import datetime
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler,OneHotEncoder

# Model Training

Fetch all past transactions from the backend to use as training data

In [476]:
def fetch_transactions():
    """Fetch past transactions from the local backend.

    Sends a GET request to the backend's ``/api/past_transactions`` endpoint.

    Returns:
        The decoded JSON payload when the backend answers with HTTP 200,
        otherwise an empty list (after printing an error message).
    """
    url = "http://localhost:3000/api/past_transactions"
    try:
        # Without a timeout, requests waits indefinitely on an unresponsive backend.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as exc:
        # Connection failures, timeouts and malformed JSON follow the same
        # "print and return an empty list" contract as a non-200 response.
        print(f"Error fetching data from backend: {exc}")
        return []
    print("Error fetching data from backend")
    return []

Convert the date strings to datetime objects and drop any rows with null values

In [477]:
# Load the raw transactions returned by the backend into a DataFrame.
data = fetch_transactions()
if not data:
    # fetch_transactions() returns [] on failure; fail with a clear message
    # instead of a KeyError on the missing 'date' column.
    raise RuntimeError("No transactions returned by the backend; cannot build training data")
df = pd.DataFrame(data)

# Parse 'YYYY-MM-DD' strings; unparseable placeholders such as "NaN-NaN-NaN" become NaT.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d', errors='coerce')

# Drop incomplete rows and rebuild a contiguous index so that later
# column-wise concatenation with freshly built frames stays row-aligned.
df = df.dropna().reset_index(drop=True)

- Extract the hour, day of week, day of month, and a weekend flag from the date <br>
- Encode debit and credit (currently commented out in the code) <br>
- Scale the amount <br>
- One-hot encode the categories

In [478]:
# Feature Engineering
# NOTE(review): the dates appear to be parsed from date-only strings, so 'hour'
# looks constant (0) in the training data — confirm whether it carries any signal.
df['hour'] = df['date'].dt.hour
df['day_of_week'] = df['date'].dt.dayofweek
df['time_of_month'] = df['date'].dt.day
# dayofweek is 0 (Monday) .. 6 (Sunday), so >= 5 marks Saturday/Sunday.
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

# df['Type'] = df['Type'].map({"Debit": 0, "Credit": 1})

# Normalize 'amount'; the fitted scaler is kept so it can be reused at prediction time.
scaler = StandardScaler()
df['scaled_amount'] = scaler.fit_transform(df[['amount']]).ravel()

# Category labels in order of first appearance; reused below and by the test helpers.
categories = df["category"].unique()

# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` —
# confirm the installed version still accepts this keyword.
encoder = OneHotEncoder(sparse=False)

# Reshape the category column into a 2D array as required by the encoder
category_reshaped = df['category'].to_numpy().reshape(-1, 1)

category_encoded = encoder.fit_transform(category_reshaped)

# Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default
# RangeIndex here would misalign rows (and introduce NaNs) whenever df's index
# is not 0..n-1, e.g. after rows were dropped.
encoded_df = pd.DataFrame(category_encoded, columns=encoder.categories_[0], index=df.index)

df = pd.concat([df, encoded_df], axis=1).drop(columns=['category'])

# Select features for training: engineered columns followed by one column per category.
features = ['scaled_amount', 'hour', 'day_of_week', 'time_of_month', 'is_weekend']
features.extend(categories)
X = df[features]



Final data for training

In [479]:
# Display the feature matrix that the model is trained on.
X

Unnamed: 0,scaled_amount,hour,day_of_week,time_of_month,is_weekend,Income,Rent,Food,Shopping,Entertainment,Personal,Health,Travel,Miscellaneous
0,5.724611,0,5,1,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.004632,0,0,3,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.253469,0,1,4,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.223540,0,3,6,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.246628,0,4,7,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,-0.151711,0,0,25,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
138,-0.214989,0,1,26,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
139,0.046675,0,3,28,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
140,-0.250049,0,4,29,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Model Training

In [None]:
# Fit the Isolation Forest anomaly detector on the training features.
# The hyperparameter values below were settled on by the author through trial and error.
forest_params = {"n_estimators": 100, "contamination": 0.2, "random_state": 42}
model = IsolationForest(**forest_params)
model.fit(X)

In [481]:
# Persist the fitted model, scaler and encoder so the API can load them later.
artifacts = {
    'anomaly_detection_model.pkl': model,
    'scaler.pkl': scaler,
    'encoder.pkl': encoder,
}
for filename, fitted_object in artifacts.items():
    with open(filename, 'wb') as f:
        pickle.dump(fitted_object, f)

print("Model trained using data from backend and saved")

Model trained using data from backend and saved


# Model Testing

Function to process the test data


In [482]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def getdata(filepath):
    """Load a labelled test CSV and build the feature matrix used for prediction.

    Relies on the notebook-level ``scaler``, ``encoder`` and ``categories``
    objects created in the training cells, so the test data goes through the
    same transformations the model was trained with.

    Args:
        filepath: Path to a CSV file with 'Date', 'Amount', 'Category' and
            'y_true' columns.

    Returns:
        A tuple ``(X_test, y_true)``: the feature DataFrame (same columns and
        order as the training features) and the 'y_true' label column.
    """
    test_df = pd.read_csv(filepath)

    # Convert 'Date' to datetime and derive the calendar features.
    test_df['Date'] = pd.to_datetime(test_df['Date'])
    test_df['hour'] = test_df['Date'].dt.hour
    test_df['day_of_week'] = test_df['Date'].dt.dayofweek
    test_df['time_of_month'] = test_df['Date'].dt.day
    test_df['is_weekend'] = (test_df['day_of_week'] >= 5).astype(int)

    # Map 'Type' to 0 (Debit) and 1 (Credit)
    # test_df['Type'] = test_df['Type'].map({"Debit": 0, "Credit": 1})

    # Scale 'Amount' with the scaler fitted during training. Re-fitting on the
    # test set would standardise it with different statistics than the model saw.
    # The column is renamed because the scaler was fitted on a column named 'amount'.
    amount = test_df[['Amount']].rename(columns={'Amount': 'amount'})
    test_df['scaled_amount'] = scaler.transform(amount).ravel()

    # One-hot encode 'Category' with the encoder fitted during training so the
    # encoded columns match the training columns.
    category_reshaped = test_df['Category'].to_numpy().reshape(-1, 1)
    category_encoded = encoder.transform(category_reshaped)

    # Keep test_df's index so the column-wise concat stays row-aligned.
    encoded_df = pd.DataFrame(
        category_encoded, columns=encoder.categories_[0], index=test_df.index
    )

    # Merge the encoded categories back and drop the original 'Category' column.
    test_df = pd.concat([test_df, encoded_df], axis=1).drop(columns=['Category'])

    # Select features for prediction (matching the features used during training).
    features = ['scaled_amount', 'hour', 'day_of_week', 'time_of_month', 'is_weekend']
    features.extend(categories)

    X_test = test_df[features]
    y_true = test_df["y_true"]

    return X_test, y_true


Function to calculate the accuracy of given test data

In [483]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate(filepath):

    X_test,y_true=getdata(filepath)
    y_pred = model.predict(X_test)  # You might need to preprocess the features (e.g., one-hot encoding)

    # Convert the predicted labels from {-1, 1} to {0, 1} for comparison with y_true
    y_pred = [1 if label == -1 else 0 for label in y_pred]

    # Calculate accuracy, precision, recall, and F1 score
    accuracy = accuracy_score(y_true, y_pred)
    print("Accuracy:", accuracy)