## Functions to include
Create central node, worker nodes update

Preprocessing, training, testing, inference, update

## Models to implement
Random forest, neural network, dummy

# Imports and constants

In [35]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import conf

In [None]:
# Location of the full dataset CSV; readData() uses this as its default input.
DATA_FILE_PATH = "~/Documents/Networked AI systems/data/PS_20174392719_1491204439457_log.csv"
# Disabled alternative: presumably a smaller subset of the same data for quick runs (TODO confirm).
#DATA_FILE_PATH = "~/Documents/Networked AI systems/data/data_subset.csv"

# Data reading and preprocessing

The column `nameOrig` contains 6353307 unique values and `nameDest` contains 2722362, so one-hot encoding them is infeasible. It is also possible that encoding them would not provide any useful information: the most common values in `nameOrig` occur only 3 times, and the most common values in `nameDest` occur around 100 times.

`isFlaggedFraud` seemed a bit confusing so it is also dropped here. The description was "flags illegal attempts to transfer more than 200.000 in a single transaction".

One-hot encoding is performed on the `type` column since it contains string values.

In [None]:
def getData():
    """Load the raw CSV via readData() and return it after dataPreprocessing().

    Returns:
        The preprocessed DataFrame produced by dataPreprocessing().
    """
    return dataPreprocessing(readData())

def readData(data_file=DATA_FILE_PATH):
    """Read a CSV file into a pandas DataFrame.

    Args:
        data_file: Path of the CSV file to read; defaults to DATA_FILE_PATH.

    Returns:
        The DataFrame returned by pandas.read_csv.
    """
    raw_frame = pd.read_csv(data_file)
    return raw_frame

def dataPreprocessing(df):
    """Prepare the raw frame for modelling without modifying the input.

    Drops the `nameOrig`, `nameDest` and `isFlaggedFraud` columns, one-hot
    encodes the remaining string columns with pandas.get_dummies, and casts
    the `isFraud` target to bool.

    Args:
        df: Raw DataFrame containing at least the dropped columns and
            `isFraud`.

    Returns:
        A new, preprocessed DataFrame. The caller's `df` is left untouched,
        so the function can safely be re-run on the same raw frame.

    Raises:
        KeyError: If any of the columns to drop, or `isFraud`, is missing.
    """
    # drop() without inplace returns a copy, so the caller's frame keeps its columns.
    reduced = df.drop(columns=["nameOrig", "nameDest", "isFlaggedFraud"])
    encoded = pd.get_dummies(reduced)
    encoded["isFraud"] = encoded["isFraud"].astype(bool)
    return encoded

def dataSplit(df):
    """Split the frame into train/test predictors and `isFraud` targets.

    Args:
        df: DataFrame containing the `isFraud` column plus predictor columns.

    Returns:
        The result of train_test_split called with the predictors first and
        the target second, using a 25 % test share and a fixed seed of 2024.
    """
    target = df["isFraud"]
    predictor_columns = df.columns.drop(["isFraud"])
    predictors = df[predictor_columns]
    return train_test_split(predictors, target, test_size=0.25, random_state=2024)

def scaleData(data, scaler=None):
    """Standardise `data`, optionally reusing an already-fitted scaler.

    Args:
        data: Array-like of predictors to scale.
        scaler: Optional already-fitted object exposing `transform`. When
            given, `data` is transformed with it and nothing is re-fitted;
            this lets test data be scaled with the statistics learned from
            the training data. When None (the default), a new StandardScaler
            is fitted on `data` and used to transform it.

    Returns:
        The scaled data as returned by the scaler.
    """
    if scaler is not None:
        # Reuse the supplied statistics instead of fitting on this data.
        return scaler.transform(data)
    return StandardScaler().fit_transform(data)

## Random forest

Finding the best parameters for the model. The configuration can be found in the file `conf.py`. **Currently not in use.**

In [40]:
# model_random_search = RandomizedSearchCV(
#                        estimator=RandomForestRegressor(), 
#                        param_distributions=conf.rf_search_grid,
#                        **conf.rand_search_cv_params
#                        )
# model_random_search.fit(X_train, y_train)
#
# model_best = model_random_search.best_estimator_

In [41]:
def createRfModel(X_train, y_train, random_state=2024):
    """Fit a random forest regressor on the given training data.

    Args:
        X_train: Training predictors.
        y_train: Training targets.
        random_state: Seed passed to RandomForestRegressor so repeated runs
            build the same forest. Defaults to 2024, the seed used by the
            data split and the neural network in this notebook.

    Returns:
        The fitted RandomForestRegressor.
    """
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    return model

  array.dtypes.apply(is_sparse).any()):


RandomForestRegressor()

## Neural network

In [59]:
def createNnModel(X_train_scaled, y_train):
    """Fit a multilayer perceptron classifier on scaled training data.

    Args:
        X_train_scaled: Scaled training predictors.
        y_train: Training class labels.

    Returns:
        The fitted MLPClassifier (two hidden layers of 100 and 50 units,
        ReLU activation, Adam solver, at most 500 iterations, seed 2024).
    """
    network = MLPClassifier(
        hidden_layer_sizes=(100, 50),
        activation="relu",
        solver="adam",
        max_iter=500,
        random_state=2024,
    )
    # fit() returns the estimator itself, so this hands back the trained network.
    return network.fit(X_train_scaled, y_train)

MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=2024)

# Accuracies

In [47]:
def getAccuracy(model, predictors, correct_classes, threshold=0.5):
    """Compute the classification accuracy of `model` on `predictors`.

    Args:
        model: Fitted estimator exposing `predict`.
        predictors: Input data passed to `model.predict`.
        correct_classes: True class labels to compare against.
        threshold: Cut-off applied to non-boolean predictions; values greater
            than or equal to it count as the positive class. Defaults to 0.5.

    Returns:
        The accuracy as computed by accuracy_score.
    """
    raw_predictions = np.asarray(model.predict(predictors))
    if raw_predictions.dtype == bool:
        # Boolean class labels are already in their final form.
        predicted_classes = raw_predictions
    else:
        # A plain bool cast would turn every non-zero score (e.g. 0.01 from a
        # regressor) into True, so apply an explicit threshold instead.
        predicted_classes = raw_predictions >= threshold
    return accuracy_score(correct_classes, predicted_classes)

Training accuracy score 0.9964976691991664
Test accuracy score 0.996200307420528


# Main control code

In [None]:
def main():
    """Run the pipeline: load data, train both models, print accuracies.

    The models are trained on the first fifth of the training rows and then
    evaluated on the full training set and on the test set.
    """
    df = getData()
    X_train, X_test, y_train, y_test = dataSplit(df)

    # Fit the scaler on the training data only and reuse it for the test data,
    # so both sets share the same scaling.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    range_start = 0
    subset = slice(range_start, range_start + len(X_train) // 5)

    # Select rows by position: the split shuffles the index, so label-based
    # `[]` indexing would not line up with the rows of the scaled NumPy array
    # (and on a DataFrame it would look up columns instead of rows).
    X_subset = X_train.iloc[subset]
    y_subset = y_train.iloc[subset]

    rf_model = createRfModel(X_subset, y_subset)
    nn_model = createNnModel(X_train_scaled[subset], y_subset)

    print("Random forest training accuracy score", getAccuracy(rf_model, X_train, y_train))
    print("Random forest test accuracy score", getAccuracy(rf_model, X_test, y_test))
    print("Multilayer perceptron training accuracy score", getAccuracy(nn_model, X_train_scaled, y_train))
    print("Multilayer perceptron test accuracy score", getAccuracy(nn_model, X_test_scaled, y_test))

# Run the whole pipeline: load the data, train both models and print their accuracy scores.
main()