In [1]:
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm

In [2]:
# Feature frames for the train / test / validation CSVs, read in that order.
X_train, X_test, X_validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/x_group_main_types_train.csv',
        './data/x_group_main_types_test.csv',
        './data/x_group_main_types_validation.csv',
    )
)

# Matching label frames for the same three splits.
y_train, y_test, y_validation = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/y_group_main_types_train.csv',
        './data/y_group_main_types_test.csv',
        './data/y_group_main_types_validation.csv',
    )
)

# Column subset fed to every model in this notebook (order matters: it is also
# the order used when pairing columns with permutation importances).
top_features = [
    'oldbalanceOrg', 'step',
    'std_amount', 'std_oldbalanceOrg', 'std_oldbalanceDest',
    'amount_is_equal_to_balance',
    'newbalanceDest', 'debt',
]

In [3]:
# Grid over the number of neighbours (1-9), the distance power p (1 or 2) and the weighting scheme ('uniform' or 'distance').

def generate_params_dict(
    n_neighbors_values=range(1, 10),
    p_values=(1, 2),
    weights_values=('uniform', 'distance'),
):
    """Yield one KNeighborsClassifier keyword dict per point of the grid.

    Called with no arguments it produces the original 9 * 2 * 2 = 36
    combinations, in the original order (``n_neighbors`` outermost, then
    ``p``, then ``weights``).

    Parameters
    ----------
    n_neighbors_values : iterable of int
        Candidate values for ``n_neighbors``.
    p_values : iterable of int
        Candidate values for ``p``.
    weights_values : iterable of str
        Candidate values for ``weights``.

    Yields
    ------
    dict
        Keys are inserted in the order ``'p'``, ``'weights'``,
        ``'n_neighbors'``; callers that build a key from
        ``tuple(params.values())`` rely on that order.
    """
    # The inner iterables are walked once per outer value, so materialise them
    # in case the caller passed one-shot iterators.
    p_values = tuple(p_values)
    weights_values = tuple(weights_values)

    for n_neighbors in n_neighbors_values:
        for p in p_values:
            for weights in weights_values:
                yield {
                    'p': p,
                    'weights': weights,
                    'n_neighbors': n_neighbors,
                }


# Grid-search KNN hyper-parameters: fit each configuration on the training
# split, score it on the validation split and keep the most accurate model.

# Materialise the grid so tqdm infers the total from its length instead of a
# hard-coded 2 * 2 * 9 that would go stale if the grid changes.
param_grid = list(generate_params_dict())

# Loop-invariant inputs, sliced once rather than on every iteration.
knn_X_train = X_train[top_features]
knn_X_validation = X_validation[top_features]
# scikit-learn expects 1-d labels; fitting on the single-column label frame
# raises a DataConversionWarning on every fit.
knn_y_train = y_train.to_numpy().ravel()

best_knn_model = None
best_model_params = None
best_accuracy = 0
accuracies = {}  # (p, weights, n_neighbors) -> validation accuracy
for param_dict in tqdm(param_grid):
    knn_model = KNeighborsClassifier(**param_dict)
    knn_model.fit(knn_X_train, knn_y_train)
    y_pred = knn_model.predict(knn_X_validation)
    accuracy = accuracy_score(y_validation, y_pred)
    accuracies[tuple(param_dict.values())] = accuracy
    # The `is None` guard still selects a model when no configuration scores
    # above 0, so later cells never receive best_knn_model = None.
    if best_knn_model is None or accuracy > best_accuracy:
        best_accuracy = accuracy
        best_knn_model = knn_model
        best_model_params = param_dict
    print(f'{best_model_params=}, {best_accuracy=}')


In [4]:
# Display the KNN estimator that scored best on the validation split.
best_knn_model

  return self._fit(X, y)


In [5]:
# Score the selected KNN model on the test split.
y_pred = best_knn_model.predict(X_test[top_features])
# scikit-learn metrics take (y_true, y_pred); use that order for both calls so
# this cell matches the other metric calls in the notebook.
print(accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

0.9999694189602446


array([[554248,      8],
       [     9,   1635]])

In [6]:
# Permutation importance of each selected feature for the best KNN model,
# measured on the training split.
knn_importances = permutation_importance(
    best_knn_model,
    X=X_train[top_features],
    y=y_train,
    random_state=0,  # fixed seed so the shuffles, and the numbers shown, are reproducible
)['importances_mean']
# Pair every feature name with its mean importance (same order as top_features).
importances = dict(zip(top_features, knn_importances))
importances

{'oldbalanceOrg': 0.00014840161204319192,
 'step': 1.5895012000743948e-05,
 'std_amount': 1.6328512328023237e-05,
 'std_oldbalanceOrg': 2.1241516037329156e-05,
 'std_oldbalanceDest': 1.3872010473359175e-05,
 'amount_is_equal_to_balance': 0.005620187243241359,
 'newbalanceDest': 1.3583010255158178e-05,
 'debt': 1.8496013964508505e-05}

In [17]:
# Alternative model: Gaussian naive Bayes on the same feature subset.
# TODO: decide whether further model families should be compared here.
naive_bayes_model = GaussianNB()
# Pass the labels as a 1-d array; fitting on the single-column label frame
# raises scikit-learn's DataConversionWarning.
naive_bayes_model.fit(X_train[top_features], y_train.to_numpy().ravel())
y_pred = naive_bayes_model.predict(X_test[top_features])
accuracy = accuracy_score(y_test, y_pred)
accuracy

  y = column_or_1d(y, warn=True)


0.9991239431552438

In [19]:
# Confusion matrix of the current `y_pred` against the test labels.
# NOTE(review): `y_pred` is reassigned by several cells (KNN grid search, KNN
# test evaluation, GaussianNB); this presumably expects the GaussianNB
# predictions, so run it right after that cell - confirm.
confusion_matrix(y_test, y_pred)

array([[553774,    482],
       [     5,   1639]])

In [12]:
# Permutation importance of each selected feature for the naive Bayes model,
# measured on the training split.
nb_importances = permutation_importance(
    naive_bayes_model,
    X=X_train[top_features],
    y=y_train,
    random_state=0,  # fixed seed so the shuffles, and the numbers shown, are reproducible
)['importances_mean']
# Keep the raw array under its own name so `importances` is only ever the
# feature -> mean-importance mapping.
importances = dict(zip(top_features, nb_importances))
importances

{'oldbalanceOrg': 0.0005362399048611133,
 'step': 7.225005454847278e-06,
 'std_amount': 0.0005703419306081426,
 'std_oldbalanceOrg': 4.479503382004424e-06,
 'std_oldbalanceDest': 0.00022498666986492388,
 'amount_is_equal_to_balance': 0.004897253197426133,
 'newbalanceDest': 4.046003054729575e-05,
 'debt': -0.00011603358760536864}