In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (precision_recall_curve,
                             PrecisionRecallDisplay,
                             accuracy_score)
# from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the training transactions from disk. The bare name on the last line
# makes the notebook render the frame as this cell's output.
df = pd.read_csv(filepath_or_buffer='../csvs/fraudTrain.csv')
df

In [None]:
# Build a class-balanced training frame: every row with is_fraud == 1 plus an
# equally sized random sample (fixed seed) of the rows with is_fraud == 0.
fraud_trans = df.loc[df['is_fraud'] == 1]
non_fraud_trans = df.loc[df['is_fraud'] == 0]

# The number of fraud rows sets the size of the non-fraud sample.
len_fraud = fraud_trans.shape[0]
rand_non_fraud = non_fraud_trans.sample(n=len_fraud, random_state=42)

# Stack both groups, order them by 'unix_time', and renumber the index from 0.
balanced_df = (
    pd.concat([fraud_trans, rand_non_fraud])
    .sort_values(by='unix_time')
    .reset_index(drop=True)
)
balanced_df

In [None]:
# Distinct values of the 'category' column in the balanced frame, as a plain list.
categories = pd.unique(balanced_df['category']).tolist()
categories

In [None]:
# One-hot encode 'category'. drop_first=True omits the indicator column for the
# first category level, and the original 'category' column is replaced.
# NOTE(review): re-running this cell on the already-encoded frame will fail
# because 'category' no longer exists; re-run from the balancing cell instead.
balanced_df = pd.get_dummies(
    data=balanced_df,
    columns=['category'],
    drop_first=True,
)
balanced_df.columns


In [None]:
# Cast the three category indicator columns used as model features to int,
# in a single astype call driven by a column -> dtype mapping.
balanced_df = balanced_df.astype({
    'category_misc_net': int,
    'category_grocery_pos': int,
    'category_gas_transport': int,
})

In [None]:

# Load the held-out transactions and apply the same preparation as the
# training frame: one-hot encode 'category' (first level dropped) and cast the
# three indicator columns used as features to int.
test_df = (
    pd.read_csv('../csvs/fraudTest.csv')
    .pipe(pd.get_dummies, columns=['category'], drop_first=True)
    .astype({
        'category_misc_net': int,
        'category_grocery_pos': int,
        'category_gas_transport': int,
    })
)

In [None]:
# Feature columns shared by the training and test matrices: the transaction
# amount plus three category indicators.
feature_columns = [
    'amt',
    'category_misc_net',
    'category_grocery_pos',
    'category_gas_transport',
]

# Plain numpy arrays for scikit-learn. Labels come only from the balanced
# training frame; the test labels are read separately where they are needed.
X_train = balanced_df[feature_columns].to_numpy()
y_train = balanced_df['is_fraud'].to_numpy()
X_test = test_df[feature_columns].to_numpy()

In [None]:
# Fit a 6-nearest-neighbours classifier on the balanced training data
# (fit() returns the estimator itself, so construction and fitting chain).
knn = KNeighborsClassifier(n_neighbors=6).fit(X_train, y_train)

# NOTE: despite its name, `y_test` holds the model's PREDICTIONS for X_test,
# not the ground-truth labels of the test set.
y_test = knn.predict(X_test)
y_test

In [None]:
# Precision-recall curve of the classifier on the held-out test set.
#
# The reference labels must be the ground truth from the test file. `y_test`
# is the output of knn.predict(X_test), so passing it here would score the
# model against its own predictions and draw a misleadingly perfect curve.
#
# NOTE(review): binding the name `display` shadows IPython's display() helper
# for the rest of the session; the name is kept so later cells keep working.
display = PrecisionRecallDisplay.from_estimator(
    knn,
    X_test,
    test_df['is_fraud'].values,
    name="kNN",
    plot_chance_level=True,
    despine=True,
)
_ = display.ax_.set_title("2-class Precision-Recall curve")

In [None]:
# Hand-rolled confusion-matrix summary of the kNN predictions.
#
# `y_actual` is the ground truth from the test file; `y_test` holds the
# predictions made by the classifier. The four cells of the confusion matrix
# are counted with vectorised boolean masks instead of a Python-level loop
# over every row. As before, only labels equal to 0 or 1 are counted.
y_actual = test_df['is_fraud'].values

pred_is_1 = y_test == 1
pred_is_0 = y_test == 0
true_is_1 = y_actual == 1
true_is_0 = y_actual == 0

# int(...) keeps the counters as plain Python ints, as in the loop version.
wrong_0_1 = int(np.count_nonzero(pred_is_0 & true_is_1))  # predicted 0, actually 1
wrong_1_0 = int(np.count_nonzero(pred_is_1 & true_is_0))  # predicted 1, actually 0
right_1 = int(np.count_nonzero(pred_is_1 & true_is_1))    # predicted 1, actually 1
right_0 = int(np.count_nonzero(pred_is_0 & true_is_0))    # predicted 0, actually 0

# Marginal totals derived from the four cells above.
actual_1 = right_1 + wrong_0_1
actual_0 = right_0 + wrong_1_0
test_1 = right_1 + wrong_1_0
test_0 = right_0 + wrong_0_1

print('false positives:', wrong_1_0)
print('false negatives:', wrong_0_1)
print('overall false:', (wrong_0_1 + wrong_1_0))

print('\ntrue positives:', right_1)
print('true negatives:', right_0)
print('overall true:', (right_0 + right_1))
# Author's observation: many more false positives than false negatives,
# and many more true negatives than true positives.

print('\nactual fraud/non-fraud ratio:', (actual_1 / actual_0))
print('predicted fraud/non-fraud ratio:', (test_1 / test_0))
# Author's observation: the model predicts fraud far more often than it
# actually occurs in the test data.

print('\nAcuraccy:')
print((right_1 + right_0) / (len(y_actual)) * 100)
# Author's observation: accuracy comes out around 94%. Treat this number with
# caution: when one class dominates, a high accuracy says little about how
# well the rare class is detected, so read it alongside the counts above.
# this is salt in the wound, we literally had this lesson in statistics yesterday

In [None]:
# Cross-check the hand-computed accuracy with scikit-learn, as a percentage.
# Arguments are (ground truth, predictions); `y_test` holds the predictions.
100 * accuracy_score(y_actual, y_test)