In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (precision_recall_curve,
                             PrecisionRecallDisplay,
                             accuracy_score)
# from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the raw training transactions from a path relative to the working directory.
df = pd.read_csv(filepath_or_buffer='../csvs/fraudTrain.csv')
df

In [None]:
# Build a class-balanced training frame: keep every fraud row and draw an
# equally sized random sample of non-fraud rows (seeded for repeatability).
fraud_trans = df.loc[df['is_fraud'] == 1]
non_fraud_trans = df.loc[df['is_fraud'] == 0]

len_fraud = fraud_trans.shape[0]
rand_non_fraud = non_fraud_trans.sample(n=len_fraud, random_state=42)

# Stack both classes, then order the rows by transaction time and renumber them.
balanced_df = (
    pd.concat([fraud_trans, rand_non_fraud])
    .sort_values('unix_time')
    .reset_index(drop=True)
)
balanced_df

In [None]:
# Distinct values of the `category` column, as a plain Python list.
categories = pd.unique(balanced_df['category']).tolist()
categories

In [None]:
# One-hot encode `category`. drop_first=True omits the indicator for the first
# level, so that level is represented by all remaining indicators being 0.
balanced_df = pd.get_dummies(
    data=balanced_df,
    columns=['category'],
    drop_first=True,
)
balanced_df.columns


In [None]:
# Model inputs: every column whose name contains 'category' (the one-hot
# indicators), cast to integers, followed by the transaction amount.
features = [col for col in balanced_df if 'category' in col]
balanced_df = balanced_df.astype({col: int for col in features})

features.append('amt')

In [None]:

# Load the held-out transactions and one-hot encode `category` with the same
# get_dummies settings used for the training frame.
test_df = pd.get_dummies(
    pd.read_csv('../csvs/fraudTest.csv'),
    columns=['category'],
    drop_first=True,
)

# Cast every column whose name contains 'category' to integers.
dummy_cols = [col for col in test_df if 'category' in col]
test_df = test_df.astype({col: int for col in dummy_cols})

In [None]:
# Extract plain NumPy arrays for scikit-learn: the feature matrix and labels
# from the balanced training frame, and the feature matrix from the test frame.
X_train = balanced_df[features].to_numpy()
y_train = balanced_df['is_fraud'].to_numpy()
X_test = test_df[features].to_numpy()

In [None]:
# Fit a decision tree on the balanced training sample. A fixed random_state
# makes the fitted tree, and every number derived from it, reproducible
# across kernel restarts; 42 matches the seed used when undersampling.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# NOTE: despite its name, `y_test` holds the model's PREDICTIONS for X_test,
# not ground-truth labels. The name is kept because later cells refer to it.
y_test = clf.predict(X_test)
y_test

In [None]:
# Plot the precision-recall curve against the TRUE test labels. `y_test` holds
# the classifier's own predictions, so using it as the label argument would
# compare the model with itself and draw a misleadingly perfect curve.
display = PrecisionRecallDisplay.from_estimator(
    clf,
    X_test,
    test_df['is_fraud'].values,
    name="Descision Tree",
    plot_chance_level=True,
    despine=True,
)
_ = display.ax_.set_title("2-class Precision-Recall curve")

In [None]:
# Ground-truth labels for the test set. `y_test` (assigned in the predict
# cell) holds the model's predictions for the same rows.
y_actual = test_df['is_fraud'].values

# Confusion-matrix cells, counted with vectorised boolean masks instead of a
# per-row Python loop. Rows whose label or prediction is neither 0 nor 1 fall
# in no cell, as before. int(...) keeps the counters plain Python ints.
pred_is_1 = y_test == 1
pred_is_0 = y_test == 0
actual_is_1 = y_actual == 1
actual_is_0 = y_actual == 0

true_p = int(np.count_nonzero(pred_is_1 & actual_is_1))
false_p = int(np.count_nonzero(pred_is_1 & actual_is_0))
false_n = int(np.count_nonzero(pred_is_0 & actual_is_1))
true_n = int(np.count_nonzero(pred_is_0 & actual_is_0))

# Marginal totals follow directly from the four cells.
actual_1 = true_p + false_n
actual_0 = true_n + false_p
test_1 = true_p + false_p
test_0 = true_n + false_n

print('false positives:', false_p)
print('false negatives:', false_n)
print('overall false:', (false_n + false_p))

print('\ntrue positives:', true_p)
print('true negatives:', true_n)
print('overall true:', (true_n + true_p))
# Author's observation from the run: many more false positives than false
# negatives, and many more true negatives than true positives.

# Guard the ratios so an empty class prints nan instead of raising ZeroDivisionError.
print('\nactual fraud/non-fraud ratio:', (actual_1 / actual_0 if actual_0 else float('nan')))
print('predicted fraud/non-fraud ratio:', (test_1 / test_0 if test_0 else float('nan')))
# Author's observation from the run: the predictions contain proportionally
# many more 1s than the actual labels do.

In [None]:
# Headline metrics as percentages, computed from the confusion-matrix counts
# produced in the previous cell.
print('\nAcuraccy:')
n_correct = true_p + true_n
print(n_correct / len(y_actual) * 100)
# Author's note from the run: about 94% accuracy, remarked on sarcastically,
# i.e. not to be taken as proof of a good model.

print('Precision:')
n_predicted_fraud = true_p + false_p
print(true_p / n_predicted_fraud * 100)
# Author's note from the run: precision is terrible.

print('Recall:')
n_actual_fraud = true_p + false_n
print(true_p / n_actual_fraud * 100)
# Author's note from the run: recall is very good.

In [None]:
# Cross-check the hand-computed accuracy with scikit-learn (as a percentage).
# `y_test` holds the predictions; `y_actual` holds the true labels.
accuracy_score(y_actual, y_test) * 100