In [149]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import math

excelFile='C:\\Users\\jfeda\\OneDrive\\Desktop\\data.xlsx'
data=pd.read_excel(excelFile, sheet_name='train')


In [150]:
# Additional feature engineering (example: calculating age from date of birth)
data['age'] = ((data['transDate'] - data['dateOfBirth']) // 365).values.astype("float64")

In [151]:
# Convert transDate and dateOfBirth to datetime64[ns]
data["transDate"] = pd.to_datetime(data["transDate"]).values.astype("float64")
data["dateOfBirth"] = pd.to_datetime(data["dateOfBirth"]).values.astype("float64")
# Convert creditCardNum, zip, cityPop, and isFraud to int64
data['creditCardNum'] = data['creditCardNum'].astype(int)
data['zip'] = data['zip'].astype(int)
data['cityPop'] = data['cityPop'].astype(int)
data['isFraud'] = data['isFraud'].astype(int)
# Convert amount, latitude, longitude, merchLatitude, and merchLongitude to float64
data['amount'] = data['amount'].astype(float)
data['latitude'] = data['latitude'].astype(float)
data['longitude'] = data['longitude'].astype(float)
data['merchLatitude'] = data['merchLatitude'].astype(float)
data['merchLongitude'] = data['merchLongitude'].astype(float)



In [152]:
# X is the variables that predictions are created from
# Y is the prediction values
X = data[['transDate', 'creditCardNum', 'amount', 'zip', 'latitude', 'longitude', 'cityPop', 'merchLatitude', 'merchLongitude','dateOfBirth', 'age']]
y = data['isFraud']

In [153]:
# Setup training using the train_test_split funtion
# Change test size to see how well it does with less information 
# (test_size 0.4 means 40% of data is used to test accuracy and 60% to train)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [154]:
# Create model and train the model (Delete once the model is created and loaded through joblib file)
model = RandomForestClassifier(n_estimators=30, random_state=42)
model.fit(X_train_resampled, y_train_resampled)

In [155]:
from sklearn.model_selection import GridSearchCV

param_grid = { 'max_depth': [None, 1,30, 50], 'min_samples_split': [0.25, 2, 4], 'min_samples_leaf': [0.5,1, 2] } # Idadadnitialize GridSearchCV 
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=1, scoring='accuracy') # Fit GridSearchCV 
grid_search.fit(X_train, y_train) 
print("Best Parameters:", grid_search.best_params_) 
print("Best Score:", grid_search.best_score_) # Evaluate on the test set 
best_model = grid_search.best_estimator_ 
test_accuracy = best_model.score(X_test, y_test) 
print("Test Set Accuracy:", test_accuracy)

In [156]:
# # Used to create the model file
# joblib.dump(model, 'fraudDetector.joblib')

In [157]:
# # Load the created joblib file (Create joblib file using the training excel sheet then run the file on the test sheet)
# model=joblib.load(fraudDetector.joblib')

In [158]:
# Create the isFraud predictions using the model
y_pred = model.predict(X_test)

In [159]:
# Creating a classification report for testing how accurate the program is on the training file
# (This line is not needed once the test file is running because there is no isFraud values to check prediction accuracy)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     36100
           1       0.82      0.86      0.84       265

    accuracy                           1.00     36365
   macro avg       0.91      0.93      0.92     36365
weighted avg       1.00      1.00      1.00     36365



In [168]:
# # Code used to export tree/model
# text_representation = tree.export_text(model)
# print(text_representation)
# # Better code could be implemented here to export an actual model of our tree (Mainly to make the presentation better)


NameError: name 'X_true' is not defined

In [161]:
# # Fill dataset with the new isFraud value predictions 
# # (Code for test file as training uses most inputs for predictions and a few for checking accuracy so not same dimensions)
# data['isFraud']=y_pred
# # Display the final data
# data