# Random Forest Classifier for Project Proposal

## Import Necessary Libraries

In [49]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import plot_tree
from sklearn.feature_selection import SelectFromModel
import joblib
import random

### Read Data from .csv File

In [50]:
card_data = pd.read_csv("card_transdata.csv")

### Display All Data

In [51]:
card_data.describe(include="all")

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


### Drop All NaN / NA Values from Dataset

In [52]:
card_data.dropna()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.311140,1.945940,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
999995,2.207101,0.112651,1.626798,1.0,1.0,0.0,0.0,0.0
999996,19.872726,2.683904,2.778303,1.0,1.0,0.0,0.0,0.0
999997,2.914857,1.472687,0.218075,1.0,1.0,0.0,1.0,0.0
999998,4.258729,0.242023,0.475822,1.0,0.0,0.0,1.0,0.0


### Get Unique Values From the Dependent Vairable Column (Whether the Transction is Fraulent or Not)

In [53]:
print(np.unique(card_data['fraud']))

[0. 1.]


### Now that we know they are unique values of 0 - false or 1 - true

- Convert all floats to ints

In [54]:
card_data['fraud'] = card_data['fraud'].astype(int)

### Radomize the Samples to Drop

In [55]:
random.seed(42)
rows_to_drop = random.sample(range(len(card_data)), 750000)

In [56]:
trimmed_card_data = card_data.drop(rows_to_drop)

In [57]:
trimmed_card_data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0,250000.0
mean,26.596648,5.031841,1.826702,0.88114,0.349708,0.100856,0.649292,0.086808
std,68.24695,33.69722,2.824662,0.323624,0.476879,0.301139,0.477193,0.281554
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.864177,0.29669,0.47876,1.0,0.0,0.0,0.0,0.0
50%,9.978866,0.998117,1.000741,1.0,0.0,0.0,1.0,0.0
75%,25.736275,3.338968,2.102473,1.0,1.0,0.0,1.0,0.0
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


In [58]:
print(trimmed_card_data['fraud'].value_counts())

fraud
0    228298
1     21702
Name: count, dtype: int64


## Even out the Amount of Fraudulent and Non-Fraudulent Transactions to Solve Class Imbalance (The Following 3 Cells are Commented out when Using Stratified Sampling)

In [59]:
minimum_samples = min(trimmed_card_data['fraud'].value_counts())

In [60]:
print(minimum_samples)

21702


In [61]:
fraud_samples = trimmed_card_data[trimmed_card_data['fraud'] == 1].sample(minimum_samples,random_state=42)
non_fraud_samples = trimmed_card_data[trimmed_card_data['fraud'] == 0].sample(minimum_samples,random_state=42)    

### Confirm the Data Set is Even

In [62]:
print(len(fraud_samples))
print(len(non_fraud_samples))

21702
21702


### Combine the Datasets

In [63]:
balanced_card_data = pd.concat([fraud_samples,non_fraud_samples])

### Create the Trainning, Testing, and Validation Sets

Choose any 'X' Independent Variable we want to use for the prediction

In [64]:
# Replace 'trimmed_card_data' with 'balanced_card_data' when undersmapling
X = balanced_card_data[['distance_from_home','distance_from_last_transaction','ratio_to_median_purchase_price','repeat_retailer','used_chip','used_pin_number','online_order']]
y = balanced_card_data['fraud']

In [65]:
# Remove Strtify when using a even (balanced) amount of samples
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp,y_temp, test_size=0.5, random_state=42)

### Create the Random Forest Classifier

In [66]:
fraud_check_classifier = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=42)

## Use Grid Search For Hyper-Parameter Tuning 

In [67]:
'''# Can uncomment top section to take, Grid Search takes awhile
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [4, 8, None],
    'max_features': [2, 3, 4],
    'max_leaf_nodes':[4, 6, None],
    'min_samples_leaf':[1, 2, 4], 
    'min_samples_split':[5, 10]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=fraud_check_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score achieved during the grid search
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)'''

'# Can uncomment top section to take, Grid Search takes awhile\nparam_grid = {\n    \'n_estimators\': [50, 100, 200, 300],\n    \'max_depth\': [4, 8, None],\n    \'max_features\': [2, 3, 4],\n    \'max_leaf_nodes\':[4, 6, None],\n    \'min_samples_leaf\':[1, 2, 4], \n    \'min_samples_split\':[5, 10]\n}\n\n# Initialize the GridSearchCV object\ngrid_search = GridSearchCV(estimator=fraud_check_classifier, param_grid=param_grid, cv=5, scoring=\'accuracy\', n_jobs=-1, verbose=3)\n\n# Fit the grid search to the data\ngrid_search.fit(X_train, y_train)\n\n# Print the best parameters and the best score achieved during the grid search\nprint("Best parameters:", grid_search.best_params_)\nprint("Best cross-validation score:", grid_search.best_score_)'

### Fit the Model on the Trainning Set

#### Predict the Results

In [68]:
fraud_check_classifier.fit(X_train,y_train)

### Re-fine through Feature Selection

In [69]:
fraud_check_tree = fraud_check_classifier.estimators_[0]
feature_names = X_train.columns.tolist()
class_names = [str(cls) for cls in sorted(y_train.unique())]
plt.figure(figsize=(40,10), dpi=600)
plot_tree(fraud_check_tree, filled=True, feature_names=feature_names, class_names=class_names, rounded=True)
plt.title('Initial Random Forest Classifier')
plt.savefig("Initial_Random_Forest_Balanced.svg", format='svg', bbox_inches='tight')
plt.close()

In [70]:
feature_model = SelectFromModel(fraud_check_classifier, prefit=True)
# new x_train and y_train_data
X_train_new_feature = feature_model.transform(X_train)
X_test_new_feature = feature_model.transform(X_test)
X_val_new_feature = feature_model.transform(X_val)



#### Make New Classifier

In [71]:
fraud_check_classifier_new = RandomForestClassifier(max_depth=None, max_features=2, n_estimators=200, max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=10, n_jobs=-1, random_state=42)
fraud_check_classifier_new.fit(X_train_new_feature,y_train)

In [72]:
y_val_pred = fraud_check_classifier_new.predict(X_val_new_feature)

# Validation Random Forest Graph

In [73]:
fraud_check_tree = fraud_check_classifier_new.estimators_[0]
class_names = [str(cls) for cls in sorted(np.unique(y_val_pred))]

In [74]:
plt.figure(figsize=(100,20), dpi=600)
plot_tree(fraud_check_tree, filled=True, feature_names=X_val_new_feature, class_names=class_names, rounded=True)
plt.title('Validation Set of the Random Forest After new Feature Selection')
plt.savefig("Validation_Balanced_Random_Forest.svg", format='svg', bbox_inches='tight')
plt.close()

In [75]:
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)
print('Validation Confusion Matrix:\n', val_conf_matrix)
print('Validation Classification Report:\n', val_class_report)

Validation Confusion Matrix:
 [[3028  219]
 [ 166 3098]]
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.93      0.94      3247
           1       0.93      0.95      0.94      3264

    accuracy                           0.94      6511
   macro avg       0.94      0.94      0.94      6511
weighted avg       0.94      0.94      0.94      6511



### Plot the Validation Confusion Matrix

In [91]:
plt.figure(figsize=(8, 6))
sns.heatmap(val_conf_matrix, annot=True, fmt='.2f', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Validation Confusion Matrix (Stratified Sampling)')
plt.savefig("rf_val_bal_matrix.svg")
plt.close()

### Plot the Validation Classification Report

In [77]:
val_class_report = classification_report(y_val, y_val_pred, target_names=class_names, output_dict=True)
df_val_report = pd.DataFrame(val_class_report).transpose()

plt.figure(figsize=(10, 5))
sns.heatmap(df_val_report.iloc[:-1, :].drop(columns=['support']), annot=True, cmap='Blues', cbar=False)
plt.title('Validation Set: Classification Report (Stratified Sampling)')
plt.savefig("rf_val_bal_class_report.svg")
plt.close()

### Plot the Validation Set's Accuracy Score

In [78]:
print("Validation Set Accuracy:", accuracy_score(y_val, y_val_pred))

Validation Set Accuracy: 0.9408692981108893


#### Combine the Validation Set into the Final Trainning Set and Re-Run

In [79]:
X_final_training_set = np.vstack([X_train_new_feature, X_val_new_feature])
y_final_training_set = pd.concat([y_train, y_val])

In [80]:
fraud_check_classifier_final = RandomForestClassifier(n_estimators=100, max_depth=2, n_jobs=-1, random_state=42)
fraud_check_classifier_final.fit(X_final_training_set, y_final_training_set)

# Final evaluation on the test set
y_test_pred = fraud_check_classifier_final.predict(X_test_new_feature)

## Final Random Forest Graph

In [90]:
fraud_check_tree = fraud_check_classifier_final.estimators_[0]
class_names = [str(cls) for cls in sorted(y_final_training_set.unique())]
plt.figure(figsize=(20,10), dpi=300)
plot_tree(fraud_check_tree, filled=True, feature_names=X_final_training_set, class_names=class_names, rounded=True)
plt.title('Decision Tree from Final Random Forest Classifier')
plt.savefig("rf_final_bal_tree.svg", format="svg", bbox_inches="tight")
plt.close()

In [82]:
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred, target_names=class_names)
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

Confusion Matrix:
 [[2963  241]
 [ 180 3127]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.92      0.93      3204
           1       0.93      0.95      0.94      3307

    accuracy                           0.94      6511
   macro avg       0.94      0.94      0.94      6511
weighted avg       0.94      0.94      0.94      6511



### Plot confusion matrix

In [94]:
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='.2f', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Final Confusion Matrix (Stratified Sampling)')
plt.savefig("./images/rf_final_bal_confustion_matrix.svg")
plt.close()

### Plot the Final Classification Report 

In [89]:
class_report = classification_report(y_test, y_test_pred, target_names=class_names, output_dict=True)
df_final_report = pd.DataFrame(class_report).transpose()

plt.figure(figsize=(10, 5))
sns.heatmap(df_final_report.iloc[:-1, :].drop(columns=['support']), annot=True, cmap='Blues', cbar=False)
plt.title('Classification Report (Stratified Sampling)')
plt.savefig("rf_final_bal_class_report.svg")
plt.close()

### Plot the Final Accuracy Score

In [85]:
print("Accuracy:", accuracy_score(y_test, y_test_pred))


Accuracy: 0.9353401935186607


### Saving the Created Model for 'Live' Testing

In [86]:
joblib.dump(fraud_check_classifier_final, "fraud_checker_balanced_Random_Forest.pkl")

['fraud_checker_balanced_Random_Forest.pkl']

### Conclusion

Applying the Random Forest Classifier with the multiple independent variables to predict the dependent variable (whether the transaction is fraudlent or not), yielded significant results based on the statistics provided after checking for model accuracy utilizing tuning of hyper-parameters with grid-search and cross-validaiton, and optimal feature selection of the data set.