# Scalable Vector Machine Classifier for Project Proposal

## Import Necessary Libraries

In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn import svm
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
import joblib
import random

### Read Data from .csv File

In [108]:
card_data = pd.read_csv("card_transdata.csv")

### Display All Data

In [109]:
card_data.describe(include="all")

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,26.628792,5.036519,1.824182,0.881536,0.350399,0.100608,0.650552,0.087403
std,65.390784,25.843093,2.799589,0.323157,0.477095,0.300809,0.476796,0.282425
min,0.004874,0.000118,0.004399,0.0,0.0,0.0,0.0,0.0
25%,3.878008,0.296671,0.475673,1.0,0.0,0.0,0.0,0.0
50%,9.96776,0.99865,0.997717,1.0,0.0,0.0,1.0,0.0
75%,25.743985,3.355748,2.09637,1.0,1.0,0.0,1.0,0.0
max,10632.723672,11851.104565,267.802942,1.0,1.0,1.0,1.0,1.0


### Drop All NaN / NA Values from Dataset

In [110]:
card_data.dropna()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.311140,1.945940,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
999995,2.207101,0.112651,1.626798,1.0,1.0,0.0,0.0,0.0
999996,19.872726,2.683904,2.778303,1.0,1.0,0.0,0.0,0.0
999997,2.914857,1.472687,0.218075,1.0,1.0,0.0,1.0,0.0
999998,4.258729,0.242023,0.475822,1.0,0.0,0.0,1.0,0.0


### Get Unique Values From the Dependent Vairable Column (Whether the Transction is Fraulent or Not)

In [111]:
print(np.unique(card_data['fraud']))

[0. 1.]


### Now that we know they are unique values of 0 - false or 1 - true

- Convert all floats to ints

In [112]:
card_data["repeat_retailer"] = card_data["repeat_retailer"].astype(int)
card_data["used_chip"] = card_data["used_chip"].astype(int)
card_data["used_pin_number"] = card_data["used_pin_number"].astype(int)
card_data['online_order'] = card_data['online_order'].astype(int)
card_data['fraud'] = card_data['fraud'].astype(int)
print(card_data.dtypes)

distance_from_home                float64
distance_from_last_transaction    float64
ratio_to_median_purchase_price    float64
repeat_retailer                     int64
used_chip                           int64
used_pin_number                     int64
online_order                        int64
fraud                               int64
dtype: object


### Radomize the Samples to Drop

In [113]:
random.seed(42)
rows_to_drop = random.sample(range(len(card_data)), 900000)

In [114]:
trimmed_card_data = card_data.drop(rows_to_drop)

In [115]:
trimmed_card_data.describe()

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,26.553782,5.082702,1.816815,0.88095,0.34896,0.10108,0.64952,0.08624
std,63.877367,24.597312,2.782527,0.323849,0.476644,0.301436,0.477123,0.28072
min,0.035304,0.000434,0.00737,0.0,0.0,0.0,0.0,0.0
25%,3.869257,0.299278,0.47816,1.0,0.0,0.0,0.0,0.0
50%,10.013285,0.993671,0.994689,1.0,0.0,0.0,1.0,0.0
75%,25.792695,3.3614,2.096633,1.0,1.0,0.0,1.0,0.0
max,3981.367512,2724.273459,99.117077,1.0,1.0,1.0,1.0,1.0


In [116]:
print(trimmed_card_data['fraud'].value_counts())

fraud
0    91376
1     8624
Name: count, dtype: int64


### Even out the Amount of Fraudulent and Non-Fraudulent Transactions

In [117]:
minimum_samples = min(trimmed_card_data['fraud'].value_counts())

In [118]:
print(minimum_samples)

8624


In [119]:
fraud_samples = trimmed_card_data[trimmed_card_data['fraud'] == 1].sample(minimum_samples,random_state=42)
non_fraud_samples = trimmed_card_data[trimmed_card_data['fraud'] == 0].sample(minimum_samples,random_state=42)    

### Confirm the Data Set is Even

In [120]:
print(len(fraud_samples))
print(len(non_fraud_samples))

8624
8624


### Combine the Datasets

In [121]:
balanced_card_data = pd.concat([fraud_samples,non_fraud_samples])

### Create the Trainning, Testing, and Validation Sets

Choose any 'X' Independent Variable we want to use for the prediction

In [122]:
X = balanced_card_data[['distance_from_home','distance_from_last_transaction','ratio_to_median_purchase_price','repeat_retailer','used_chip','used_pin_number','online_order']]
y = balanced_card_data['fraud']

In [123]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp,y_temp, test_size=0.5, stratify=y_temp, random_state=42)

### Create the Scalable Vector Machine

## Use Grid Search For Hyper-Parameter Tuning 

In [124]:
# Can uncomment top section to take, Grid Search takes awhile
'''param_grid = {
    'C':[13,15,17,21],
    'kernel':["rbf"],
    "gamma":["auto"],
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=fraud_check_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=3)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score achieved during the grid search
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)'''

'param_grid = {\n    \'C\':[13,15,17,21],\n    \'kernel\':["rbf"],\n    "gamma":["auto"],\n}\n\n# Initialize the GridSearchCV object\ngrid_search = GridSearchCV(estimator=fraud_check_classifier, param_grid=param_grid, cv=5, scoring=\'accuracy\', n_jobs=-1, verbose=3)\n\n# Fit the grid search to the data\ngrid_search.fit(X_train, y_train)\n\n# Print the best parameters and the best score achieved during the grid search\nprint("Best parameters:", grid_search.best_params_)\nprint("Best cross-validation score:", grid_search.best_score_)'

In [125]:
fraud_check_classifier = svm.SVC(kernel="rbf")

### Fit the Model on the Trainning Set

#### Predict the Results

In [126]:
fraud_check_classifier.fit(X_train,y_train)

### Build Standardized PCA Pipe-line to Visualize the Multi-Dimensional Data

In [127]:
pipeline=Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2))
])
X_train_scaled_pca = pipeline.fit_transform(X_train)

In [128]:
pca_pipeline_step = pipeline.named_steps['pca']
loadings = pca_pipeline_step.components_
loadings_df = pd.DataFrame(data=loadings, columns=X_train.columns, index=[f'PC{i+1}' for i in range(pca_pipeline_step.n_components)])
display(loadings_df)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order
PC1,0.649231,0.005839,-0.15363,0.531136,-0.388894,-0.150952,0.314217
PC2,-0.113151,-0.062503,0.66976,-0.082057,0.056144,-0.486259,0.537008


### Initial Scalable Vectore Machine Classifier (SVC)

In [129]:
plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_train_scaled_pca[:,0], X_train_scaled_pca[:,1], c=y_train, cmap="viridis", edgecolor='k', s=150)
plt.xlabel("Principle Component 1")
plt.ylabel("Principle Component 2")
plt.title("Scaled PCA Projection of Credit Card Fraud Detection (SVC)")
plt.colorbar(scatter)
plt.savefig("./images/svc_init_bal.svg", format="svg")
plt.close()

### Re-fine through Feature Selection

In [130]:
from sklearn.inspection import permutation_importance
# Evaluating feature importance
results = permutation_importance(fraud_check_classifier, X_test, y_test, n_repeats=10, random_state=42)

# Displaying importance
importances = results.importances_mean
print(importances)

[ 1.42310665e-01  3.93740340e-02  3.51661515e-01  0.00000000e+00
 -3.86398764e-05 -1.93199382e-04  1.62287481e-03]


In [131]:
# Select Features that are Relevant Based on Importances
threshold = np.median(importances)
indices = np.where(importances > threshold)[0]
# new x_train and y_train_data
X_train_new_feature = X_train.iloc[:, indices]
X_test_new_feature = X_test.iloc[:, indices]
X_val_new_feature = X_val.iloc[:, indices]

In [132]:
print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Types:", type(X_train), type(X_val), type(X_test))

Shapes: (12073, 7) (2587, 7) (2588, 7)
Types: <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


#### Make New Classifier

In [133]:
fraud_check_classifier_new = svm.SVC(C=15, gamma='auto', kernel='rbf',random_state=42)
fraud_check_classifier_new.fit(X_train_new_feature,y_train)

In [134]:
y_val_pred = fraud_check_classifier_new.predict(X_val_new_feature)

In [135]:
pipeline=Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2))
])
X_val_scaled_pca = pipeline.fit_transform(X_val_new_feature)
pca_pipeline_step = pipeline.named_steps['pca']
loadings = pca_pipeline_step.components_
loadings_df = pd.DataFrame(data=loadings, columns=X_val_new_feature.columns, index=[f'PC{i+1}' for i in range(pca_pipeline_step.n_components)])
display(loadings_df)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price
PC1,-0.622042,-0.30322,0.721887
PC2,-0.488947,0.870536,-0.055662


### Plot the Validation SVC PCA - Representation

In [136]:
plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_val_scaled_pca[:,0], X_val_scaled_pca[:,1], c=y_val_pred, cmap="viridis", edgecolor='k', s=150)
plt.xlabel("Principle Component 1")
plt.ylabel("Principle Component 2")
plt.title("Scaled PCA Projection of Validation Model for Credit Card Fraud Detection (SVC)")
plt.colorbar(scatter)
plt.savefig("./images/svc_val_bal.svg", format="svg")
plt.close()

In [137]:
val_conf_matrix = confusion_matrix(y_val, y_val_pred)
val_class_report = classification_report(y_val, y_val_pred)
class_names = [str(cls) for cls in sorted(np.unique(y_val_pred))]
print('Validation Confusion Matrix:\n', val_conf_matrix)
print('Validation Classification Report:\n', val_class_report)

Validation Confusion Matrix:
 [[1163  131]
 [  47 1246]]
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.90      0.93      1294
           1       0.90      0.96      0.93      1293

    accuracy                           0.93      2587
   macro avg       0.93      0.93      0.93      2587
weighted avg       0.93      0.93      0.93      2587



### Plot the Validation Confusion Matrix

In [138]:
plt.figure(figsize=(8, 6))
sns.heatmap(val_conf_matrix, annot=True, fmt='.2f', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Validation Confusion Matrix (Stratified Sampling)')
plt.savefig("./images/val_bal_svc_matrix.svg")
plt.close()

### Plot the Validation Classification Report

In [139]:
val_class_report = classification_report(y_val, y_val_pred, target_names=class_names, output_dict=True)
df_val_report = pd.DataFrame(val_class_report).transpose()

plt.figure(figsize=(10, 5))
sns.heatmap(df_val_report.iloc[:-1, :].drop(columns=['support']), annot=True, cmap='Blues', cbar=False)
plt.title('Validation Set: Classification Report (Stratified Sampling)')
plt.savefig("./images/val_bal_svc_report.svg")
plt.close()

#### Combine the Validation Set into the Final Trainning Set and Re-Run

In [140]:
X_final_training_set = np.vstack([X_train_new_feature, X_val_new_feature])
y_final_training_set = pd.concat([y_train, y_val])

In [141]:
fraud_check_classifier_final = svm.SVC(C=15, gamma='auto', kernel='rbf',random_state=42)
fraud_check_classifier_final.fit(X_final_training_set, y_final_training_set)

# Final evaluation on the test set
y_test_pred = fraud_check_classifier_final.predict(X_test_new_feature)



### Build Standardized PCA Pipe-line to Visualize the Multi-Dimensional Data

In [142]:
pipeline=Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=2))
])
X_final_train_scaled_pca = pipeline.fit_transform(X_final_training_set)
X_final_test_scaled_pca = pipeline.fit_transform(X_test_new_feature)

In [143]:
pca_pipeline_step = pipeline.named_steps['pca']
loadings = pca_pipeline_step.components_
columns = ["distance_from_home","ratio_to_median_purchase_price","online_order"]
loadings_df = pd.DataFrame(data=loadings, columns=columns, index=[f'PC{i+1}' for i in range(pca_pipeline_step.n_components)])
display(loadings_df)

Unnamed: 0,distance_from_home,ratio_to_median_purchase_price,online_order
PC1,-0.489517,-0.445961,0.749328
PC2,-0.685672,0.72776,-0.014807


### Final Scalable Vectore Machine Classifier (SVC)

In [144]:
plt.figure(figsize=(8, 6))
scatter = plt.scatter(X_final_test_scaled_pca[:,0], X_final_test_scaled_pca[:,1], c=y_test_pred, cmap="viridis", edgecolor='k', s=150)
plt.xlabel("Principle Component 1")
plt.ylabel("Principle Component 2")
plt.title("Final Scaled PCA Projection of Credit Card Fraud Detection (SVC)")
plt.colorbar(scatter)
plt.savefig("./images/svc_final_bal.svg", format="svg")
plt.close()

In [145]:
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred, target_names=class_names)
print('Confusion Matrix:\n', conf_matrix)
print('Classification Report:\n', class_report)

Confusion Matrix:
 [[1165  129]
 [  45 1249]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.90      0.93      1294
           1       0.91      0.97      0.93      1294

    accuracy                           0.93      2588
   macro avg       0.93      0.93      0.93      2588
weighted avg       0.93      0.93      0.93      2588



### Plot confusion matrix

In [146]:
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='.2f', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.savefig("./images/final_bal_conf_matrix_svc.svg", format="svg", bbox_inches="tight")
plt.close()

### Plot the Final Classification Report 

In [147]:
class_report = classification_report(y_test, y_test_pred, target_names=class_names, output_dict=True)
df_final_report = pd.DataFrame(class_report).transpose()

plt.figure(figsize=(10, 5))
sns.heatmap(df_final_report.iloc[:-1, :].drop(columns=['support']), annot=True, cmap='Blues', cbar=False)
plt.title('Classification Report (Stratified Sampling)')
plt.savefig("./images/final_bal_class_report_svc.svg", format="svg", bbox_inches="tight")
plt.close()

### Plot the Final Accuracy Score

In [148]:
print("Accuracy:", accuracy_score(y_test, y_test_pred))


Accuracy: 0.9327666151468316


### Saving the Created Model for 'Live' Testing

In [149]:
joblib.dump(fraud_check_classifier_final, "./models/fraud_checker_bal_Non-Linear_Scalable_Vector_Classifier.pkl")

['./models/fraud_checker_bal_Non-Linear_Scalable_Vector_Classifier.pkl']

### Conclusion

Applying the Non-Linear Scalable Vector Classifier with the multiple independent variables to predict the dependent variable (whether the transaction is fraudlent or not), yielded significant results based on the statistics provided after checking for model accuracy utilizing tuning of hyper-parameters with grid-search and cross-validaiton, and optimal feature selection of the data set.