In [30]:
# Final Project by Zhenghao An
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Load dataset
card_transdata = pd.read_csv('card_transdata.csv')
extract_data = card_transdata.sample(n=5000, random_state=42)  
# Prepare data
X = extract_data.drop('fraud', axis=1)
y = extract_data['fraud']
# Split dataset into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [31]:
# Logistic Regression
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

In [32]:
from sklearn.metrics import accuracy_score, confusion_matrix
accuracy_log_reg = accuracy_score(y_test, log_reg.predict(X_test))
accuracy_log_reg

0.942

In [33]:
conf_matrix_log_reg = confusion_matrix(y_test, log_reg.predict(X_test))
conf_matrix_log_reg

array([[1365,    5],
       [  82,   48]])

In [5]:
# SVM with different kernels
svm_linear = SVC(kernel='linear')
svm_linear.fit(X_train, y_train)

In [6]:
accuracy_lsvm = accuracy_score(y_test, svm_linear.predict(X_test))
accuracy_lsvm

0.9446666666666667

In [7]:
conf_matrix_lsvm = confusion_matrix(y_test, svm_linear.predict(X_test))
conf_matrix_lsvm

array([[1365,    5],
       [  78,   52]])

In [8]:
# Naive Bayesian
naive_bayes = GaussianNB()
naive_bayes.fit(X_train, y_train)

In [9]:
accuracy_nb = accuracy_score(y_test, naive_bayes.predict(X_test))
accuracy_nb

0.948

In [10]:
conf_matrix_nb = confusion_matrix(y_test, naive_bayes.predict(X_test))
conf_matrix_nb

array([[1358,   12],
       [  66,   64]])

In [11]:
# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [12]:
accuracy_knn = accuracy_score(y_test, knn.predict(X_test))
accuracy_knn

0.9846666666666667

In [13]:
conf_matrix_knn = confusion_matrix(y_test, knn.predict(X_test))
conf_matrix_knn

array([[1363,    7],
       [  16,  114]])

In [14]:
# Decision Tree
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

In [15]:
accuracy_dt = accuracy_score(y_test, decision_tree.predict(X_test))
accuracy_dt

0.9946666666666667

In [16]:
conf_matrix_dt = confusion_matrix(y_test, decision_tree.predict(X_test))
conf_matrix_dt

array([[1368,    2],
       [   6,  124]])

In [17]:
# Random Forest
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

In [18]:
accuracy_rf = accuracy_score(y_test, random_forest.predict(X_test))
accuracy_rf

0.9953333333333333

In [19]:
conf_matrix_rf = confusion_matrix(y_test, random_forest.predict(X_test))
conf_matrix_rf

array([[1369,    1],
       [   6,  124]])

In [20]:
# Linear Discriminant Analysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)

In [21]:
accuracy_lda = accuracy_score(y_test, lda.predict(X_test))
accuracy_lda

0.932

In [22]:
conf_matrix_lda = confusion_matrix(y_test, lda.predict(X_test))
conf_matrix_lda

array([[1356,   14],
       [  88,   42]])

In [23]:
svm_gaussian = SVC(kernel='rbf')
svm_gaussian.fit(X_train, y_train)

In [24]:
accuracy_gsvm = accuracy_score(y_test, svm_gaussian.predict(X_test))
accuracy_gsvm

0.9806666666666667

In [25]:
conf_matrix_gsvm = confusion_matrix(y_test, svm_gaussian.predict(X_test))
conf_matrix_gsvm

array([[1368,    2],
       [  27,  103]])

In [26]:
svm_poly = SVC(kernel='poly', degree=2)
svm_poly.fit(X_train, y_train)

In [27]:
accuracy_psvm = accuracy_score(y_test, svm_poly.predict(X_test))
accuracy_psvm

0.9546666666666667

In [28]:
conf_matrix_psvm = confusion_matrix(y_test, svm_poly.predict(X_test))
conf_matrix_psvm

array([[1365,    5],
       [  63,   67]])

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score

models = [log_reg, knn, decision_tree, random_forest, svm_linear, svm_gaussian, svm_poly, naive_bayes, lda]
model_names = ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'SVM Linear', 'SVM Gaussian', 'SVM Poly', 'Naive Bayes', 'LDA']

metrics_df = pd.DataFrame(columns=['Model', 'Accuracy(%)', 'TP', 'FP', 'TN', 'FN', 'TPR(%)', 'TNR(%)'])

for model, name in zip(models, model_names):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions) * 100
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    tpr = (tp / (tp + fn)) * 100
    tnr = (tn / (tn + fp)) * 100
    metrics_df = metrics_df.append({
        'Model': name, 
        'Accuracy(%)': f"{accuracy:.3f}".rstrip('0').rstrip('.'), 
        'TP': tp, 
        'FP': fp, 
        'TN': tn, 
        'FN': fn, 
        'TPR(%)': f"{tpr:.3f}".rstrip('0').rstrip('.'), 
        'TNR(%)': f"{tnr:.3f}".rstrip('0').rstrip('.')
    }, ignore_index=True)
def highlight_max_min(s):
    is_max = s == s.max()
    is_min = s == s.min()
    return ['background-color: red' if v else 'background-color: green' if m else '' for v, m in zip(is_max, is_min)]

styled_df = metrics_df.style.apply(highlight_max_min, subset=['Accuracy(%)', 'TP', 'FP', 'TN', 'FN', 'TPR(%)', 'TNR(%)'])
styled_df



  metrics_df = metrics_df.append({
  metrics_df = metrics_df.append({
  metrics_df = metrics_df.append({
  metrics_df = metrics_df.append({
  metrics_df = metrics_df.append({
  metrics_df = metrics_df.append({
  metrics_df = metrics_df.append({
  metrics_df = metrics_df.append({
  metrics_df = metrics_df.append({


Unnamed: 0,Model,Accuracy(%),TP,FP,TN,FN,TPR(%),TNR(%)
0,Logistic Regression,94.2,48,5,1365,82,36.923,99.635
1,KNN,98.467,114,7,1363,16,87.692,99.489
2,Decision Tree,99.467,124,2,1368,6,95.385,99.854
3,Random Forest,99.533,124,1,1369,6,95.385,99.927
4,SVM Linear,94.467,52,5,1365,78,40.0,99.635
5,SVM Gaussian,98.067,103,2,1368,27,79.231,99.854
6,SVM Poly,95.467,67,5,1365,63,51.538,99.635
7,Naive Bayes,94.8,64,12,1358,66,49.231,99.124
8,LDA,93.2,42,14,1356,88,32.308,98.978
