## Credit Card Fraud Case Study

### Importing basic modules

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_halving_search_cv
from sklearn.metrics import accuracy_score, mean_absolute_error ,mean_squared_error, confusion_matrix, median_absolute_error,classification_report, f1_score,recall_score,precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
# plot_roc_curve was removed from scikit-learn in 1.2 (RocCurveDisplay
# replaces it). Guard the import so this cell still runs on current releases;
# the name is bound to None when the function is unavailable.
try:
    from sklearn.metrics import plot_roc_curve
except ImportError:
    plot_roc_curve = None
from sklearn.model_selection import HalvingRandomSearchCV,RandomizedSearchCV

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings(action='ignore')

# Single seed shared by the splits, samplers and models below.
seed = 39

# Read the train and test CSV files into separate frames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/fraud-detection/fraudTrain.csv',
        '../input/fraud-detection/fraudTest.csv',
    )
)
test.head()

### Importing Data

In [None]:
# Shapes of the two frames (test first, then train).
print(test.shape)
print(train.shape)

# Missing-value counts per column.
print(test.isnull().sum())
print(train.isnull().sum())

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None None" line.
test.info()
train.info()

### Data Cleaning 

Converting the `dob` and `trans_date_trans_time` columns in both the test and train sets to the datetime data type, and creating a new `trans_date` column:

In [None]:
# Parse the timestamp and date-of-birth columns and derive a date-only
# `trans_date` column, applying the same steps to both frames.
for frame in (train, test):
    frame['trans_date_trans_time'] = pd.to_datetime(frame['trans_date_trans_time'])
    # normalize() truncates each timestamp to midnight: same values as the
    # strftime('%Y-%m-%d') -> to_datetime round-trip, without string parsing.
    frame['trans_date'] = frame['trans_date_trans_time'].dt.normalize()
    frame['dob'] = pd.to_datetime(frame['dob'])

# Drop the unnamed column read from the CSV. errors="ignore" keeps the cell
# re-runnable (the in-place drop raised KeyError on a second run).
train = train.drop(columns="Unnamed: 0", errors="ignore")
test = test.drop(columns="Unnamed: 0", errors="ignore")
train.head()

Removing the unnamed column (`Unnamed: 0`).

### EDA, Feature Engineering

### Categorical Variable Analysis

In [None]:
# Stack test and train into one frame. ignore_index=True gives every row a
# unique label (plain concat keeps both frames' overlapping row labels).
total = pd.concat([test, train], ignore_index=True)
# info() prints its own report and returns None, so it is not wrapped in print().
total.info()

# String version of the target ("T" for is_fraud == 1, otherwise "F"), used to
# filter fraud rows in the categorical plots below. The column is already of
# object dtype, so no further cast is needed.
total["is_fraud_cat"] = total["is_fraud"].apply(lambda flag: "T" if flag == 1 else "F")

# Object-typed (categorical / text) columns.
totalcat = total.select_dtypes(include=['object'])

# Preview only the first rows instead of dumping the whole frame.
total[totalcat.columns].head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Count plot of `category` restricted to rows flagged as fraud.
# The Series is passed as `x=`: seaborn's first positional parameter is `data`
# in recent releases, so a positional Series is no longer read as the x variable.
fraud_categories = total.loc[total['is_fraud_cat'] == "T", 'category']
sns.countplot(x=fraud_categories)

# Rotate the x-axis labels so the category names do not overlap.
plt.xticks(rotation=90)

# Save before show(): show() may leave an empty current figure behind.
plt.savefig('fraud_by_category.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# fig, ax = plt.subplots(figsize=(80,60))
# plt.rcParams.update({'font.size': 60})
# sns.countplot(total[total['is_fraud_cat']=="T"].state)
# plt.xticks(rotation=45)
# for p, label in zip(ax.patches, total["state"].value_counts(sort=True,ascending=False).head(10)):
#     ax.annotate(label, (p.get_x(), p.get_height()+0.25))
# plt.title("Number of Credit Card Frauds by State")

# plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rows flagged as fraud.
fraud_data = total.loc[total['is_fraud_cat'].eq("T")]

# The ten states with the most fraud rows, most frequent first.
state_counts = fraud_data['state'].value_counts()
top_states = state_counts.index[:10]

# Fraud rows belonging to those ten states only.
top_fraud_data = fraud_data.loc[fraud_data['state'].isin(top_states)]

# Large canvas with a matching large font.
fig, ax = plt.subplots(figsize=(80, 60))
plt.rcParams.update({'font.size': 60})

# Bars are ordered by descending fraud count via `order`.
sns.countplot(data=top_fraud_data, x='state', order=top_states)

plt.xticks(rotation=45)

# Write each bar's height just above its top edge, horizontally centred.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height:.1f}',
        (bar_centre, bar_height),
        ha='center',
        va='center',
        xytext=(0, 9),
        textcoords='offset points',
    )

plt.title("Number of Credit Card Frauds by State")
plt.savefig('fraud_by_state.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
import random
def randomcolor():
    """Return a random colour as an ``[r, g, b]`` list of floats in [0, 1).

    The three values are drawn from ``random.random()`` in the order
    red, blue, green and returned in red, green, blue order.
    """
    red, blue, green = (random.random() for _ in range(3))
    return [red, green, blue]
plt.rcParams.update({'font.size': 20})

# Bar chart of the ten cities with the most fraud rows.
fraud_cities = total.loc[total['is_fraud_cat'] == "T", "city"]
top_city_counts = fraud_cities.value_counts(sort=True, ascending=False).head(10)
top_city_counts.plot(kind="bar", color=randomcolor())

plt.title("Number of Credit Card Frauds by City")
plt.savefig('fraud_by_city.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Bar chart of the ten jobs with the most fraud rows.
fraud_jobs = total.loc[total['is_fraud_cat'] == "T", "job"]
top_job_counts = fraud_jobs.value_counts(sort=True, ascending=False).head(10)
top_job_counts.plot(kind="bar", color=randomcolor())

plt.title("Number of Credit Card Frauds by Job")
plt.savefig('fraud_by_job.pdf', bbox_inches='tight', dpi=300)
plt.show()

### Numerical Variable Analysis
Checking the spread and skewness of the numerical variables.

In [None]:
from scipy.stats import norm, skew

# Remove the helper column used by the categorical plots. drop(..., errors=
# 'ignore') keeps the cell re-runnable; `del` raised KeyError on a second run.
total = total.drop(columns='is_fraud_cat', errors='ignore')

# Numeric column names are taken from `test` and then looked up in `total`.
testnum = test.select_dtypes(include=np.number)

# Missing values per column. Printed explicitly: as a bare mid-cell expression
# the result was computed but never displayed.
print(total.isnull().sum())

# Dtypes and non-null counts of the numeric columns.
total[testnum.columns].info()

In [None]:
plt.rcParams.update({'font.size': 10})

# Distribution of `amt` with a fitted normal curve; the sample skewness goes
# into the title.
amt_values = total['amt']
skewness = str(skew(amt_values))
sns.distplot(amt_values, fit=norm, color=randomcolor())
plt.title(f"Skewness of amt = {skewness}")
plt.savefig('Skewness_of_amt.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Distribution of `city_pop` with a fitted normal curve; the sample skewness
# goes into the title.
population = total['city_pop']
skewness = str(skew(population))
sns.distplot(population, fit=norm, color=randomcolor())
plt.title(f"Skewness of population = {skewness}")
plt.savefig('Skewness_of_pop.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Distribution of `cc_num` with a fitted normal curve.
# The skewness in the title is now computed from `cc_num` itself; it was
# previously taken from `city_pop`, so the title reported the wrong column.
skewness = str(skew(total['cc_num']))
sns.distplot(total['cc_num'], fit=norm, color=randomcolor())
plt.title("Skewness of cc_num" + " = " + skewness)
plt.savefig('Skewness_of_cc_num.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Distribution of the 0/1 target with a fitted normal curve overlaid.
target_values = total['is_fraud']
sns.distplot(target_values, fit=norm, color=randomcolor())
plt.title("Distribution of is_fraud")
plt.savefig('dis_is_fraud.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Columns removed before the remaining analysis and modelling.
# errors='ignore' makes the cell idempotent: the in-place drop raised KeyError
# when the cell was executed a second time.
identifier_cols = ['cc_num', 'merchant', 'first', 'last', 'street', 'zip', 'trans_num', 'unix_time']
total = total.drop(columns=identifier_cols, errors='ignore')

In [None]:
# Class balance of the target, with each bar labelled by its count.
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=total, x="is_fraud", color=randomcolor())
for bar in ax.patches:
    bar_count = bar.get_height()
    ax.annotate(f'{bar_count:.1f}', (bar.get_x() + 0.25, bar_count + 0.01))

plt.savefig('bar_label.pdf', bbox_inches='tight', dpi=300)
plt.show()


In [None]:
# Age in whole years at transaction time, stored as float.
# astype('timedelta64[Y]') is rejected by pandas >= 2.0 (only s/ms/us/ns
# resolutions are supported). Instead, divide the day count by the mean
# Gregorian year length (365.2425 days, the year length NumPy's 'Y' unit is
# based on) and floor the result.
age_in_days = (total["trans_date"] - total["dob"]).dt.days
total["age"] = np.floor(age_in_days / 365.2425)
print(total["age"].head())

In [None]:
# Histogram (5 bins) of the age column for fraud rows only.
fraud = total.loc[total["is_fraud"].eq(1)]
fig, ax = plt.subplots()
ax.hist(fraud["age"], bins=5, edgecolor="black", color=randomcolor())
plt.title("Number of Credit Card Frauds by Age Groups")
plt.savefig('Fraud_by_age_group.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
import calendar

# Month number and year of each transaction date.
trans_dates = pd.DatetimeIndex(total['trans_date'])
total['trans_month'] = trans_dates.month
total['trans_year'] = trans_dates.year

# Abbreviated month name, looked up in calendar.month_abbr by month number.
month_abbr = list(calendar.month_abbr)
total['Month_name'] = total['trans_month'].map(lambda month_no: month_abbr[month_no])

In [None]:
# Fraud counts per month name.
# The Series is passed as `x=`: seaborn's first positional parameter is `data`
# in recent releases, so a positional Series is no longer read as the x variable.
fraud_months = total.loc[total["is_fraud"] == 1, "Month_name"]
sns.countplot(x=fraud_months, color=randomcolor())
plt.title("Number of Credit Card Frauds by month")
plt.savefig('Fraud_by_month.pdf', bbox_inches='tight', dpi=300)
plt.show()

# The month-name column is only needed for this plot.
del total['Month_name']

In [None]:
# Fraud counts per gender value.
# The Series is passed as `x=`: seaborn's first positional parameter is `data`
# in recent releases, so a positional Series is no longer read as the x variable.
fraud_genders = total.loc[total["is_fraud"] == 1, "gender"]
sns.countplot(x=fraud_genders, color=randomcolor())
plt.title("Number of Credit Card Frauds by Gender")
plt.savefig('Fraud_by_gender.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Fraud counts per transaction year.
# The Series is passed as `x=`: seaborn's first positional parameter is `data`
# in recent releases, so a positional Series is no longer read as the x variable.
fraud_years = total.loc[total["is_fraud"] == 1, "trans_year"]
sns.countplot(x=fraud_years, color=randomcolor())
plt.title("Number of Credit Card Frauds by year")
plt.savefig('Fraud_by_year.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Absolute difference between merchant and cardholder coordinates, with the
# signed difference rounded to 3 decimals before taking the absolute value.
total['latitudinal_distance'] = (total['merch_lat'] - total['lat']).round(3).abs()
total['longitudinal_distance'] = (total['merch_long'] - total['long']).round(3).abs()

In [None]:
# Histogram (5 bins) of the latitudinal distance for fraud rows only.
fraud = total.loc[total["is_fraud"].eq(1)]
fig, ax = plt.subplots()
ax.hist(fraud["latitudinal_distance"], bins=5, edgecolor="black", color=randomcolor())
plt.title("Number of Credit Card Frauds by latitudinal distance")
plt.savefig('Fraud_by_latitude.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Histogram (5 bins) of the longitudinal distance, using the `fraud` frame
# built in the previous cell.
fig, ax = plt.subplots()
ax.hist(fraud["longitudinal_distance"], bins=5, edgecolor="black", color=randomcolor())
plt.title("Number of Credit Card Frauds by longitudinal distance")
plt.savefig('Fraud_by_longtitude.pdf', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Encode gender as 1 for "M" and 0 for anything else.
total["gender"] = (total["gender"] == "M").astype("int64")

# Raw columns that are not fed to the models (several were already turned into
# engineered features above: age, trans_month/year, the two distances).
drop_cols = ['trans_date_trans_time', 'city', 'lat', 'long', 'job', 'dob',
             'merch_lat', 'merch_long', 'trans_date', 'state']
total = total.drop(columns=drop_cols)

# One-hot encode `category`, dropping the first level.
# dtype=int is explicit because pandas >= 2.0 emits bool dummies by default;
# mixed bool/float columns become an object array in `.values`, which the
# statsmodels GLM and VIF calls further down cannot use.
total = pd.get_dummies(total, columns=['category'], drop_first=True, dtype=int)

# info() prints its own report and returns None, so it is not wrapped in print().
total.info()
total.head()

Dropping final set of variables not useful for model building

### Model Building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
# Separate the rows by target value: 0 (majority) and 1 (minority).
target = total['is_fraud']
df_majority = total.loc[target == 0]
df_minority = total.loc[target == 1]

df_majority.shape, df_minority.shape

# Method 0: No sampling (BASELINE)

In [None]:
# Feature columns are every column except the target.
x_train_ori_col = [col for col in total.columns if col != 'is_fraud']

X_ori = total[x_train_ori_col]
Y_ori = total['is_fraud']
# info() prints its own report and returns None, so it is not wrapped in print().
X_ori.info()

# 70/30 split of the original (imbalanced) data, seeded for repeatability.
# NOTE(review): the split is not stratified; with a rare positive class,
# `stratify=Y_ori` would keep the fraud rate equal in both parts.
X_train_ori, X_test_ori, Y_train_ori, Y_test_ori = train_test_split(
    X_ori, Y_ori, test_size=0.3, random_state=seed)

In [None]:
# Standardise features using statistics learned from the training split only.
sc = StandardScaler()
X_train_ori_std = sc.fit_transform(X_train_ori)
# transform(), not fit_transform(): re-fitting on the test split would scale
# it with its own mean/variance, so train and test would not share one scale.
X_test_ori_std = sc.transform(X_test_ori)

## Finding important features based on the original data distribution

In [None]:
# Logistic regression on the standardised, un-resampled training data; its
# coefficients are read in the next cell.
logit_model_no_sampling = LogisticRegression(
    solver='liblinear',
    random_state=seed,
)
logit_model_no_sampling.fit(X=X_train_ori_std, y=Y_train_ori)

In [None]:
# Pair each feature name with its fitted coefficient, largest first.
feature = pd.DataFrame({
    'column': X_train_ori.columns,
    'importance': logit_model_no_sampling.coef_[0],
})
feature = feature.sort_values('importance', ascending=False)
feature

## Train and Finetuning

In [None]:
# Class balance of the original data, with each bar labelled by its count.
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=total, x="is_fraud", color=randomcolor())
for bar in ax.patches:
    bar_count = bar.get_height()
    ax.annotate(f'{bar_count:.1f}', (bar.get_x() + 0.25, bar_count + 0.01))
plt.show()

`X_train_ori, X_test_ori, Y_train_ori, Y_test_ori`

In [None]:
# Baseline logistic regression on all standardised features (no resampling).
Logit1_ori = LogisticRegression(solver='liblinear', random_state=seed)
Logit1_ori.fit(X_train_ori_std, Y_train_ori)

# Score on the *standardised* matrices: the model was fitted on scaled data,
# so scoring the raw frames (as before) gave meaningless accuracies.
print("Score of the model with X-train and Y-train is : ", str(round(Logit1_ori.score(X_train_ori_std, Y_train_ori)*100, 2)), "%")
print("Score of the model with X-test and Y-test is : ", str(round(Logit1_ori.score(X_test_ori_std, Y_test_ori)*100, 2)), "%")

Y_pred_ori = Logit1_ori.predict(X_test_ori_std)

# Error and classification metrics on the held-out split (F1 and recall are
# weighted by class support).
print(" Mean absolute error is ", mean_absolute_error(Y_test_ori, Y_pred_ori))
print(" Mean squared  error is ", mean_squared_error(Y_test_ori, Y_pred_ori))
print(" Median absolute error is ", median_absolute_error(Y_test_ori, Y_pred_ori))
print("Accuracy is ", round(accuracy_score(Y_test_ori, Y_pred_ori)*100, 2), "%")
print("F1 score: ", round(f1_score(Y_test_ori, Y_pred_ori, average='weighted')*100, 2), "%")
print("Recall:", round(recall_score(Y_test_ori, Y_pred_ori, average='weighted') * 100, 2), "%")

In [None]:
# Keep only the features whose coefficient in `feature` is positive.
positive_cols = list(feature.loc[feature["importance"] > 0, "column"])
X_train_ori_new = X_train_ori[positive_cols]
X_test_ori_new = X_test_ori[positive_cols]

In [None]:
# Binomial GLM on the selected features plus an intercept column; the summary
# table is the cell output.
X_train_ori_sm = sm.add_constant(X_train_ori_new)
logm = sm.GLM(
    endog=Y_train_ori,
    exog=X_train_ori_sm,
    family=sm.families.Binomial(),
)
res = logm.fit()
res.summary()

In [None]:
# Variance inflation factor of every selected feature, rounded to 2 decimals
# and sorted from highest to lowest.
design = X_train_ori_new.values
vif_values = [variance_inflation_factor(design, col_idx) for col_idx in range(design.shape[1])]
vif = pd.DataFrame({'Features': X_train_ori_new.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# No VIF-based column filtering is applied: the selected train/test feature
# frames are carried forward unchanged under the *_vif_adj names.
# (A `VIF <= 5` filter on vif['Features'] is intentionally not used,
# presumably because no column exceeded 5 - verify against the VIF table.)
x_train_ori_vif_adj, x_test_ori_vif_adj = X_train_ori_new, X_test_ori_new

In [None]:
# Standardise the selected features using training-split statistics only.
sc = StandardScaler()
X_train_ori_vif_adj_std = sc.fit_transform(x_train_ori_vif_adj)
# transform(), not fit_transform(): the test split must be scaled with the
# mean/variance learned from the training split.
X_test_ori_vif_adj_std = sc.transform(x_test_ori_vif_adj)

In [None]:
# Logistic regression refitted on the selected, standardised features.
Logit2_ori = LogisticRegression(solver='liblinear', random_state=seed)
Logit2_ori.fit(X_train_ori_vif_adj_std, Y_train_ori)

train_pct = round(Logit2_ori.score(X_train_ori_vif_adj_std, Y_train_ori) * 100, 2)
print("Score of the model with X-train and Y-train is : ", str(train_pct), "%")
test_pct = round(Logit2_ori.score(X_test_ori_vif_adj_std, Y_test_ori) * 100, 2)
print("Score of the model with X-test and Y-test is : ", str(test_pct), "%")

Y_pred_ori = Logit2_ori.predict(X_test_ori_vif_adj_std)

# One tuple per output line; F1 and recall are weighted by class support.
report_rows = [
    (" Mean absolute error is ", mean_absolute_error(Y_test_ori, Y_pred_ori)),
    (" Mean squared  error is ", mean_squared_error(Y_test_ori, Y_pred_ori)),
    (" Median absolute error is ", median_absolute_error(Y_test_ori, Y_pred_ori)),
    ("Accuracy is ", round(accuracy_score(Y_test_ori, Y_pred_ori) * 100, 2), "%"),
    ("F1 score: ", round(f1_score(Y_test_ori, Y_pred_ori, average='weighted') * 100, 2), "%"),
    ("Recall:", round(recall_score(Y_test_ori, Y_pred_ori, average='weighted') * 100, 2), "%"),
]
for row in report_rows:
    print(*row)

In [None]:
# Confusion matrix with label 1 (fraud) listed first, so the flattened order
# is TP, FN, FP, TN.
cm = confusion_matrix(Y_test_ori, Y_pred_ori, labels=[1, 0])
matrix = cm
print('Confusion matrix : \n', matrix)

tp, fn, fp, tn = cm.ravel()
print('Outcome values : \n', tp, fn, fp, tn)

# `matrix` ends the cell holding the text classification report.
matrix = classification_report(Y_test_ori, Y_pred_ori, labels=[1, 0])
print('Classification report : \n', matrix)

In [None]:
# Macro-averaged recall / precision / F1 for the baseline model.
m0_lg_Recall, m0_lg_Precision, m0_lg_f1 = (
    metric(Y_test_ori, Y_pred_ori, average='macro')
    for metric in (recall_score, precision_score, f1_score)
)
m0_lg_accuracy = accuracy_score(Y_test_ori, Y_pred_ori)
m0_lg_mae = mean_absolute_error(Y_test_ori, Y_pred_ori)
m0_lg_mse = mean_squared_error(Y_test_ori, Y_pred_ori)

# One-row summary table, labelled with the sampling strategy in column 0.
m0_ndf = [(m0_lg_Recall, m0_lg_Precision, m0_lg_f1, m0_lg_accuracy, m0_lg_mae, m0_lg_mse)]
score_columns = ['Recall', 'Precision', 'F1 Score', 'Accuracy', 'MAE', 'MSE']
m0_lg_score = pd.DataFrame(m0_ndf, columns=score_columns)
m0_lg_score.insert(loc=0, column='Logistic Regression with', value='No Oversampling')
m0_lg_score

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve from the predicted probability of class 1 on the test split.
fraud_scores = Logit2_ori.predict_proba(X_test_ori_vif_adj_std)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test_ori, fraud_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')

# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')

# Anchor the legend outside the axes on the right, then shrink the axes to
# leave room for it.
plt.legend(loc="lower right", bbox_to_anchor=(1.05, 0.5), borderaxespad=0.)
plt.subplots_adjust(right=0.75)

plt.show()

# Method 1: Correcting the imbalance discovered by using resample

In [None]:
# Upsample the minority (fraud) class with replacement until it has as many
# rows as the majority class. The target size is read from df_majority rather
# than hard-coded (it was the literal 1842743), so it stays correct if the
# input data changes.
df_minority_upsampled = resample(
    df_minority,
    replace=True,                 # sample with replacement
    n_samples=len(df_majority),   # match the majority class size
    random_state=seed,            # reproducible results
)
df_minority_upsampled.shape

In [None]:
# Stack the upsampled minority rows on top of the untouched majority rows.
balanced_parts = [df_minority_upsampled, df_majority]
total_upsampled = pd.concat(balanced_parts)
total_upsampled.shape

In [None]:
# Feature columns are every column except the target.
x_cols = [col for col in total_upsampled.columns if col != 'is_fraud']

X = total_upsampled[x_cols]
Y = total_upsampled['is_fraud']

# 70/30 split of the upsampled data, seeded for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=seed,
)

### Scaling the x variables

In [None]:
# Standardise features using statistics learned from the training split only.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
# transform(), not fit_transform(): the test split must be scaled with the
# mean/variance learned from the training split.
X_test_std = sc.transform(X_test)

### Feature Importances

In [None]:
# Logistic regression on the standardised, upsampled training data; its
# coefficients are read in a later cell.
logit_model = LogisticRegression(
    solver='liblinear',
    random_state=seed,
)
logit_model.fit(X=X_train_std, y=Y_train)

In [None]:
import numpy as np

# Count how many training rows fall into each class of Y_train
# (Y_train comes from the train_test_split above).
labels, frequencies = np.unique(Y_train, return_counts=True)
class_counts = {label: frequency for label, frequency in zip(labels, frequencies)}

print("类别数量:", class_counts)

In [None]:
# Pair each feature name with its fitted coefficient, largest first.
feature = pd.DataFrame({
    'column': X_train.columns,
    'importance': logit_model.coef_[0],
})
feature = feature.sort_values('importance', ascending=False)
feature

## Logistic Regression

In [None]:
# Class balance after upsampling, with each bar labelled by its count.
plt.figure(figsize=(8, 5))
ax = sns.countplot(data=total_upsampled, x="is_fraud", color=randomcolor())
for bar in ax.patches:
    bar_count = bar.get_height()
    ax.annotate(f'{bar_count:.1f}', (bar.get_x() + 0.25, bar_count + 0.01))
plt.show()

In [None]:
# Logistic regression on all standardised features of the upsampled data.
Logit1 = LogisticRegression(solver='liblinear', random_state=seed)
Logit1.fit(X_train_std, Y_train)

# Score on the *standardised* matrices: the model was fitted on scaled data,
# so scoring the raw frames (as before) gave meaningless accuracies.
print("Score of the model with X-train and Y-train is : ", str(round(Logit1.score(X_train_std, Y_train)*100, 2)), "%")
print("Score of the model with X-test and Y-test is : ", str(round(Logit1.score(X_test_std, Y_test)*100, 2)), "%")

Y_pred = Logit1.predict(X_test_std)

# Error and classification metrics on the held-out split (F1 and recall are
# weighted by class support).
print(" Mean absolute error is ", mean_absolute_error(Y_test, Y_pred))
print(" Mean squared  error is ", mean_squared_error(Y_test, Y_pred))
print(" Median absolute error is ", median_absolute_error(Y_test, Y_pred))
print("Accuracy is ", round(accuracy_score(Y_test, Y_pred)*100, 2), "%")
print("F1 score: ", round(f1_score(Y_test, Y_pred, average='weighted')*100, 2), "%")
print("Recall:", round(recall_score(Y_test, Y_pred, average='weighted') * 100, 2), "%")

## Method 1: Fine Tuning

In [None]:
# Keep only the features whose coefficient in `feature` is positive.
positive_cols = list(feature.loc[feature["importance"] > 0, "column"])
X_train_new = X_train[positive_cols]
X_test_new = X_test[positive_cols]

Checking p-values and the variance inflation factor (VIF).

In [None]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Binomial GLM on the selected features plus an intercept column; the summary
# table is the cell output.
X_train_sm = sm.add_constant(X_train_new)
logm = sm.GLM(
    endog=Y_train,
    exog=X_train_sm,
    family=sm.families.Binomial(),
)
res = logm.fit()
res.summary()

In [None]:
# Variance inflation factor of every selected feature, rounded to 2 decimals
# and sorted from highest to lowest.
design = X_train_new.values
vif_values = [variance_inflation_factor(design, col_idx) for col_idx in range(design.shape[1])]
vif = pd.DataFrame({'Features': X_train_new.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

Since all columns have VIF < 5, we continue with all columns.

In [None]:
# No VIF-based column filtering is applied: the selected train/test feature
# frames are carried forward unchanged under the *_vif_adj names (a
# `VIF <= 5` filter on vif['Features'] is intentionally not used).
x_train_vif_adj, x_test_vif_adj = X_train_new, X_test_new


Scaling the new test and train sets

In [None]:
# Standardise the selected features using training-split statistics only.
sc = StandardScaler()
X_train_vif_adj_std = sc.fit_transform(x_train_vif_adj)
# transform(), not fit_transform(): the test split must be scaled with the
# mean/variance learned from the training split.
X_test_vif_adj_std = sc.transform(x_test_vif_adj)

Training a new Logistic Regression Model to reflect the changes-

In [None]:
# Logistic regression refitted on the selected, standardised features.
Logit2 = LogisticRegression(solver='liblinear', random_state=seed)
Logit2.fit(X_train_vif_adj_std, Y_train)

train_pct = round(Logit2.score(X_train_vif_adj_std, Y_train) * 100, 2)
print("Score of the model with X-train and Y-train is : ", str(train_pct), "%")
test_pct = round(Logit2.score(X_test_vif_adj_std, Y_test) * 100, 2)
print("Score of the model with X-test and Y-test is : ", str(test_pct), "%")

Y_pred = Logit2.predict(X_test_vif_adj_std)

# One tuple per output line; F1 and recall are weighted by class support.
report_rows = [
    (" Mean absolute error is ", mean_absolute_error(Y_test, Y_pred)),
    (" Mean squared  error is ", mean_squared_error(Y_test, Y_pred)),
    (" Median absolute error is ", median_absolute_error(Y_test, Y_pred)),
    ("Accuracy is ", round(accuracy_score(Y_test, Y_pred) * 100, 2), "%"),
    ("F1 score: ", round(f1_score(Y_test, Y_pred, average='weighted') * 100, 2), "%"),
    ("Recall:", round(recall_score(Y_test, Y_pred, average='weighted') * 100, 2), "%"),
]
for row in report_rows:
    print(*row)

In [None]:
# Confusion matrix with label 1 (fraud) listed first, so the flattened order
# is TP, FN, FP, TN.
cm = confusion_matrix(Y_test, Y_pred, labels=[1, 0])
matrix = cm
print('Confusion matrix : \n', matrix)

tp, fn, fp, tn = cm.ravel()
print('Outcome values : \n', tp, fn, fp, tn)

# `matrix` ends the cell holding the text classification report.
matrix = classification_report(Y_test, Y_pred, labels=[1, 0])
print('Classification report : \n', matrix)

In [None]:
# Macro-averaged classification metrics plus MAE/MSE for the Method 1 model.
m1_lg_Recall = recall_score(Y_test, Y_pred, average='macro')
m1_lg_Precision = precision_score(Y_test, Y_pred, average='macro')
m1_lg_f1 = f1_score(Y_test, Y_pred, average='macro')
m1_lg_accuracy = accuracy_score(Y_test, Y_pred)
m1_lg_mae = mean_absolute_error(Y_test, Y_pred)
m1_lg_mse = mean_squared_error(Y_test, Y_pred)

m1_ndf = [(m1_lg_Recall, m1_lg_Precision, m1_lg_f1, m1_lg_accuracy, m1_lg_mae, m1_lg_mse)]
m1_lg_score = pd.DataFrame(data=m1_ndf, columns=['Recall', 'Precision', 'F1 Score', 'Accuracy', 'MAE', 'MSE'])
# The label goes in column 0, like the other methods' score tables (it was
# inserted at position 1 here, so the tables would not line up when stacked
# by position).
m1_lg_score.insert(0, 'Logistic Regression with', 'Resampling')
m1_lg_score

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve from the predicted probability of class 1 on the test split.
fraud_scores = Logit2.predict_proba(X_test_vif_adj_std)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, fraud_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')

# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')

# Anchor the legend outside the axes on the right, then shrink the axes to
# leave room for it.
plt.legend(loc="lower right", bbox_to_anchor=(1.05, 0.5), borderaxespad=0.)
plt.subplots_adjust(right=0.75)

plt.show()

In [None]:
# plot_roc_curve(Logit2, X_test_vif_adj_std, Y_test)
# plt.show()

# Method 2: Random Oversampling of the Imbalanced Dataset

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Random over-sampler, seeded so the resampled rows are repeatable.
ros = RandomOverSampler(
    random_state=seed,
)

In [None]:
# Feature matrix / target taken from the full (imbalanced) frame.
x_train_ori_col = [col for col in total.columns if col != 'is_fraud']
X_ori = total[x_train_ori_col]
Y_ori = total['is_fraud']

# Class balance before resampling.
print('BEFORE...')
counts_before = Y_ori.value_counts()
for label_text, cls in (('Genuine:', 0), ('Frauds:', 1)):
    print(label_text, counts_before[cls], '/', round(counts_before[cls]/len(Y_ori) * 100, 2), '% of the dataset')

# Class balance after random oversampling.
# NOTE(review): oversampling before the split lets copies of the same minority
# rows land in both train and test, which inflates the test metrics. Consider
# splitting first and resampling only the training part (and doing the same
# for every method so the comparison stays fair).
print('AFTER...')
X_m2_over, Y_m2_over = ros.fit_resample(X_ori, Y_ori)
counts_after = Y_m2_over.value_counts()
for label_text, cls in (('Genuine:', 0), ('Frauds:', 1)):
    print(label_text, counts_after[cls], '/', round(counts_after[cls]/len(Y_m2_over) * 100, 2), '% of the dataset')

X_train, X_test, Y_train, Y_test = train_test_split(
    X_m2_over, Y_m2_over, test_size=0.3, random_state=seed)

# Standardise with training-split statistics only: transform(), not
# fit_transform(), on the test split so both share one scale.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

### Feature Importance Exploration

In [None]:
# Logistic regression on the standardised, randomly oversampled training
# data; its coefficients are read in the next cell.
logit_m2_model = LogisticRegression(
    solver='liblinear',
    random_state=seed,
)
logit_m2_model.fit(X=X_train_std, y=Y_train)

In [None]:
# Pair each feature name with its Method 2 coefficient, largest first.
# Column names come from the X_train the model was fitted on, not from the
# Method 0 frame (X_train_ori), so the table cannot silently pick up stale
# names if the two feature sets ever diverge.
feature_m2 = pd.DataFrame()
feature_m2['column'] = X_train.columns
feature_m2['importance'] = logit_m2_model.coef_[0]
feature_m2.sort_values('importance', ascending=False, inplace=True)
feature_m2

## Train

In [None]:
# Logistic regression on all standardised features of the randomly
# oversampled data.
Logit1_m2 = LogisticRegression(solver='liblinear', random_state=seed)
Logit1_m2.fit(X_train_std, Y_train)

# Score on the *standardised* matrices: the model was fitted on scaled data,
# so scoring the raw frames (as before) gave meaningless accuracies.
print("Score of the model with X-train and Y-train is : ", str(round(Logit1_m2.score(X_train_std, Y_train)*100, 2)), "%")
print("Score of the model with X-test and Y-test is : ", str(round(Logit1_m2.score(X_test_std, Y_test)*100, 2)), "%")

Y_pred_m2 = Logit1_m2.predict(X_test_std)

# Error and classification metrics on the held-out split (F1 and recall are
# weighted by class support).
print(" Mean absolute error is ", mean_absolute_error(Y_test, Y_pred_m2))
print(" Mean squared  error is ", mean_squared_error(Y_test, Y_pred_m2))
print(" Median absolute error is ", median_absolute_error(Y_test, Y_pred_m2))
print("Accuracy is ", round(accuracy_score(Y_test, Y_pred_m2)*100, 2), "%")
print("F1 score: ", round(f1_score(Y_test, Y_pred_m2, average='weighted')*100, 2), "%")
print("Recall:", round(recall_score(Y_test, Y_pred_m2, average='weighted') * 100, 2), "%")

In [None]:
# Keep only the features whose coefficient in `feature_m2` is positive.
positive_cols = list(feature_m2.loc[feature_m2["importance"] > 0, "column"])
X_train_new = X_train[positive_cols]
X_test_new = X_test[positive_cols]

In [None]:
# Binomial GLM on the selected features plus an intercept column; the summary
# table is the cell output.
X_train_m2_sm = sm.add_constant(X_train_new)
logm = sm.GLM(
    endog=Y_train,
    exog=X_train_m2_sm,
    family=sm.families.Binomial(),
)
res = logm.fit()
res.summary()

In [None]:
# Variance inflation factor of every selected feature, rounded to 2 decimals
# and sorted from highest to lowest.
design = X_train_new.values
vif_values = [variance_inflation_factor(design, col_idx) for col_idx in range(design.shape[1])]
vif = pd.DataFrame({'Features': X_train_new.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# No VIF-based column filtering is applied: the selected train/test feature
# frames are carried forward unchanged under the *_vif_adj names.
# (A `VIF <= 5` filter on vif['Features'] is intentionally not used,
# presumably because no column exceeded 5 - verify against the VIF table.)
x_train_vif_adj, x_test_vif_adj = X_train_new, X_test_new

In [None]:
# Standardise the selected features using training-split statistics only.
sc = StandardScaler()
X_train_vif_adj_std = sc.fit_transform(x_train_vif_adj)
# transform(), not fit_transform(): the test split must be scaled with the
# mean/variance learned from the training split.
X_test_vif_adj_std = sc.transform(x_test_vif_adj)

In [None]:
# Logistic regression refitted on the selected, standardised features
# (random oversampling).
Logit2_m2 = LogisticRegression(solver='liblinear', random_state=seed)
Logit2_m2.fit(X_train_vif_adj_std, Y_train)

print("Score of the model with X-train and Y-train is : ", str(round(Logit2_m2.score(X_train_vif_adj_std, Y_train)*100, 2)), "%")
# The test score is now computed on the test split; it previously re-scored
# the training data, so both lines printed the same number.
print("Score of the model with X-test and Y-test is : ", str(round(Logit2_m2.score(X_test_vif_adj_std, Y_test)*100, 2)), "%")

Y_pred = Logit2_m2.predict(X_test_vif_adj_std)

# Error and classification metrics on the held-out split (F1 and recall are
# weighted by class support).
print(" Mean absolute error is ", mean_absolute_error(Y_test, Y_pred))
print(" Mean squared  error is ", mean_squared_error(Y_test, Y_pred))
print(" Median absolute error is ", median_absolute_error(Y_test, Y_pred))
print("Accuracy is ", round(accuracy_score(Y_test, Y_pred)*100, 2), "%")
print("F1 score: ", round(f1_score(Y_test, Y_pred, average='weighted')*100, 2), "%")
print("Recall:", round(recall_score(Y_test, Y_pred, average='weighted') * 100, 2), "%")

In [None]:
# Confusion matrix with label 1 (fraud) listed first, so the flattened order
# is TP, FN, FP, TN.
cm = confusion_matrix(Y_test, Y_pred, labels=[1, 0])
matrix = cm
print('Confusion matrix : \n', matrix)

tp, fn, fp, tn = cm.ravel()
print('Outcome values : \n', tp, fn, fp, tn)

# `matrix` ends the cell holding the text classification report.
matrix = classification_report(Y_test, Y_pred, labels=[1, 0])
print('Classification report : \n', matrix)

In [None]:
# Macro-averaged recall / precision / F1 for the Method 2 model.
m2_lg_Recall, m2_lg_Precision, m2_lg_f1 = (
    metric(Y_test, Y_pred, average='macro')
    for metric in (recall_score, precision_score, f1_score)
)
m2_lg_accuracy = accuracy_score(Y_test, Y_pred)
m2_lg_mae = mean_absolute_error(Y_test, Y_pred)
m2_lg_mse = mean_squared_error(Y_test, Y_pred)

# One-row summary table, labelled with the sampling strategy in column 0.
m2_ndf = [(m2_lg_Recall, m2_lg_Precision, m2_lg_f1, m2_lg_accuracy, m2_lg_mae, m2_lg_mse)]
score_columns = ['Recall', 'Precision', 'F1 Score', 'Accuracy', 'MAE', 'MSE']
m2_lg_score = pd.DataFrame(m2_ndf, columns=score_columns)
m2_lg_score.insert(loc=0, column='Logistic Regression with', value='Random resampling')
m2_lg_score

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve from the predicted probability of class 1 on the test split.
fraud_scores = Logit2_m2.predict_proba(X_test_vif_adj_std)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, fraud_scores)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')

# Diagonal reference line.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')

# Anchor the legend outside the axes on the right, then shrink the axes to
# leave room for it.
plt.legend(loc="lower right", bbox_to_anchor=(1.05, 0.5), borderaxespad=0.)
plt.subplots_adjust(right=0.75)

plt.show()

In [None]:
# plot_roc_curve(Logit2_m2, X_test_vif_adj_std, Y_test)
# plt.show()

# Method 3: SMOTE (Synthetic Minority Oversampling Technique)

In [None]:
from imblearn.over_sampling import SMOTE
# SMOTE over-sampler, seeded so the synthetic rows are repeatable.
smote = SMOTE(
    random_state=seed,
)

In [None]:
# Feature matrix / target taken from the full (imbalanced) frame.
x_train_ori_col = [col for col in total.columns if col != 'is_fraud']
X_ori = total[x_train_ori_col]
Y_ori = total['is_fraud']

# Class balance before resampling.
print('BEFORE...')
counts_before = Y_ori.value_counts()
for label_text, cls in (('Genuine:', 0), ('Frauds:', 1)):
    print(label_text, counts_before[cls], '/', round(counts_before[cls]/len(Y_ori) * 100, 2), '% of the dataset')

# Class balance after SMOTE.
# NOTE(review): resampling before the split means synthetic minority rows in
# the training part can be derived from rows that end up in the test part,
# which inflates the test metrics. Consider splitting first and resampling
# only the training part (and doing the same for every method so the
# comparison stays fair).
print('AFTER...')
X_m3_over, Y_m3_over = smote.fit_resample(X_ori, Y_ori)
counts_after = Y_m3_over.value_counts()
for label_text, cls in (('Genuine:', 0), ('Frauds:', 1)):
    print(label_text, counts_after[cls], '/', round(counts_after[cls]/len(Y_m3_over) * 100, 2), '% of the dataset')

X_train, X_test, Y_train, Y_test = train_test_split(
    X_m3_over, Y_m3_over, test_size=0.3, random_state=seed)

# Standardise with training-split statistics only: transform(), not
# fit_transform(), on the test split so both share one scale.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)

### Feature Importance Exploration

In [None]:
# Logistic regression on the standardised, SMOTE-resampled training data.
logit_m3_model = LogisticRegression(
    solver='liblinear',
    random_state=seed,
)
logit_m3_model.fit(X=X_train_std, y=Y_train)

In [None]:
# Pair each feature name with its Method 3 (SMOTE) coefficient, largest first.
# Coefficients are read from logit_m3_model; they were previously taken from
# logit_m2_model, so this table just repeated the Method 2 importances.
# Column names come from the X_train the model was fitted on rather than the
# Method 0 frame (X_train_ori).
feature_m3 = pd.DataFrame()
feature_m3['column'] = X_train.columns
feature_m3['importance'] = logit_m3_model.coef_[0]
feature_m3.sort_values('importance', ascending=False, inplace=True)
feature_m3

### Train

In [None]:
# Logistic regression on all standardised features of the SMOTE-resampled data.
Logit1_m3 = LogisticRegression(solver='liblinear', random_state=seed)
Logit1_m3.fit(X_train_std, Y_train)

# Score on the *standardised* matrices: the model was fitted on scaled data,
# so scoring the raw frames (as before) gave meaningless accuracies.
print("Score of the model with X-train and Y-train is : ", str(round(Logit1_m3.score(X_train_std, Y_train)*100, 2)), "%")
print("Score of the model with X-test and Y-test is : ", str(round(Logit1_m3.score(X_test_std, Y_test)*100, 2)), "%")

# Predict with the model fitted in this cell. Predictions were previously
# taken from Logit1_m2 (the Method 2 model), so the metrics below did not
# describe the SMOTE model at all.
Y_pred = Logit1_m3.predict(X_test_std)

# Error and classification metrics on the held-out split (F1 and recall are
# weighted by class support).
print(" Mean absolute error is ", mean_absolute_error(Y_test, Y_pred))
print(" Mean squared  error is ", mean_squared_error(Y_test, Y_pred))
print(" Median absolute error is ", median_absolute_error(Y_test, Y_pred))
print("Accuracy is ", round(accuracy_score(Y_test, Y_pred)*100, 2), "%")
print("F1 score: ", round(f1_score(Y_test, Y_pred, average='weighted')*100, 2), "%")
print("Recall:", round(recall_score(Y_test, Y_pred, average='weighted') * 100, 2), "%")

In [None]:
# Keep only the features whose coefficient in `feature_m3` is positive.
positive_cols = list(feature_m3.loc[feature_m3["importance"] > 0, "column"])
X_train_new = X_train[positive_cols]
X_test_new = X_test[positive_cols]

In [None]:
# Binomial GLM on the selected features plus an intercept column; the summary
# table is the cell output.
X_train_m3_sm = sm.add_constant(X_train_new)
logm = sm.GLM(
    endog=Y_train,
    exog=X_train_m3_sm,
    family=sm.families.Binomial(),
)
res = logm.fit()
res.summary()

In [None]:
# Variance inflation factor of every selected feature, rounded to 2 decimals
# and sorted from highest to lowest.
design = X_train_new.values
vif_values = [variance_inflation_factor(design, col_idx) for col_idx in range(design.shape[1])]
vif = pd.DataFrame({'Features': X_train_new.columns, 'VIF': vif_values})
vif['VIF'] = vif['VIF'].round(2)
vif = vif.sort_values(by="VIF", ascending=False)
vif

In [None]:
# No VIF-based column filtering is applied: the selected train/test feature
# frames are carried forward unchanged under the *_vif_adj names.
# (A `VIF <= 5` filter on vif['Features'] is intentionally not used,
# presumably because no column exceeded 5 - verify against the VIF table.)
x_train_vif_adj, x_test_vif_adj = X_train_new, X_test_new

In [None]:
# Standardize the selected features. The scaler is fitted on the training
# frame only; the test frame is transformed with the training mean/std so both
# splits share one scale and no test-set statistics leak into preprocessing.
sc = StandardScaler()
X_train_vif_adj_std = sc.fit_transform(x_train_vif_adj)
X_test_vif_adj_std = sc.transform(x_test_vif_adj)

In [None]:
# Retrain the logistic regression on the standardized, feature-selected
# training matrix and report train/test accuracy and error metrics.
Logit2_m3 = LogisticRegression(solver='liblinear', random_state=seed)

Logit2_m3.fit(X_train_vif_adj_std, Y_train)

print("Score of the model with X-train and Y-train is : ", str(round(Logit2_m3.score(X_train_vif_adj_std,Y_train)*100,2)),"%")
# The test score must be computed on the test split; scoring the training
# data here would just repeat the line above under a misleading label.
print("Score of the model with X-test and Y-test is : ", str(round(Logit2_m3.score(X_test_vif_adj_std,Y_test)*100,2)),"%")

Y_pred = Logit2_m3.predict(X_test_vif_adj_std)

print( " Mean absolute error is ",( mean_absolute_error(Y_test,Y_pred)))
print(" Mean squared  error is " , mean_squared_error(Y_test,Y_pred))
print(" Median absolute error is " ,median_absolute_error(Y_test,Y_pred)) 
print("Accuracy is " , round(accuracy_score(Y_test,Y_pred)*100,2),"%")
# F1 and recall are support-weighted averages over both classes.
print("F1 score: ", round(f1_score(Y_test, Y_pred, average='weighted')*100,2),"%")
print("Recall:", round(recall_score(Y_test, Y_pred, average='weighted') * 100, 2), "%")

In [None]:
# Confusion matrix with label 1 listed first, so the flattened cells read
# tp, fn, fp, tn in that order.
matrix = confusion_matrix(Y_test, Y_pred, labels=[1, 0])
print('Confusion matrix : \n', matrix)

tp, fn, fp, tn = matrix.ravel()
print('Outcome values : \n', tp, fn, fp, tn)

# `matrix` is rebound to the text report, as in the rest of the notebook.
matrix = classification_report(Y_test, Y_pred, labels=[1, 0])
print('Classification report : \n', matrix)

In [None]:
# Collect the test-set metrics for this model into a one-row summary frame.
# Recall, precision and F1 are macro-averaged over the two classes.
m3_lg_Recall, m3_lg_Precision, m3_lg_f1 = (
    metric(Y_test, Y_pred, average='macro')
    for metric in (recall_score, precision_score, f1_score)
)
m3_lg_accuracy = accuracy_score(Y_test, Y_pred)
m3_lg_mae = mean_absolute_error(Y_test, Y_pred)
m3_lg_mse = mean_squared_error(Y_test, Y_pred)

m3_ndf = [(
    m3_lg_Recall,
    m3_lg_Precision,
    m3_lg_f1,
    m3_lg_accuracy,
    m3_lg_mae,
    m3_lg_mse,
)]
m3_lg_score = pd.DataFrame(
    data=m3_ndf,
    columns=['Recall', 'Precision', 'F1 Score', 'Accuracy', 'MAE', 'MSE'],
)
# Leading label column identifying which variant this row describes.
m3_lg_score.insert(loc=0, column='Logistic Regression with', value='SMOTE')
m3_lg_score

In [None]:
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, auc

# Compute the ROC curve coordinates from the true labels and the scores in
# the second column of predict_proba.
fpr, tpr, thresholds = roc_curve(
    Y_test,
    Logit2_m3.predict_proba(X_test_vif_adj_std)[:, 1],
)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)

# Add the diagonal reference line.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

# Set the axis labels and the figure title.
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')

# Anchor the legend's lower-right corner just past the right edge of the axes.
ax.legend(loc="lower right", bbox_to_anchor=(1.05, 0.5), borderaxespad=0.)

# Shrink the plot area on the right to leave room for the legend.
fig.subplots_adjust(right=0.75)

# Show the figure.
plt.show()

In [None]:
# Stack the per-method score rows into one report and display it ordered by
# recall, best first (the sorted view is shown, not stored).
final_result_report = pd.concat(
    objs=[m0_lg_score, m1_lg_score, m2_lg_score, m3_lg_score],
    ignore_index=True,
    sort=False,
)
final_result_report.sort_values(by='Recall', ascending=False)

In [None]:
# plot_roc_curve(Logit2_m3, X_test_vif_adj_std, Y_test)
# plt.show()

# Method 4: Undersampling using Tomek Links (Future) TOO SLOW

In [None]:
# from imblearn.under_sampling import TomekLinks

# # define the undersampling method
# tomekU = TomekLinks()

In [None]:
# x_train_ori_col = list(total.columns)
# x_train_ori_col.remove('is_fraud')
# x_train_ori_col

# X_ori = total[x_train_ori_col]
# Y_ori = total['is_fraud']
# # print(X_ori.info())

# print('BEFORE...')
# print('Genuine:', Y_ori.value_counts()[0], '/', round(Y_ori.value_counts()[0]/len(Y_ori) * 100,2), '% of the dataset')
# print('Frauds:', Y_ori.value_counts()[1], '/',round(Y_ori.value_counts()[1]/len(Y_ori) * 100,2), '% of the dataset')

# print('AFTER...')
# X_underT, y_underT = tomekU.fit_resample(X_ori, Y_ori)
# print('Genuine:', y_underT.value_counts()[0], '/', round(y_underT.value_counts()[0]/len(y_underT) * 100,2), '% of the dataset')
# print('Frauds:', y_underT.value_counts()[1], '/',round(y_underT.value_counts()[1]/len(y_underT) * 100,2), '% of the dataset')

# Method 5: Combining SMOTE and Tomek Links OR BorderlineSMOTE (Future) TOO SLOW!


In [None]:
# from imblearn.over_sampling import BorderlineSMOTE
# import logging

# # Configure the logger
# logging.basicConfig(level=logging.INFO)

# # Assume X_ori and Y_ori are your original features and target variable
# Borderline_smote = BorderlineSMOTE(random_state=seed)

# from imblearn.combine import SMOTETomek

# # Assume X_ori and Y_ori represent your original features and target variable
# smotetomek = SMOTETomek(random_state=seed)
# X_resampled, Y_resampled = smotetomek.fit_resample(X_ori, Y_ori)



In [None]:
# x_train_ori_col = list(total.columns)
# x_train_ori_col.remove('is_fraud')
# x_train_ori_col

# X_ori = total[x_train_ori_col]
# Y_ori = total['is_fraud']
# # print(X_ori.info())

# print('BEFORE...')
# print('Genuine:', Y_ori.value_counts()[0], '/', round(Y_ori.value_counts()[0]/len(Y_ori) * 100,2), '% of the dataset')
# print('Frauds:', Y_ori.value_counts()[1], '/',round(Y_ori.value_counts()[1]/len(Y_ori) * 100,2), '% of the dataset')

# print('AFTER...')
# X_m5_over, Y_m5_over = Borderline_smote.fit_resample(X_ori, Y_ori)
# print('Genuine:', Y_m5_over.value_counts()[0], '/', round(Y_m5_over.value_counts()[0]/len(Y_m5_over) * 100,2), '% of the dataset')
# print('Frauds:', Y_m5_over.value_counts()[1], '/',round(Y_m5_over.value_counts()[1]/len(Y_m5_over) * 100,2), '% of the dataset')

# X_train, X_test, Y_train, Y_test = train_test_split(
#  X_m5_over, Y_m5_over, test_size=0.3, random_state=seed)
# sc= StandardScaler()

# X_train_std = sc.fit_transform(X_train)
# X_test_std = sc.fit_transform(X_test)

## Decision Tree

In [None]:
# dtc = DecisionTreeClassifier()
# dtc.fit(X_train,Y_train)

### Feature Importances using Decision Tree

In [None]:
# importance = dtc.feature_importances_
# for i,v in enumerate(importance):
#     print(X_train.columns[int(i)],"- ",v)
# plt.bar([x for x in range(len(importance))], importance)
# plt.show()

In [None]:
# print("Score the X-train with Y-train is : ", dtc.score(X_train,Y_train))
# print("Score the X-test  with Y-test  is : ", dtc.score(X_test,Y_test))

# Y_pred=dtc.predict(X_test)

# print( " Mean absolute error is ", mean_absolute_error(Y_test,Y_pred))
# print(" Mean squared  error is " , mean_squared_error(Y_test,Y_pred))
# print(" Median absolute error is " ,median_absolute_error(Y_test,Y_pred)) 
# print("Accuracy score " , accuracy_score(Y_test,Y_pred))
# print("F1 score: ", round(f1_score(Y_test, Y_pred, average='weighted')*100,2),"%")

A perfect score on the training set indicates overfitting, so we apply hyperparameter tuning.
### Hyperparameter Tuning

In [None]:
# #Normal Randomised Search takes too much time to execute on a dataset this large.
# """dtc1 = DecisionTreeClassifier()


# params_dtc = {  
#     "splitter":["best"],
#     'max_depth': [10, 20, 50, 100, 200],
#     'min_samples_leaf': [10, 20, 50, 100, 200],
#     'min_samples_split' : [10, 20, 50, 100, 200],
#     'criterion': ["gini", "entropy"]
# }
# random_search=RandomizedSearchCV(estimator=dtc1,param_distributions = params_dtc, scoring = 'f1',cv=5,n_iter=100)
# random_search.fit(X_train,Y_train)"""

Since the dataset is very large (close to 1.8 million rows originally, and even more after treating the class imbalance),
we will use halving randomized search cross-validation, an experimental variant of randomized search that is much faster than either randomized search or grid search cross-validation.


In [None]:
# dtc1 = DecisionTreeClassifier()

# params_dtc = {
#     'max_depth': [10, 20, 50, 100, 200],
#     'min_samples_leaf': [10, 20, 50, 100, 200],
#     'min_samples_split' : [10, 20, 50, 100, 200],
#     'criterion': ["gini", "entropy"]
# } 

# halving_random_search=HalvingRandomSearchCV(estimator=dtc1,param_distributions = params_dtc,cv=5)
# halving_random_search.fit(X_train,Y_train)
# print(halving_random_search.best_params_)

In [None]:
# print(halving_random_search.best_params_)

In [None]:
# dtc2 = DecisionTreeClassifier(min_samples_split= 100, min_samples_leaf= 20, max_depth= 200, criterion= 'gini')
# dtc2.fit(X_train,Y_train)

In [None]:
# print("Score the X-train with Y-train is : ", dtc2.score(X_train,Y_train))
# print("Score the X-test  with Y-test  is : ", dtc2.score(X_test,Y_test))

# Y_pred=dtc2.predict(X_test)

# print( " Mean absolute error is ", mean_absolute_error(Y_test,Y_pred))
# print(" Mean squared  error is " , mean_squared_error(Y_test,Y_pred))
# print(" Median absolute error is " ,median_absolute_error(Y_test,Y_pred)) 
# print("Accuracy score " , accuracy_score(Y_test,Y_pred))
# print("F1 score: ", round(f1_score(Y_test, Y_pred, average='weighted')*100,2),"%")

In [None]:
# matrix = confusion_matrix(Y_test,Y_pred, labels=[1,0])
# print('Confusion matrix : \n',matrix)


# tp, fn, fp, tn = confusion_matrix(Y_test,Y_pred,labels=[1,0]).reshape(-1)
# print('Outcome values : \n', tp, fn, fp, tn)


# matrix = classification_report(Y_test,Y_pred,labels=[1,0])
# print('Classification report : \n',matrix)

In [None]:
# plot_roc_curve(dtc2, X_test, Y_test)
# plt.show()

Consistent with the confusion matrix, the ROC curve is almost perfect.