In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive; the google.colab import only resolves on a Colab runtime.
# NOTE(review): the CSV is later read from a relative path, not from the mount — confirm the mount is needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the transactions CSV from the current working directory into `data`,
# then display the frame as the cell output.
data = pd.read_csv("creditcard.csv")
data

In [None]:
# Preview the first rows of the frame.
data.head()

In [None]:
# Per-column dtype and non-null count summary.
data.info()

In [None]:
# (number of rows, number of columns)
data.shape

In [None]:
# True if at least one cell anywhere in the frame is missing.
data.isna().to_numpy().any()

In [None]:
# Share of missing values per column, in percent, largest first.
null_counts = data.isnull().sum().sort_values(ascending=False)
percent_missing = null_counts / len(data) * 100
print(percent_missing)

In [None]:
# Number of rows that belong to a duplicated group. keep=False flags every copy,
# so this is larger than the number of rows drop_duplicates() removes.
data.duplicated(keep=False).sum()

In [None]:
# Remove exact duplicate rows, keeping one occurrence of each, and rebind `data` to the result.
data = data.drop_duplicates() 

In [None]:
# Summary statistics of the Time column, first for fraud rows (Class == 1),
# then for non-fraud rows (Class == 0), separated by a blank line.
fraud_time_stats = data.loc[data["Class"] == 1, "Time"].describe()
legit_time_stats = data.loc[data["Class"] == 0, "Time"].describe()
print("Fraud", fraud_time_stats, "", "Not Fraud", legit_time_stats, sep="\n")

In [None]:
# Pie chart of the class balance, followed by the raw counts.
# The hard-coded slice labels follow the order value_counts() returns.
class_counts = data['Class'].value_counts()
pie_ax = class_counts.plot.pie(
    autopct="%.2f%%",
    labels=['Not Fraud', 'Fraud'],
    startangle=90,
    colors=['#C35617', '#FFDEAD'],
)
pie_ax.set_title('The proportion of fraudulent vs non-fraudulent transactions')
plt.show()
print(class_counts)

In [None]:
# Stacked histograms of transaction Time: fraud on top, non-fraud below, sharing the x axis.
bins = 50
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 4))

for axis, class_value, heading in ((ax1, 1, 'Fraud'), (ax2, 0, 'Not Fraud')):
    axis.hist(data.loc[data['Class'] == class_value, 'Time'], bins=bins)
    axis.set_title(heading)

# Axis labels go on the bottom panel only.
ax2.set_xlabel('Time (Sec.)')
ax2.set_ylabel('Number of Transactions')
plt.show()

In [None]:
# Per-feature density comparison of Class = 0 versus Class = 1 rows, one panel per feature.
var = data.columns.values

t0 = data.loc[data['Class'] == 0]
t1 = data.loc[data['Class'] == 1]

sns.set_style('whitegrid')
# Create the grid once. A separate plt.figure() call would leave an empty extra figure.
fig, ax = plt.subplots(8, 4, figsize=(16, 28))
panels = ax.flatten()

# Every column except the last one is plotted.
# NOTE(review): assumes the last column is the Class target — confirm column order.
features = var[0:-1]

for panel, feature in zip(panels, features):
    # `bw` is deprecated in seaborn.kdeplot; `bw_method` is the replacement and takes the same value.
    sns.kdeplot(t0[feature], bw_method=0.5, label="Class = 0", ax=panel)
    sns.kdeplot(t1[feature], bw_method=0.5, label="Class = 1", ax=panel)
    panel.set_xlabel(feature, fontsize=12, labelpad=-4)
    panel.tick_params(axis='both', which='major', labelsize=12)
    # The curves carry labels, so draw the legend that shows them.
    panel.legend()

# Hide grid cells that have no feature to display.
for panel in panels[len(features):]:
    panel.set_visible(False)
plt.show();

In [None]:
#DATA TRANSFORMATION

In [None]:
# Pairwise Pearson correlation of all columns, displayed as a table shaded with the 'Oranges' colormap.
corr_matrix = data.corr(method = "pearson" )
corr_matrix.style.background_gradient(cmap='Oranges')

In [None]:
# Correlation of every column with the Class target, displayed from highest to lowest.
# The sorted result is only displayed; `cor_target` itself stays unsorted.
cor_target = data.corrwith(data["Class"])
cor_target.sort_values(axis = 0, ascending = False)

In [None]:
#outliers treatment

In [None]:
# Split the transactions into two frames by target value:
# rows with Class == 1 (fraudulent) and rows with Class == 0 (non fraudulent).
data_fraud = data.loc[data['Class'] == 1]
data_non_fraud = data.loc[data['Class'] == 0]

In [None]:
# Distribution plot
# Density curves of transaction Time for fraudulent vs non-fraudulent rows.
# sns.distplot is deprecated; kdeplot draws the density-only curve that
# distplot(hist=False) produced.
plt.figure(figsize=(16,9))
ax = sns.kdeplot(data_fraud['Time'], label='fraudulent')
ax = sns.kdeplot(data_non_fraud['Time'], label='non fraudulent')
ax.set(xlabel = 'Seconds elapsed between the transction and the first transction')
plt.legend()
plt.show();

In [None]:
# Filled density curves of transaction Time, one per class, drawn on the same axes.
plt.figure(figsize=(16, 9))
for frame, legend_name, shade in (
    (data_fraud, 'fraudulent', '#6A287E'),
    (data_non_fraud, 'non fraudulent', '#F87217'),
):
    ax = sns.kdeplot(frame['Time'], label=legend_name, fill=True, color=shade)
ax.set(xlabel='Seconds elapsed between the transction and the first transction')
plt.legend()
plt.show()

In [None]:
# Dropping the Time column
# Rebind instead of mutating in place, and tolerate an already-missing column,
# so that re-running this cell does not raise KeyError.
data = data.drop(columns='Time', errors='ignore')

In [None]:
# Distribution plot
# Density curves of transaction Amount for fraudulent vs non-fraudulent rows.
# Both curves must use the Amount column: plotting the non-fraud Time column
# here would mix two unrelated quantities under an "Amount" axis label.
# sns.distplot is deprecated; kdeplot draws the density-only curve that
# distplot(hist=False) produced.
plt.figure(figsize=(16,9))
ax = sns.kdeplot(data_fraud['Amount'], label='fraudulent')
ax = sns.kdeplot(data_non_fraud['Amount'], label='non fraudulent')
ax.set(xlabel='Transction Amount')
plt.legend()
plt.show();

In [None]:
#Model on imbalanced data

In [None]:
#train and test

In [None]:
# Putting feature variables into X
X = data.drop(['Class'], axis=1)

# Putting target variable to y
y = data['Class']
# Import library
from sklearn.model_selection import train_test_split
# Splitting data into train and test set 80:20
# stratify=y keeps the fraud / non-fraud proportion the same in both splits.
# With a rare positive class, an unstratified split can leave the test set
# with a noticeably different (or very small) share of fraud rows.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size = 0.8,
                                                    test_size = 0.2,
                                                    stratify = y,
                                                    random_state = 42)

In [None]:
# Shapes of the training features and training target.
X_train.shape, y_train.shape

In [None]:
# Shapes of the test features and test target.
X_test.shape, y_test.shape

In [None]:
#Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler instance shared by the next cells: fitted on the training Amount, reused on the test Amount.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training Amount column only and overwrite that column with the scaled values.
# Note: re-running this cell scales the already-scaled column again.
X_train['Amount'] = scaler.fit_transform(X_train[['Amount']])

In [None]:
# Preview the training features after scaling Amount.
X_train.head()

In [None]:
# Apply the scaler fitted on the training data to the test Amount column.
# transform() only, no refit, so test-set statistics do not enter the scaling.
X_test['Amount'] = scaler.transform(X_test[['Amount']])
X_test.head()

In [None]:
# Feature column names, reused by the plotting and power-transform cells.
cols = X_train.columns
cols

In [None]:
# Plotting the distribution of the variables (skewness) of all the columns
# One panel per training feature in a 6 x 5 grid; each title shows the column
# name followed by its sample skewness.
plt.figure(figsize=(17,28))
for k, col in enumerate(cols, start=1):
    panel = plt.subplot(6, 5, k)
    # sns.distplot is deprecated; histplot(kde=True, stat="density") is its
    # documented replacement for a histogram with a density overlay.
    sns.histplot(X_train[col], kde=True, stat="density", ax=panel)
    panel.set_title(col+' '+str(X_train[col].skew()))
plt.show()

In [None]:
# Importing PowerTransformer
from sklearn.preprocessing import PowerTransformer
# Instantiate the powertransformer
# Yeo-Johnson is selected via method=; standardize=True is passed so the output is also standardised.
# NOTE(review): copy=False requests in-place computation — confirm this is intended, since the
# result is written back into X_train explicitly on the next statement anyway.
pt = PowerTransformer(method='yeo-johnson', standardize=True, copy=False)
# Fit and transform the PT on training data
# The transformer is fitted on the training features only; the transformed values replace all feature columns.
X_train[cols] = pt.fit_transform(X_train)

In [None]:
# Transform the test set
# Uses the transformer already fitted on the training data (transform only, no refit).
X_test[cols] = pt.transform(X_test)
# Plotting the distribution of the variables (skewness) of all the columns
# One panel per training feature in a 6 x 5 grid; each title shows the column
# name followed by its sample skewness after the power transform.
plt.figure(figsize=(17,28))
for k, col in enumerate(cols, start=1):
    panel = plt.subplot(6, 5, k)
    # sns.distplot is deprecated; histplot(kde=True, stat="density") is its
    # documented replacement for a histogram with a density overlay.
    sns.histplot(X_train[col], kde=True, stat="density", ax=panel)
    panel.set_title(col+' '+str(X_train[col].skew()))
plt.show()

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
def Visualize_confusion_matrix(y_test, y_pred, labels=(0, 1)):
    """Draw the confusion matrix as a heatmap and print the classification report.

    The figure title shows the accuracy of ``y_pred`` against ``y_test``.

    Parameters
    ----------
    y_test : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_test``.
    labels : sequence of two class values, default (0, 1)
        Row/column order of the matrix. Pinning it keeps the matrix 2 x 2 and
        aligned with the two hard-coded tick labels even when one class is
        absent from both ``y_test`` and ``y_pred``.

    Returns
    -------
    None
    """
    tick_names = ['No Credit Card Fraud Dection', 'Credit Card Fraud Dection']

    # Without an explicit label order the matrix shape follows the classes that
    # happen to be present, which can disagree with the fixed tick labels.
    cm = confusion_matrix(y_test, y_pred, labels=list(labels))

    plt.figure(figsize=(7, 5))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Oranges',
                xticklabels=tick_names,
                yticklabels=tick_names)
    plt.title('Accuracy: {0:.4f}'.format(accuracy_score(y_test, y_pred)))
    plt.ylabel('True Values')
    plt.xlabel('Predicted Values')
    plt.show()

    print("\n")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    return

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

def ROC_AUC(Y, Y_prob):
    """Plot the ROC curve for ``Y_prob`` against ``Y`` with the AUC in the legend.

    Parameters
    ----------
    Y : array-like
        True binary labels.
    Y_prob : array-like
        Scores for the positive class, passed unchanged to ``roc_curve`` and
        ``roc_auc_score``.

    Returns
    -------
    None
    """
    # Curve points first, then the scalar area under the curve.
    false_pos_rate, true_pos_rate, _ = roc_curve(Y, Y_prob)
    area = roc_auc_score(Y, Y_prob)

    _, roc_axes = plt.subplots(figsize=(16, 9))
    # Diagonal reference line, then the model's curve.
    roc_axes.plot([0, 1], [0, 1], linestyle='--', label='No Skill')
    roc_axes.plot(false_pos_rate, true_pos_rate, marker='.',
                  label='Model - AUC=%.3f' % (area))
    roc_axes.set_xlabel('False Positive Rate')
    roc_axes.set_ylabel('True Positive Rate')
    roc_axes.legend()
    plt.show(block=False)
    return

In [None]:
#KNN on training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Fit a 5-nearest-neighbour classifier (Minkowski metric, p = 2) on the training split,
# then predict hard class labels for both splits.
KNN_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)
y_train_pred, y_test_pred = (KNN_model.predict(split) for split in (X_train, X_test))
# Test-set accuracy kept under its own name.
acc3 = accuracy_score(y_test, y_test_pred)

In [None]:
# Train Score
print('Recall score: %0.4f'% recall_score(y_train, y_train_pred))
print('Precision score: %0.4f'% precision_score(y_train, y_train_pred))
print('F1-Score: %0.4f'% f1_score(y_train, y_train_pred))
print('Accuracy score: %0.4f'% accuracy_score(y_train, y_train_pred))
print('AUC: %0.4f' % roc_auc_score(y_train, y_train_pred))

In [None]:
Visualize_confusion_matrix(y_train, y_train_pred)

In [None]:
ROC_AUC(y_train, y_train_pred)

In [None]:
#Evaluating the KNN model on the testing set

In [None]:
# Test score
print('Recall score: %0.4f'% recall_score(y_test, y_test_pred))
print('Precision score: %0.4f'% precision_score(y_test, y_test_pred))
print('F1-Score: %0.4f'% f1_score(y_test, y_test_pred))
print('Accuracy score: %0.4f'% accuracy_score(y_test, y_test_pred))
print('AUC: %0.4f' % roc_auc_score(y_test, y_test_pred))

In [None]:
# Test Predictions
Visualize_confusion_matrix(y_test, y_test_pred)

In [None]:
ROC_AUC(y_test, y_test_pred)

In [None]:
from sklearn.metrics import roc_curve, auc

In [None]:
# Column 1 of predict_proba is kept as the score used for the ROC curve.
y_pred_knn = KNN_model.predict_proba(X_test)[:,1]
# ROC curve points on the test split, computed from those scores.
knn_fpr, knn_tpr, threshold = roc_curve(y_test, y_pred_knn)
# Area under that curve.
auc_knn = auc(knn_fpr, knn_tpr)

In [None]:
# ROC curve of the KNN model on the test split, with the chance diagonal for reference.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8), dpi=100)
roc_ax.plot([0, 1], [0, 1], 'k--')
# KNN
roc_ax.plot(knn_fpr, knn_tpr, label='KNN (auc = %0.4f)' % auc_knn)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')

roc_ax.legend(loc='best')
plt.show()