In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM

data = pd.read_csv('creditcard.csv')

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Isolation Forest
clf_iforest = IsolationForest(contamination=0.07, random_state=42)
y_pred_iforest = clf_iforest.fit_predict(X_test)
y_pred_iforest_binary = [1 if x == 1 else 0 for x in y_pred_iforest]

# Local Outlier Factor (LOF)
clf_lof = LocalOutlierFactor(contamination=0.07)
y_pred_lof = clf_lof.fit_predict(X_test)
y_pred_lof_binary = [1 if x == 1 else 0 for x in y_pred_lof]

# Robust Covariance
clf_covariance = EllipticEnvelope(contamination=0.018,support_fraction=0.8)
y_pred_covariance = clf_covariance.fit_predict(X_test)
y_pred_covariance_binary = [1 if x == 1 else 0 for x in y_pred_covariance]

# One-Class SVM
clf_oneclasssvm = OneClassSVM(nu=0.07)
y_pred_oneclasssvm = clf_oneclasssvm.fit_predict(X_test)
y_pred_oneclasssvm_binary = [1 if x == 1 else 0 for x in y_pred_oneclasssvm]

# Evaluate the models
models = {
    'Isolation Forest': y_pred_iforest_binary,
    'Local Outlier Factor': y_pred_lof_binary,
    'Robust Covariance': y_pred_covariance_binary,
    'One-Class SVM': y_pred_oneclasssvm_binary
}

for model_name, y_pred_model in models.items():
    print(f"\n{model_name} Classification Report:\n", classification_report(y_test, y_pred_model))
    f1 = f1_score(y_test, y_pred_model)
    print(f"{model_name} F1 Score: {f1}")




Isolation Forest Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.07      0.13     56864
           1       0.00      0.12      0.00        98

    accuracy                           0.07     56962
   macro avg       0.49      0.10      0.06     56962
weighted avg       0.98      0.07      0.13     56962

Isolation Forest F1 Score: 0.0004522158577027434

Local Outlier Factor Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.07      0.13     56864
           1       0.00      0.84      0.00        98

    accuracy                           0.07     56962
   macro avg       0.50      0.45      0.07     56962
weighted avg       0.99      0.07      0.13     56962

Local Outlier Factor F1 Score: 0.00309014169430208

Robust Covariance Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.02      0.03     56864
      

In [26]:
import pandas as pd
df = pd.read_csv('creditcard.csv')
print("Shape of the dataset:", df.shape)



from sklearn.preprocessing import StandardScaler
df['Amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))
df['Time'] = StandardScaler().fit_transform(df['Time'].values.reshape(-1, 1))



from sklearn.ensemble import IsolationForest
clf = IsolationForest(n_estimators=100, max_samples='auto', contamination=float(0.01), max_features=1.0, random_state=42)
clf.fit(df)
y_pred = clf.predict(df)
y_pred = y_pred.reshape(-1,1)
print("Number of outliers:", len(df[y_pred == -1]))



from sklearn.neighbors import LocalOutlierFactor
clf = LocalOutlierFactor(n_neighbors=20, contamination=float(0.01))
y_pred = clf.fit_predict(df)
y_pred = y_pred.reshape(-1,1)
print("Number of outliers:", len(df[y_pred == -1]))




from sklearn.svm import OneClassSVM
clf = OneClassSVM(kernel='rbf', gamma=0.001, nu=0.01)
clf.fit(df)
y_pred = clf.predict(df)
y_pred = y_pred.reshape(-1,1)
print("Number of outliers:", len(df[y_pred == -1]))



from sklearn.model_selection import train_test_split
X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)




from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
classifiers = [LogisticRegression(), DecisionTreeClassifier()]
lr_params = {'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]}
dt_params = {'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7]}
rf_params = {'n_estimators': [100, 300, 500], 'max_depth': [3, 5, 7]}
knn_params = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
param_grids = [lr_params, dt_params, rf_params, knn_params]
for i, classifier in enumerate(classifiers):
    clf = GridSearchCV(classifier, param_grids[i], cv=5)
    clf.fit(X_train, y_train)
    print(classifier.__class__.__name__)
    print(clf.best_params_)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {acc}")
print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"F1 Score: {f1}")

Shape of the dataset: (284807, 31)
Number of outliers: 2849
Number of outliers: 2849
Number of outliers: 2847


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Ismat\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ismat\AppData\Roaming\Python\Python310\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Ismat\AppData\Roaming\Python\Python310\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Ismat\AppData\Roaming\Python\Py

LogisticRegression
{'C': 10, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.88      0.63      0.74       136

    accuracy                           1.00     85443
   macro avg       0.94      0.82      0.87     85443
weighted avg       1.00      1.00      1.00     85443

DecisionTreeClassifier
{'criterion': 'entropy', 'max_depth': 5}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.91      0.79      0.85       136

    accuracy                           1.00     85443
   macro avg       0.95      0.90      0.92     85443
weighted avg       1.00      1.00      1.00     85443

Accuracy: 0.9995435553526912
Precision: 0.907563025210084
Recall: 0.7941176470588235
F1 Score: 0.8470588235294116


In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Read data
data = pd.read_csv('creditcard.csv')

# Standardize 'Amount' and 'Time'
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the classes
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train a classifier (e.g., RandomForest) on the resampled data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[56856     8]
 [   17    81]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.91      0.83      0.87        98

    accuracy                           1.00     56962
   macro avg       0.95      0.91      0.93     56962
weighted avg       1.00      1.00      1.00     56962



In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import classification_report, f1_score

# Load your credit card fraud dataset (replace 'your_dataset.csv' with the actual file path)
data = pd.read_csv('creditcard.csv')

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Use LazyPredict to automatically select and evaluate classifiers
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=f1_score)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Display the summary of model performance
print(models)

# Manually inspect the DataFrame to identify the column containing the F1 score
# For example, if the F1 score is in a column named 'F1 Score':
f1_column_name = 'F1 Score'

# Select the best performing model based on F1 score
best_model = models.loc[models[f1_column_name].idxmax(), 'Model']

# Train the best model on the entire dataset
best_model.fit(X, y)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate the model
print("\nBest Model Classification Report:\n",
      classification_report(y_test, y_pred))
print("\nBest Model F1 Score:", f1_score(y_test, y_pred))


 97%|█████████▋| 28/29 [21:01<01:14, 74.17s/it]  

[LightGBM] [Info] Number of positive: 394, number of negative: 227451
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 227845, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.001729 -> initscore=-6.358339
[LightGBM] [Info] Start training from score -6.358339


100%|██████████| 29/29 [21:03<00:00, 43.57s/it]


                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
QuadraticDiscriminantAnalysis      0.98               0.94     0.94      0.99   
BaggingClassifier                  1.00               0.90     0.90      1.00   
DecisionTreeClassifier             1.00               0.90     0.90      1.00   
GaussianNB                         0.98               0.90     0.90      0.99   
XGBClassifier                      1.00               0.89     0.89      1.00   
KNeighborsClassifier               1.00               0.89     0.89      1.00   
RandomForestClassifier             1.00               0.88     0.88      1.00   
NearestCentroid                    1.00               0.88     0.88      1.00   
ExtraTreesClassifier               1.00               0.88     0.88      1.00   
LinearDiscriminantAnalysis         1.00               0.87     0.87      1.00   
ExtraTreeClassifier         

KeyError: 'Model'

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
import pandas as pd
df = pd.read_csv('anomaly detection/creditcard.csv')
print("Shape of the dataset:", df.head())


df['Amount'] = StandardScaler().fit_transform(
    df['Amount'].values.reshape(-1, 1))
df['Time'] = StandardScaler().fit_transform(df['Time'].values.reshape(-1, 1))


clf = IsolationForest(  n_estimators=100, 
                        max_samples='auto', 
                        contamination=float(0.01), 
                        max_features=1.0, random_state=42)
clf.fit(df)
y_pred = clf.predict(df)
y_pred = y_pred.reshape(-1, 1)
print("Number of outliers:", len(df[y_pred == -1]))


In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, f1_score


data = pd.read_csv('anomaly detection/creditcard.csv')

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train an Isolation Forest on the original data
clf = IsolationForest(contamination=0.0001, random_state=42)
clf.fit(X_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Convert predictions to binary labels (1 for inliers, -1 for outliers)
y_pred_binary = [1 if x == 1 else 0 for x in y_pred]

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_binary))
print("\nClassification Report:\n", classification_report(y_test, y_pred_binary))

# Calculate and print the F1 score
f1 = f1_score(y_test, y_pred_binary)
print("\nF1 Score:", f1)


In [None]:
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
# define dataset
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)
# define pipeline
over = RandomOverSampler(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under), ('m', DecisionTreeClassifier())]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score: %.3f' % score)


In [None]:
# example of evaluating a decision tree with random undersampling
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
# define dataset
X, y = make_classification(n_samples=10000, weights=[0.99], flip_y=0)
# define pipeline
steps = [('under', RandomUnderSampler()), ('model', DecisionTreeClassifier())]
pipeline = Pipeline(steps=steps)
# evaluate pipeline
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='f1_micro', cv=cv, n_jobs=-1)
score = mean(scores)
print('F1 Score: %.3f' % score)


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


data = pd.read_csv('creditcard.csv')

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Normalize the features
X_normalized = StandardScaler().fit_transform(X)

# Define the VAE model
latent_dim = 2  # Set the desired latent dimension

encoder_inputs = keras.Input(shape=(X.shape[1],))
x = layers.Dense(128, activation='relu')(encoder_inputs)
x = layers.Dense(64, activation='relu')(x)
z_mean = layers.Dense(latent_dim)(x)
z_log_var = layers.Dense(latent_dim)(x)


def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon


z = layers.Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z])

# Decoder
decoder_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(64, activation='relu')(decoder_inputs)
x = layers.Dense(128, activation='relu')(x)
decoder_outputs = layers.Dense(X.shape[1], activation='sigmoid')(x)

decoder = keras.Model(decoder_inputs, decoder_outputs)

# VAE
outputs = decoder(encoder(encoder_inputs)[2])
vae = keras.Model(encoder_inputs, outputs)

# Loss function


def vae_loss(x, x_decoded_mean):
    xent_loss = tf.keras.losses.binary_crossentropy(x, x_decoded_mean)
    kl_loss = -0.5 * \
        tf.reduce_sum(1 + z_log_var - tf.square(z_mean) -
                      tf.exp(z_log_var), axis=-1)
    return xent_loss + kl_loss


# Compile the VAE model
vae.compile(optimizer='adam', loss=vae_loss)

# Train the VAE model
vae.fit(X_normalized, X_normalized, epochs=10,
        batch_size=32, shuffle=True, validation_split=0.2)

# Encode the input data to get latent representations
encoded_data = encoder.predict(X_normalized)[2]
