In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EllipticEnvelope

# Load data
data = pd.read_csv('creditcard.csv')

# Standardize 'Amount' and 'Time'
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

outlier_detector = EllipticEnvelope(contamination=0.01)
outliers = outlier_detector.fit_predict(X)
X_no_outliers = X[outliers == 1]
y_no_outliers = y[outliers == 1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_no_outliers, y_no_outliers, test_size=0.2, random_state=42)

# Train a Decision Tree Classifier with specified parameters
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1')

clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Decision Tree Classifier without Sampling (after removing outliers):")
print("Cross-Validation F1 Scores:", cv_scores)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred)
print("\nF1 Score:", f1)



Decision Tree Classifier without Sampling (after removing outliers):
Cross-Validation F1 Scores: [0.81188119 0.84       0.72340426 0.72527473 0.74      ]
Confusion Matrix:
 [[56319     6]
 [   20    47]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56325
           1       0.89      0.70      0.78        67

    accuracy                           1.00     56392
   macro avg       0.94      0.85      0.89     56392
weighted avg       1.00      1.00      1.00     56392


F1 Score: 0.7833333333333334


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('creditcard.csv')

# Standardize 'Amount' and 'Time'
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Use Isolation Forest to identify and remove outliers
outlier_detector = IsolationForest(contamination=0.01, random_state=42)
outliers = outlier_detector.fit_predict(X)
X_no_outliers = X[outliers == 1]
y_no_outliers = y[outliers == 1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_no_outliers, y_no_outliers, test_size=0.2, random_state=42)

# Train a Decision Tree Classifier with specified parameters
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Decision Tree Classifier without Sampling (after removing outliers with Isolation Forest):")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred)
print("\nF1 Score:", f1)

Decision Tree Classifier without Sampling (after removing outliers with Isolation Forest):
Confusion Matrix:
 [[56349     4]
 [   16    23]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56353
           1       0.85      0.59      0.70        39

    accuracy                           1.00     56392
   macro avg       0.93      0.79      0.85     56392
weighted avg       1.00      1.00      1.00     56392


F1 Score: 0.6969696969696971


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

# Load data
data = pd.read_csv('creditcard.csv')

# Standardize 'Amount' and 'Time'
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Use Isolation Forest to identify and remove outliers
outlier_detector = IsolationForest(contamination=0.01, random_state=42)
outliers = outlier_detector.fit_predict(X)
X_no_outliers = X[outliers == 1]
y_no_outliers = y[outliers == 1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_no_outliers, y_no_outliers, test_size=0.2, random_state=42)

# Apply random under-sampling to balance the classes
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# Train a Decision Tree Classifier on the resampled data
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
clf.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Decision Tree Classifier with Random Under-sampling and Outlier Removal:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred)
print("\nF1 Score:", f1)

Decision Tree Classifier with Random Under-sampling and Outlier Removal:
Confusion Matrix:
 [[47791  8562]
 [    3    36]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.85      0.92     56353
           1       0.00      0.92      0.01        39

    accuracy                           0.85     56392
   macro avg       0.50      0.89      0.46     56392
weighted avg       1.00      0.85      0.92     56392


F1 Score: 0.008336227856894755


In [4]:
import pandas as pd
df = pd.read_csv('creditcard.csv')
print("Shape of the dataset:", df.shape)



from sklearn.preprocessing import StandardScaler
df['Amount'] = StandardScaler().fit_transform(df['Amount'].values.reshape(-1, 1))
df['Time'] = StandardScaler().fit_transform(df['Time'].values.reshape(-1, 1))

from sklearn.model_selection import train_test_split
X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
classifiers = [LogisticRegression(), DecisionTreeClassifier()]
lr_params = {'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10]}
dt_params = {'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, 7]}
rf_params = {'n_estimators': [100, 300, 500], 'max_depth': [3, 5, 7]}
knn_params = {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']}
param_grids = [lr_params, dt_params, rf_params, knn_params]
for i, classifier in enumerate(classifiers):
    clf = GridSearchCV(classifier, param_grids[i], cv=5)
    clf.fit(X_train, y_train)
    print(classifier.__class__.__name__)
    print(clf.best_params_)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {acc}")
print(f"Precision: {prec}")
print(f"Recall: {rec}")
print(f"F1 Score: {f1}")


Shape of the dataset: (284807, 31)


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Ismat\AppData\Roaming\Python\Python310\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Ismat\AppData\Roaming\Python\Python310\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Ismat\AppData\Roaming\Python\Python310\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\Ismat\AppData\Roaming\Python\Py

LogisticRegression
{'C': 10, 'penalty': 'l2'}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.88      0.63      0.74       136

    accuracy                           1.00     85443
   macro avg       0.94      0.82      0.87     85443
weighted avg       1.00      1.00      1.00     85443

DecisionTreeClassifier
{'criterion': 'entropy', 'max_depth': 5}
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85307
           1       0.91      0.81      0.86       136

    accuracy                           1.00     85443
   macro avg       0.95      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443

Accuracy: 0.9995669627705019
Precision: 0.9090909090909091
Recall: 0.8088235294117647
F1 Score: 0.8560311284046692


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
data = pd.read_csv('creditcard.csv')
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))
X = data.drop('Class', axis=1)
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
base_classifier = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
bagging_classifier = BaggingClassifier(base_classifier, n_estimators=10, random_state=42)
bagging_classifier.fit(X_train, y_train)
y_pred = bagging_classifier.predict(X_test)
print("BaggingClassifier with Decision Tree base estimator:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred)
print("\nF1 Score:", f1)

BaggingClassifier with Decision Tree base estimator:
Confusion Matrix:
 [[56857     7]
 [   19    79]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.92      0.81      0.86        98

    accuracy                           1.00     56962
   macro avg       0.96      0.90      0.93     56962
weighted avg       1.00      1.00      1.00     56962


F1 Score: 0.858695652173913


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('creditcard.csv')

# Standardize 'Amount' and 'Time'
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a base Decision Tree Classifier
DecisionTreeClassifier = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
# Fit the BaggingClassifier on the training data
DecisionTreeClassifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = DecisionTreeClassifier.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred)
print("\nF1 Score:", f1)

BaggingClassifier with Decision Tree base estimator:
Confusion Matrix:
 [[56860     4]
 [   31    67]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.94      0.68      0.79        98

    accuracy                           1.00     56962
   macro avg       0.97      0.84      0.90     56962
weighted avg       1.00      1.00      1.00     56962


F1 Score: 0.7928994082840236
