In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Load data
data = pd.read_csv('creditcard.csv')

data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Under-sampling with Logistic Regression
under_sampling_pipeline = Pipeline([
    ('under_sampler', RandomUnderSampler(sampling_strategy='auto', random_state=42)),
    ('classifier', LogisticRegression(random_state=42,C= 10, penalty= 'l2'))
])

under_sampling_pipeline.fit(X_train, y_train)
y_pred_under = under_sampling_pipeline.predict(X_test)

# Evaluate the model
print("Random Under-sampling with Logistic Regression:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_under))
print("\nClassification Report:\n", classification_report(y_test, y_pred_under))

# Random Over-sampling with Logistic Regression
over_sampling_pipeline = Pipeline([
    ('over_sampler', RandomOverSampler(sampling_strategy='auto', random_state=42)),
    ('classifier', LogisticRegression(random_state=42,C= 10, penalty= 'l2'))
])
over_sampling_pipeline.fit(X_train, y_train)
y_pred_over = over_sampling_pipeline.predict(X_test)
# Evaluate the model
print("\nRandom Over-sampling with Logistic Regression:")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_over))
print("\nClassification Report:\n", classification_report(y_test, y_pred_over))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Under-sampling with Logistic Regression:
Confusion Matrix:
 [[54253  2611]
 [    7    91]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.98     56864
           1       0.03      0.93      0.07        98

    accuracy                           0.95     56962
   macro avg       0.52      0.94      0.52     56962
weighted avg       1.00      0.95      0.97     56962


Random Over-sampling with Logistic Regression:
Confusion Matrix:
 [[55515  1349]
 [    8    90]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     56864
           1       0.06      0.92      0.12        98

    accuracy                           0.98     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.98      0.99     56962



In [8]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix,f1_score

# Load data
data = pd.read_csv('creditcard.csv')

data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Decision Tree Classifier with specified parameters
clf = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1')

clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Decision Tree Classifier without Sampling:")
print("Cross-Validation F1 Scores:", cv_scores)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred)
print("\nF1 Score:", f1)

Decision Tree Classifier without Sampling:
Cross-Validation F1 Scores: [0.81818182 0.7972028  0.79710145 0.83783784 0.8496732 ]
Confusion Matrix:
 [[56860     4]
 [   31    67]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.94      0.68      0.79        98

    accuracy                           1.00     56962
   macro avg       0.97      0.84      0.90     56962
weighted avg       1.00      1.00      1.00     56962


F1 Score: 0.7928994082840236


In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('creditcard.csv')

# Standardize 'Amount' and 'Time'
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier with specified parameters
clf = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1')

clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print("Random Forest Classifier:")
print("Cross-Validation F1 Scores:", cv_scores)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred)
print("\nF1 Score:", f1)

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import IsolationForest

# Load data
data = pd.read_csv('creditcard.csv')

# Standardize 'Amount' and 'Time'
data['Amount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data['Time'] = StandardScaler().fit_transform(data['Time'].values.reshape(-1, 1))

# Separate features and target variable
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Isolation Forest
clf = IsolationForest(contamination=0.01, random_state=42)
clf.fit(X_train)

# Predict outliers on the test set
y_pred = clf.predict(X_test)

# Convert predictions to 0 (normal) and 1 (anomaly)
y_pred[y_pred == 1] = 0  # Inliers are labeled as 1
y_pred[y_pred == -1] = 1  # Outliers are labeled as -1

# Evaluate the model
print("Isolation Forest Anomaly Detection:")
print("F1 Score:", f1_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Isolation Forest Anomaly Detection:
F1 Score: 0.1756373937677054

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     56864
           1       0.10      0.63      0.18        98

    accuracy                           0.99     56962
   macro avg       0.55      0.81      0.59     56962
weighted avg       1.00      0.99      0.99     56962

