# import 

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Loading Data

In [8]:
credit_card_data=pd.read_csv(r"creditcard.csv")
'''print(credit_card_data.info())'''
print(credit_card_data.head())
print(credit_card_data.tail())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

# Data Preprocessing and Analysis

#CHECKING THE MISSING VALUES IN THE EACH COLUMN

In [9]:
print(credit_card_data.isnull().sum())

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


#DISTRIBUTION OF LEGIT(AUTHORIZED) TRANSACTIONS AND FRADULENT TRANSACTIONS

In [10]:
''' 0 represents the normal transactions and 1 represents the fraudlent transactions'''
print(credit_card_data['Class'].value_counts())

Class
0    284315
1       492
Name: count, dtype: int64


#SEPARATING THE DATA FOR ANALYSIS

In [11]:
legit=credit_card_data[credit_card_data.Class==0]
fraudlent=credit_card_data[credit_card_data.Class==1]
'''print(legit)'''
'''print(fraudlent)'''
print("Shape of legit is :",legit.shape)
print("Shape of fraudlent is :",fraudlent.shape)

Shape of legit is : (284315, 31)
Shape of fraudlent is : (492, 31)


#STATISTICAL MEASURES OF THE DATA LEGIT

In [12]:
print("Legit Statistical Measures:")
print(legit.Amount.describe())
print("Fraudlent Statistical Measures:")
print(fraudlent.Amount.describe())

Legit Statistical Measures:
count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64
Fraudlent Statistical Measures:
count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64


COMPARE THE VALUES FOR BOTH TRANSACTIONS

In [13]:
print(credit_card_data.groupby('Class').mean())

               Time        V1        V2        V3        V4        V5  \
Class                                                                   
0      94838.202258  0.008258 -0.006271  0.012171 -0.007860  0.005453   
1      80746.806911 -4.771948  3.623778 -7.033281  4.542029 -3.151225   

             V6        V7        V8        V9  ...       V20       V21  \
Class                                          ...                       
0      0.002419  0.009637 -0.000987  0.004467  ... -0.000644 -0.001235   
1     -1.397737 -5.568731  0.570636 -2.581123  ...  0.372319  0.713588   

            V22       V23       V24       V25       V26       V27       V28  \
Class                                                                         
0     -0.000024  0.000070  0.000182 -0.000072 -0.000089 -0.000295 -0.000131   
1      0.014049 -0.040308 -0.105130  0.041449  0.051648  0.170575  0.075667   

           Amount  
Class              
0       88.291022  
1      122.211321  

[2 rows x 30

UNDER SAMPLING

In [14]:
#Build the similar datasets of legit and fraudlent
#number of features in legit_sample-492
legit_sample=legit.sample(n=492)

#CONCATENATING TWO DATA FRAMES
new_dataset = pd.concat([legit_sample, fraudlent], axis=0)
print(new_dataset.head())

new_dataset['Class'].value_counts()
new_dataset.groupby('Class').mean()

            Time        V1        V2        V3        V4        V5        V6  \
101598   67889.0 -0.368669  1.095595  1.270223  0.071694 -0.019542 -0.946124   
251429  155344.0 -0.820185  0.765186  0.568137 -0.521165  0.690793 -0.124059   
73800    55285.0 -0.614611 -0.713327  1.756478 -2.087375 -1.096341  0.286983   
3896      3469.0 -0.054135  0.851364  0.372174  0.627681 -0.302748 -0.380554   
52306    45379.0 -0.358544  0.563827  1.203042 -1.469831  0.008371 -0.398790   

              V7        V8        V9  ...       V21       V22       V23  \
101598  0.792314 -0.087997 -0.424339  ... -0.251287 -0.678663  0.027646   
251429  1.155357  0.081171 -0.408325  ... -0.141639 -0.522758 -0.182508   
73800   0.611129 -0.102421 -0.443961  ... -0.449004 -0.717299  0.228392   
3896    0.063992  0.386638  0.017641  ... -0.049529 -0.060215  0.177210   
52306   0.343772  0.211678  0.064718  ... -0.113997 -0.367173 -0.110065   

             V24       V25       V26       V27       V28  Amount  Cl

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,98186.963415,0.282766,0.019957,-0.014089,0.096275,-0.073843,-0.028649,0.044029,0.069345,-0.029788,...,-0.005592,0.010877,0.064902,-0.016901,-0.067996,-0.034176,0.008711,0.001686,0.007176,89.786199
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


# Splitting Data

#SPLITTING THE DATA INTO FEATURES AND TARGETS

In [15]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']
print(X)
print(Y)

            Time        V1        V2        V3        V4        V5        V6  \
101598   67889.0 -0.368669  1.095595  1.270223  0.071694 -0.019542 -0.946124   
251429  155344.0 -0.820185  0.765186  0.568137 -0.521165  0.690793 -0.124059   
73800    55285.0 -0.614611 -0.713327  1.756478 -2.087375 -1.096341  0.286983   
3896      3469.0 -0.054135  0.851364  0.372174  0.627681 -0.302748 -0.380554   
52306    45379.0 -0.358544  0.563827  1.203042 -1.469831  0.008371 -0.398790   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

#SPLITING THE DATA INTO TRANING AND TESTING

In [16]:

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
print(X.shape, X_train.shape, X_test.shape)

(984, 30) (787, 30) (197, 30)


# Model Building

#LOGISTIC REGRESSION


In [17]:
model1 = LogisticRegression(max_iter=200)
model1.fit(X_train, Y_train)
ypred=model1.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Model Evaluation

In [18]:

#print("Using Logistic Regression :",accuracy_score(Y_test,ypred)*100)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Assuming y_test is the true labels and y_pred is the predicted labels from your model
# Replace Y_test and ypred with your variable names if different

# Accuracy
accuracy = accuracy_score(Y_test, ypred) * 100
print(f"Accuracy: {accuracy:.2f}%")

# Precision
precision = precision_score(Y_test, ypred)
print(f"Precision: {precision:.2f}")

# Recall
recall = recall_score(Y_test, ypred)
print(f"Recall: {recall:.2f}")

# F1 Score
f1 = f1_score(Y_test, ypred)
print(f"F1 Score: {f1:.2f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(Y_test, ypred))
#




Accuracy: 92.89%
Precision: 0.97
Recall: 0.89
F1 Score: 0.93

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.93        99
           1       0.97      0.89      0.93        98

    accuracy                           0.93       197
   macro avg       0.93      0.93      0.93       197
weighted avg       0.93      0.93      0.93       197



In [19]:

# test

In [20]:

#print("Using Logistic Regression :",accuracy_score(Y_test,ypred)*100)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Assuming y_test is the true labels and y_pred is the predicted labels from your model
# Replace Y_test and ypred with your variable names if different

# Accuracy
accuracy = accuracy_score(Y_test, ypred) * 100
print(f"Accuracy: {accuracy:.2f}%")

# Precision
precision = precision_score(Y_test, ypred)
print(f"Precision: {precision:.2f}")

# Recall
recall = recall_score(Y_test, ypred)
print(f"Recall: {recall:.2f}")

# F1 Score
f1 = f1_score(Y_test, ypred)
print(f"F1 Score: {f1:.2f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(Y_test, ypred))
#




Accuracy: 92.89%
Precision: 0.97
Recall: 0.89
F1 Score: 0.93

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.97      0.93        99
           1       0.97      0.89      0.93        98

    accuracy                           0.93       197
   macro avg       0.93      0.93      0.93       197
weighted avg       0.93      0.93      0.93       197



# SMOTE

In [21]:


X = credit_card_data.drop(columns='Class', axis=1)
Y = credit_card_data['Class']

#print(X)
#print(Y)

# SPLITTING THE DATA INTO TRAINING AND TESTING
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)
print(X.shape, X_train.shape, X_test.shape)
from imblearn.over_sampling import SMOTE
# APPLYING OVERSAMPLING WITH SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, Y_train_resampled = smote.fit_resample(X_train, Y_train)

print("Before SMOTE:")
print(Y_train.value_counts())
print("After SMOTE:")
print(pd.Series(Y_train_resampled).value_counts())

# MODEL TRAINING
print("Accuracy Scores:")

# LOGISTIC REGRESSION
model1 = LogisticRegression(max_iter=200)
model1.fit(X_train_resampled, Y_train_resampled)
ypred = model1.predict(X_test)

# ACCURACY AND OTHER METRICS
print("Using Logistic Regression:", accuracy_score(Y_test, ypred) * 100)
print("Classification Report:\n", classification_report(Y_test, ypred))




(284807, 30) (227845, 30) (56962, 30)
Before SMOTE:
Class
0    227451
1       394
Name: count, dtype: int64
After SMOTE:
Class
0    227451
1    227451
Name: count, dtype: int64
Accuracy Scores:
Using Logistic Regression: 98.68157719181208
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     56864
           1       0.11      0.93      0.20        98

    accuracy                           0.99     56962
   macro avg       0.55      0.96      0.59     56962
weighted avg       1.00      0.99      0.99     56962



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# RandomUnderSAmple


In [22]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RANDOM UNDERSAMPLING
rus = RandomUnderSampler(random_state=42)
X_rus, Y_rus = rus.fit_resample(X_train_scaled, Y_train)

print("\n=== Random Undersampling with Scaling ===")

# Logistic Regression
log_reg = LogisticRegression(max_iter=200, random_state=42)
log_reg.fit(X_rus, Y_rus)
log_pred = log_reg.predict(X_test_scaled)
print("\nLogistic Regression")
print(f"Accuracy: {accuracy_score(Y_test, log_pred):.4f}")
print(classification_report(Y_test, log_pred))

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_rus, Y_rus)
rf_pred = rf.predict(X_test_scaled)
print("\nRandom Forest")
print(f"Accuracy: {accuracy_score(Y_test, rf_pred):.4f}")
print(classification_report(Y_test, rf_pred))



=== Random Undersampling with Scaling ===

Logistic Regression
Accuracy: 0.9646
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.95      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.96      0.53     56962
weighted avg       1.00      0.96      0.98     56962


Random Forest
Accuracy: 0.9626
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.93      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.95      0.53     56962
weighted avg       1.00      0.96      0.98     56962



# TomekLinks

In [23]:
from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# TOMEK LINKS
tomek = TomekLinks()
X_tomek, Y_tomek = tomek.fit_resample(X_train, Y_train)

print("\n=== TokenLink with Scaling ===")

# Logistic Regression
log_reg = LogisticRegression(max_iter=200, random_state=42)
log_reg.fit(X_rus, Y_rus)
log_pred = log_reg.predict(X_test_scaled)
print("\nLogistic Regression")
print(f"Accuracy: {accuracy_score(Y_test, log_pred):.4f}")
print(classification_report(Y_test, log_pred))

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_rus, Y_rus)
rf_pred = rf.predict(X_test_scaled)
print("\nRandom Forest")
print(f"Accuracy: {accuracy_score(Y_test, rf_pred):.4f}")
print(classification_report(Y_test, rf_pred))




=== TokenLink with Scaling ===

Logistic Regression
Accuracy: 0.9646
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.95      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.96      0.53     56962
weighted avg       1.00      0.96      0.98     56962


Random Forest
Accuracy: 0.9626
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.93      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.95      0.53     56962
weighted avg       1.00      0.96      0.98     56962



# ClusterCentroids

In [24]:
from imblearn.under_sampling import ClusterCentroids
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# CLUSTER CENTROIDS
cc = ClusterCentroids(random_state=42)
X_cc, Y_cc = cc.fit_resample(X_train, Y_train)

print("\n=== Cluster Centroids ===")

# Logistic Regression
log_reg = LogisticRegression(max_iter=200, random_state=42)
log_reg.fit(X_rus, Y_rus)
log_pred = log_reg.predict(X_test_scaled)
print("\nLogistic Regression")
print(f"Accuracy: {accuracy_score(Y_test, log_pred):.4f}")
print(classification_report(Y_test, log_pred))

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_rus, Y_rus)
rf_pred = rf.predict(X_test_scaled)
print("\nRandom Forest")
print(f"Accuracy: {accuracy_score(Y_test, rf_pred):.4f}")
print(classification_report(Y_test, rf_pred))




=== Cluster Centroids ===

Logistic Regression
Accuracy: 0.9646
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.95      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.96      0.53     56962
weighted avg       1.00      0.96      0.98     56962


Random Forest
Accuracy: 0.9626
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.93      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.95      0.53     56962
weighted avg       1.00      0.96      0.98     56962



# 4 SMOTETomek

In [25]:
from imblearn.combine import SMOTETomek
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)



# SMOTE + TOMEK LINKS
smote_tomek = SMOTETomek(random_state=42)
X_smote_tomek, Y_smote_tomek = smote_tomek.fit_resample(X_train, Y_train)

print("\n=== SMOTE + Tomek Links ===")

# Logistic Regression
log_reg = LogisticRegression(max_iter=200, random_state=42)
log_reg.fit(X_rus, Y_rus)
log_pred = log_reg.predict(X_test_scaled)
print("\nLogistic Regression")
print(f"Accuracy: {accuracy_score(Y_test, log_pred):.4f}")
print(classification_report(Y_test, log_pred))

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_rus, Y_rus)
rf_pred = rf.predict(X_test_scaled)
print("\nRandom Forest")
print(f"Accuracy: {accuracy_score(Y_test, rf_pred):.4f}")
print(classification_report(Y_test, rf_pred))




=== SMOTE + Tomek Links ===

Logistic Regression
Accuracy: 0.9646
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.95      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.96      0.53     56962
weighted avg       1.00      0.96      0.98     56962


Random Forest
Accuracy: 0.9626
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.93      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.95      0.53     56962
weighted avg       1.00      0.96      0.98     56962



# 5 Random Oversampling

In [26]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# RANDOM OVERSAMPLING
ros = RandomOverSampler(random_state=42)
X_ros, Y_ros = ros.fit_resample(X_train, Y_train)

print("\n=== Random Oversampling ===")

# Logistic Regression
log_reg = LogisticRegression(max_iter=200, random_state=42)
log_reg.fit(X_rus, Y_rus)
log_pred = log_reg.predict(X_test_scaled)
print("\nLogistic Regression")
print(f"Accuracy: {accuracy_score(Y_test, log_pred):.4f}")
print(classification_report(Y_test, log_pred))

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_rus, Y_rus)
rf_pred = rf.predict(X_test_scaled)
print("\nRandom Forest")
print(f"Accuracy: {accuracy_score(Y_test, rf_pred):.4f}")
print(classification_report(Y_test, rf_pred))




=== Random Oversampling ===

Logistic Regression
Accuracy: 0.9646
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.95      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.96      0.53     56962
weighted avg       1.00      0.96      0.98     56962


Random Forest
Accuracy: 0.9626
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.93      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.95      0.53     56962
weighted avg       1.00      0.96      0.98     56962



# 6. SMOTE (Oversampling)

In [27]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# SMOTE
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, Y_train)

print("\n=== SMOTE ===")

# Logistic Regression
log_reg = LogisticRegression(max_iter=200, random_state=42)
log_reg.fit(X_rus, Y_rus)
log_pred = log_reg.predict(X_test_scaled)
print("\nLogistic Regression")
print(f"Accuracy: {accuracy_score(Y_test, log_pred):.4f}")
print(classification_report(Y_test, log_pred))

# Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_rus, Y_rus)
rf_pred = rf.predict(X_test_scaled)
print("\nRandom Forest")
print(f"Accuracy: {accuracy_score(Y_test, rf_pred):.4f}")
print(classification_report(Y_test, rf_pred))




=== SMOTE ===

Logistic Regression
Accuracy: 0.9646
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.95      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.96      0.53     56962
weighted avg       1.00      0.96      0.98     56962


Random Forest
Accuracy: 0.9626
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     56864
           1       0.04      0.93      0.08        98

    accuracy                           0.96     56962
   macro avg       0.52      0.95      0.53     56962
weighted avg       1.00      0.96      0.98     56962



# Different models

In [None]:
from imblearn.under_sampling import TomekLinks, RandomUnderSampler, NearMiss, EditedNearestNeighbours
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Split the data (if not already split)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define sampling techniques
sampling_techniques = {
    "Tomek Links": TomekLinks(),
    "Random Under Sampling": RandomUnderSampler(random_state=42),
    "Near Miss": NearMiss(),
    "Edited Nearest Neighbours": EditedNearestNeighbours(),
    "SMOTE (Synthetic Over Sampling)": SMOTE(random_state=42)
}

# Loop through sampling techniques and evaluate models
for name, sampler in sampling_techniques.items():
    print(f"\n=== {name} ===")
    
    # Apply sampling
    X_resampled, Y_resampled = sampler.fit_resample(X_train_scaled, Y_train)
    
    # Logistic Regression
    log_reg = LogisticRegression(max_iter=200, random_state=42)
    log_reg.fit(X_resampled, Y_resampled)
    log_pred = log_reg.predict(X_test_scaled)
    print("\nLogistic Regression")
    print(f"Accuracy: {accuracy_score(Y_test, log_pred):.4f}")
    print(classification_report(Y_test, log_pred))

    # Random Forest
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_resampled, Y_resampled)
    rf_pred = rf.predict(X_test_scaled)
    print("\nRandom Forest")
    print(f"Accuracy: {accuracy_score(Y_test, rf_pred):.4f}")
    print(classification_report(Y_test, rf_pred))

    # XGBoost
    xgb = XGBClassifier(random_state=42, eval_metric="logloss", n_jobs=-1)
    xgb.fit(X_resampled, Y_resampled)
    xgb_pred = xgb.predict(X_test_scaled)
    print("\nXGBoost")
    print(f"Accuracy: {accuracy_score(Y_test, xgb_pred):.4f}")
    print(classification_report(Y_test, xgb_pred))

    # K-Nearest Neighbors (KNN)
    knn = KNeighborsClassifier(n_jobs=-1)
    knn.fit(X_resampled, Y_resampled)
    knn_pred = knn.predict(X_test_scaled)
    print("\nK-Nearest Neighbors")
    print(f"Accuracy: {accuracy_score(Y_test, knn_pred):.4f}")
    print(classification_report(Y_test, knn_pred))

    # Decision Tree
    dt = DecisionTreeClassifier(random_state=42)
    dt.fit(X_resampled, Y_resampled)
    dt_pred = dt.predict(X_test_scaled)
    print("\nDecision Tree")
    print(f"Accuracy: {accuracy_score(Y_test, dt_pred):.4f}")
    print(classification_report(Y_test, dt_pred))



=== Tomek Links ===

Logistic Regression
Accuracy: 0.9991
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.86      0.58      0.70        98

    accuracy                           1.00     56962
   macro avg       0.93      0.79      0.85     56962
weighted avg       1.00      1.00      1.00     56962



Best Choice:
Model: XGBoost
Technique: Tomek Links
Why?:
XGBoost is robust, performs well in imbalanced datasets, and provides a good trade-off between precision and recall.
Tomek Links reduces noise from the dataset by removing ambiguous samples, which improves minority class performance.
Final Metrics (XGBoost with Tomek Links):
Accuracy: 0.9996
Precision (Class 1): 0.96
Recall (Class 1): 0.79
F1-score (Class 1): 0.87

# Choose Best Model

In [None]:

# Apply Tomek Links
tomek = TomekLinks()
X_resampled, Y_resampled = tomek.fit_resample(X_train_scaled, Y_train)

# Train XGBoost
model = XGBClassifier(random_state=42, eval_metric="logloss")
model.fit(X_resampled, Y_resampled)

# Evaluate
predictions = model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(Y_test, predictions))
print(classification_report(Y_test, predictions))



Accuracy: 0.934010152284264
              precision    recall  f1-score   support

           0       0.90      0.98      0.94        99
           1       0.98      0.89      0.93        98

    accuracy                           0.93       197
   macro avg       0.94      0.93      0.93       197
weighted avg       0.94      0.93      0.93       197



# Save the model

In [None]:
import joblib
from imblearn.under_sampling import TomekLinks
# Save and load the model
joblib.dump(model, "xgb_model.pkl")
print("Model saved as 'xgb_model.pkl'.")



Model saved as 'xgb_model.pkl'.
