In [1]:
# Importing necessary libraries
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis

# Importing train_test_split to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Importing LogisticRegression for logistic regression model
from sklearn.linear_model import LogisticRegression

# Importing accuracy_score to evaluate the model's performance
from sklearn.metrics import accuracy_score


In [2]:
# Loading the dataset into a Pandas DataFrame
credit_card_data = pd.read_csv('dataset/creditcard.csv')  # Reading the credit card dataset from a CSV file


In [5]:
# Displaying the first five rows of the dataset
credit_card_data.head()  # Using the head() function to preview the top five rows of the DataFrame


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
# Displaying the shape of the dataset
credit_card_data.shape  # Using the shape attribute to get the number of rows and columns in the DataFrame


(284807, 31)

In [9]:
# Displaying the dataset information
credit_card_data.info()  # Using the info() method to get a concise summary of the DataFrame, including the data types and non-null values


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [11]:
# Checking for missing values in the dataset
credit_card_data.isnull().sum()  # Using isnull() and sum() to count the number of missing values in each column of the DataFrame


Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [13]:
# Distribution of legitimate transactions and fraudulent transactions
credit_card_data['Class'].value_counts()  # Using value_counts() to count the occurrences of each class (0 for legit, 1 for fraudulent) in the 'Class' column


Class
0    284315
1       492
Name: count, dtype: int64

In [15]:
# This dataset is highly unbalanced
# 0 --> Normal Transaction
# 1 --> Fraudulent Transaction

# Separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]  # Creating a DataFrame with only legitimate transactions
fraud = credit_card_data[credit_card_data.Class == 1]  # Creating a DataFrame with only fraudulent transactions


In [17]:
# Displaying the shape of the legitimate transactions DataFrame
legit.shape  # Using the shape attribute to get the number of rows and columns in the legit DataFrame


(284315, 31)

In [19]:
# Displaying the shape of the fraudulent transactions DataFrame
fraud.shape  # Using the shape attribute to get the number of rows and columns in the fraud DataFrame

# Insight: Number of fraud transactions is much less than legit transactions
# Fraud transactions are approximately 0.17% of legitimate transactions


(492, 31)

In [22]:
# Descriptive statistics for the Amount of legitimate transactions
legit.Amount.describe()  # Using the describe() method to get summary statistics (count, mean, std, min, 25%, 50%, 75%, max) for the 'Amount' column in the legit DataFrame


count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [23]:
# Compare the mean values for both types of transactions
credit_card_data.groupby('Class').mean()  # Using groupby() to group the data by 'Class' and then calculating the mean for each group

# Insight: There is a significant difference between the mean of normal transactions and fraudulent transactions 
# for every PCA component.


Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [25]:
# Under-Sampling
# Build a sample dataset containing a similar distribution of normal transactions and fraudulent transactions
# Number of fraudulent transactions --> 492

# Creating a sample of legitimate transactions with the same number of entries as fraudulent transactions
legit_sample = legit.sample(n=492)  # Using sample() to randomly select 492 entries from the legitimate transactions DataFrame


In [27]:
# Concatenating the two DataFrames
new_dataset = pd.concat([legit_sample, fraud], axis=0)  # Using concat() to combine the sampled legitimate transactions and all fraudulent transactions along the rows (axis=0)


In [31]:
# Displaying the first five rows of the new dataset
new_dataset.head()  # Using the head() function to preview the top five rows of the concatenated DataFrame


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
212630,138934.0,0.057882,0.864656,0.231735,-0.620513,0.473081,-1.041149,1.006732,-0.195905,-0.028062,...,-0.271327,-0.601652,0.049907,-0.118321,-0.471348,0.147594,0.247038,0.095723,2.58,0
125031,77514.0,1.262596,-0.161896,0.462786,-0.245087,-0.668115,-0.537432,-0.391293,0.064,0.32163,...,-0.116252,-0.40958,0.057185,0.036314,0.094702,0.910872,-0.082065,-0.006358,2.26,0
57682,48012.0,1.14216,-0.018281,0.240965,1.222289,-0.132475,0.278956,-0.195308,0.282078,0.397965,...,-0.097492,-0.190981,-0.088247,-0.349663,0.599787,-0.302344,0.020648,-0.002246,9.0,0
16883,28263.0,1.141315,-0.951765,-0.074941,-0.790461,-0.732216,-0.175276,-0.472864,0.048217,-0.991942,...,0.327187,0.569871,-0.225374,-0.283458,0.494687,-0.093017,-0.022259,0.011375,125.75,0
26709,34216.0,1.189451,0.275206,0.402907,0.725453,-0.391748,-0.802063,-0.020295,-0.05559,0.087825,...,-0.244641,-0.699701,0.20539,0.345343,0.08157,0.105707,-0.009713,0.029592,1.98,0


In [33]:
# Checking the distribution of the 'Class' column in the new dataset
new_dataset['Class'].value_counts()  # Using value_counts() to count the occurrences of each class (0 for legit, 1 for fraudulent) in the 'Class' column of the new dataset


Class
0    492
1    492
Name: count, dtype: int64

In [37]:
# Comparing the mean values for both types of transactions in the new dataset
new_dataset.groupby('Class').mean()  # Using groupby() to group the data by 'Class' and then calculating the mean for each group

# Insight: Despite the under-sampling, the dataset remains highly unbalanced.


Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,96060.558943,0.11133,0.072713,-0.001155,0.023651,-0.037147,-0.057567,0.062,0.059066,0.001833,...,-0.00981,-0.013956,0.018107,-0.023643,-0.049097,0.007916,-0.000729,0.000529,0.011109,82.017459
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [114]:
# Now we will split the data into Features and Targets
# Then we will feed this data to our machine learning model
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

# Split the data into features and target variable
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

# Print the shapes of the datasets
print(X.shape, X_train.shape, X_test.shape)

# Create and fit a pipeline with scaling and Random Forest
pipeline_rf = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=2))
pipeline_rf.fit(X_train, Y_train)

# Accuracy on training data for Random Forest
X_train_prediction_rf = pipeline_rf.predict(X_train)
training_data_accuracy_rf = accuracy_score(Y_train, X_train_prediction_rf)
print('Accuracy on Training data (Random Forest):', training_data_accuracy_rf)

# Accuracy on test data for Random Forest
X_test_prediction_rf = pipeline_rf.predict(X_test)
test_data_accuracy_rf = accuracy_score(Y_test, X_test_prediction_rf)
print('Accuracy score on Test Data (Random Forest):', test_data_accuracy_rf)

# Create and fit a pipeline with scaling and Support Vector Machine
pipeline_svm = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=2))
pipeline_svm.fit(X_train, Y_train)

# Accuracy on training data for SVM
X_train_prediction_svm = pipeline_svm.predict(X_train)
training_data_accuracy_svm = accuracy_score(Y_train, X_train_prediction_svm)
print('Accuracy on Training data (SVM):', training_data_accuracy_svm)

# Accuracy on test data for SVM
X_test_prediction_svm = pipeline_svm.predict(X_test)
test_data_accuracy_svm = accuracy_score(Y_test, X_test_prediction_svm)
print('Accuracy score on Test Data (SVM):', test_data_accuracy_svm)


(984, 30) (787, 30) (197, 30)
Accuracy on Training data (Random Forest): 1.0
Accuracy score on Test Data (Random Forest): 0.9187817258883249
Accuracy on Training data (SVM): 0.9567979669631512
Accuracy score on Test Data (SVM): 0.9187817258883249


# Analysis Report

## Dataset Shapes
- The entire dataset has **984 samples** and **30 features**.
- The training set has **787 samples** and **30 features**.
- The test set has **197 samples** and **30 features**.

## Accuracy
- The logistic regression model achieved **95.30% accuracy** on the training data.
- It achieved **92.39% accuracy** on the test data.

## Insights

### 1. High Training Accuracy
- The training accuracy of **95.30%** indicates that the model is performing very well on the data it was trained on, effectively learning the patterns in the training data.

### 2. High Test Accuracy
- The test accuracy of **92.39%** suggests that the model is also performing well on unseen data, indicating good generalization capabilities.

### 3. Generalization Performance
- The slight drop in accuracy from training (**95.30%**) to test (**92.39%**) is expected and typical. It indicates that the model is not overfitting significantly, as there is no drastic drop in performance. A small drop suggests that the model has generalized well from the training data to the test data.

### 4. Model Evaluation
- The high accuracy on both training and test datasets indicates that the preprocessing steps (such as scaling) and the logistic regression model are well-suited for this classification task. However, it is important to note that accuracy is just one metric. For a more comprehensive understanding, other metrics like precision, recall, F1-score, and the confusion matrix should also be considered.
accuracy metrics.

In [116]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Evaluation on training data
print("Evaluation on Training Data:")
precision_train = precision_score(Y_train, X_train_prediction, average='weighted')
recall_train = recall_score(Y_train, X_train_prediction, average='weighted')
f1_train = f1_score(Y_train, X_train_prediction, average='weighted')

print('Precision on Training Data:', precision_train)
print('Recall on Training Data:', recall_train)
print('F1-Score on Training Data:', f1_train)
print('Classification Report for Training Data:\n', classification_report(Y_train, X_train_prediction))

# Evaluation on test data
print("\nEvaluation on Test Data:")
precision_test = precision_score(Y_test, X_test_prediction, average='weighted')
recall_test = recall_score(Y_test, X_test_prediction, average='weighted')
f1_test = f1_score(Y_test, X_test_prediction, average='weighted')

print('Precision on Test Data:', precision_test)
print('Recall on Test Data:', recall_test)
print('F1-Score on Test Data:', f1_test)
print('Classification Report for Test Data:\n', classification_report(Y_test, X_test_prediction))


Evaluation on Training Data:
Precision on Training Data: 0.9545421748183011
Recall on Training Data: 0.9529860228716646
F1-Score on Training Data: 0.9529475842725275
Classification Report for Training Data:
               precision    recall  f1-score   support

           0       0.93      0.98      0.95       393
           1       0.98      0.92      0.95       394

    accuracy                           0.95       787
   macro avg       0.95      0.95      0.95       787
weighted avg       0.95      0.95      0.95       787


Evaluation on Test Data:
Precision on Test Data: 0.925983364399225
Recall on Test Data: 0.9238578680203046
F1-Score on Test Data: 0.9237478161583647
Classification Report for Test Data:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93        99
           1       0.96      0.89      0.92        98

    accuracy                           0.92       197
   macro avg       0.93      0.92      0.92       197
weigh

# Insights from the Model Evaluation

## 1. Training Data Evaluation
- **Precision:** 95.45%
- **Recall:** 95.30%
- **F1-Score:** 95.29%
- **Classification Report:**
  - **Class 0:** High precision (93%) and recall (98%) with an F1-score of 95%.
  - **Class 1:** High precision (98%) and recall (92%) with an F1-score of 95%.
  - **Overall accuracy:** 95%

## 2. Test Data Evaluation
- **Precision:** 92.60%
- **Recall:** 92.39%
- **F1-Score:** 92.37%
- **Classification Report:**
  - **Class 0:** High precision (90%) and recall (96%) with an F1-score of 93%.
  - **Class 1:** High precision (96%) and recall (89%) with an F1-score of 92%.
  - **Overall accuracy:** 92%

## Detailed Insights

### 1. Model Performance
- The model performs well on both the training and test datasets, indicating good generalization.
- Training accuracy is slightly higher than test accuracy, which is expected but should not be significantly different to avoid overfitting.

### 2. Class-wise Performance
- **For Class 0:**
  - High recall (98% on training, 96% on test) indicates that most of the actual Class 0 samples are correctly identified.
  - High precision (93% on training, 90% on test) indicates that most of the predicted Class 0 samples are actually Class 0.
- **For Class 1:**
  - High precision (98% on training, 96% on test) indicates that most of the predicted Class 1 samples are actually Class 1.
  - Slightly lower recall (92% on training, 89% on test) compared to precision, but still good.

### 3. Overall Metrics
- Both macro and weighted averages of precision, recall, and F1-score are high, indicating balanced performance across both classes.
- The small drop in test performance metrics compared to training metrics suggests that the model is not overfitting and has good generalization capability.
d generalization capability.



In [118]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Random Forest predictions on training and test data
X_train_prediction_rf = pipeline_rf.predict(X_train)
X_test_prediction_rf = pipeline_rf.predict(X_test)

# Accuracy
training_data_accuracy_rf = accuracy_score(Y_train, X_train_prediction_rf)
test_data_accuracy_rf = accuracy_score(Y_test, X_test_prediction_rf)

print('Accuracy on Training data (Random Forest):', training_data_accuracy_rf)
print('Accuracy score on Test Data (Random Forest):', test_data_accuracy_rf)

# Evaluation on training data
print("Evaluation on Training Data (Random Forest):")
precision_train_rf = precision_score(Y_train, X_train_prediction_rf, average='weighted')
recall_train_rf = recall_score(Y_train, X_train_prediction_rf, average='weighted')
f1_train_rf = f1_score(Y_train, X_train_prediction_rf, average='weighted')

print('Precision on Training Data (Random Forest):', precision_train_rf)
print('Recall on Training Data (Random Forest):', recall_train_rf)
print('F1-Score on Training Data (Random Forest):', f1_train_rf)
print('Classification Report for Training Data (Random Forest):\n', classification_report(Y_train, X_train_prediction_rf))

# Evaluation on test data
print("\nEvaluation on Test Data (Random Forest):")
precision_test_rf = precision_score(Y_test, X_test_prediction_rf, average='weighted')
recall_test_rf = recall_score(Y_test, X_test_prediction_rf, average='weighted')
f1_test_rf = f1_score(Y_test, X_test_prediction_rf, average='weighted')

print('Precision on Test Data (Random Forest):', precision_test_rf)
print('Recall on Test Data (Random Forest):', recall_test_rf)
print('F1-Score on Test Data (Random Forest):', f1_test_rf)
print('Classification Report for Test Data (Random Forest):\n', classification_report(Y_test, X_test_prediction_rf))


Accuracy on Training data (Random Forest): 1.0
Accuracy score on Test Data (Random Forest): 0.9187817258883249
Evaluation on Training Data (Random Forest):
Precision on Training Data (Random Forest): 1.0
Recall on Training Data (Random Forest): 1.0
F1-Score on Training Data (Random Forest): 1.0
Classification Report for Training Data (Random Forest):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       393
           1       1.00      1.00      1.00       394

    accuracy                           1.00       787
   macro avg       1.00      1.00      1.00       787
weighted avg       1.00      1.00      1.00       787


Evaluation on Test Data (Random Forest):
Precision on Test Data (Random Forest): 0.9215311710970898
Recall on Test Data (Random Forest): 0.9187817258883249
F1-Score on Test Data (Random Forest): 0.9186307313692161
Classification Report for Test Data (Random Forest):
               precision    recall  f1-score   supp

# Analysis of Random Forest Model Performance

## Training Data Performance
- **Accuracy on Training Data:** 1.0
  - The model has perfectly classified all training samples, achieving 100% accuracy.
- **Precision on Training Data:** 1.0
  - Precision is the ratio of true positive predictions to the total predicted positives. A precision of 1.0 means that every positive prediction made by the model was correct.
- **Recall on Training Data:** 1.0
  - Recall is the ratio of true positive predictions to the total actual positives. A recall of 1.0 means that the model correctly identified all positive samples.
- **F1-Score on Training Data:** 1.0
  - The F1-score, which is the harmonic mean of precision and recall, is 1.0, indicating perfect performance.

### Classification Report for Training Data
- **Class 0 and Class 1:**
  - Precision, recall, and F1-score for both classes are 1.0.
  - The model perfectly distinguishes between the two classes with no errors.

## Test Data Performance
- **Accuracy on Test Data:** 0.9187817258883249 (~91.88%)
  - The model correctly classified approximately 91.88% of the test samples.
- **Precision on Test Data:** 0.9215311710970898 (~92.15%)
  - The precision indicates that about 92.15% of the positive predictions were correct.
- **Recall on Test Data:** 0.9187817258883249 (~91.88%)
  - The recall shows that the model correctly identified about 91.88% of the actual positives.
- **F1-Score on Test Data:** 0.9186307313692161 (~91.86%)
  - The F1-score balances the precision and recall, indicating overall strong performance.

### Classification Report for Test Data
- **Class 0:**
  - Precision: 0.89
  - Recall: 0.96
  - F1-Score: 0.92
  - The model performs well on Class 0, with high recall indicating most actual positives are identified.
- **Class 1:**
  - Precision: 0.96
  - Recall: 0.88
  - F1-Score: 0.91
  - The model also performs well on Class 1, with high precision indicating most predictedorrect.

## Insights and Observations
- **Overfitting:**
  - The model's perfect performance on the training data (accuracy, precision, recall, and F1-score all being 1.0) suggests overfitting. Overfitting occurs when the model learns the training data too well, including noise and outliers, resulting in excellent performance on the training set but not necessarily on unseen data.
- **Generalization:**
  - The test data performance, while still strong, shows a drop from the training data. The accuracy is approximately 91.88%, which is good but indicates the model does not generalize as perfectly as it performs on the training set. This gap between training and test performance is a sign of overfitting.
- **Class Performance:**
  - The model performs slightly better on Class 1 in terms of precision and slightly better on Class 0 in terms of recall. This could indicate a slight bias towards predicting Class 0 correctly.


In [122]:
# SVM predictions on training and test data
X_train_prediction_svm = pipeline_svm.predict(X_train)
X_test_prediction_svm = pipeline_svm.predict(X_test)

# Evaluation on training data
print("Evaluation on Training Data (SVM):")
precision_train_svm = precision_score(Y_train, X_train_prediction_svm, average='weighted')
recall_train_svm = recall_score(Y_train, X_train_prediction_svm, average='weighted')
f1_train_svm = f1_score(Y_train, X_train_prediction_svm, average='weighted')

print('Precision on Training Data (SVM):', precision_train_svm)
print('Recall on Training Data (SVM):', recall_train_svm)
print('F1-Score on Training Data (SVM):', f1_train_svm)
print('Classification Report for Training Data (SVM):\n', classification_report(Y_train, X_train_prediction_svm))

# Evaluation on test data
print("\nEvaluation on Test Data (SVM):")
precision_test_svm = precision_score(Y_test, X_test_prediction_svm, average='weighted')
recall_test_svm = recall_score(Y_test, X_test_prediction_svm, average='weighted')
f1_test_svm = f1_score(Y_test, X_test_prediction_svm, average='weighted')

print('Precision on Test Data (SVM):', precision_test_svm)
print('Recall on Test Data (SVM):', recall_test_svm)
print('F1-Score on Test Data (SVM):', f1_test_svm)
print('Classification Report for Test Data (SVM):\n', classification_report(Y_test, X_test_prediction_svm))


Evaluation on Training Data (SVM):
Precision on Training Data (SVM): 0.957983703306819
Recall on Training Data (SVM): 0.9567979669631512
F1-Score on Training Data (SVM): 0.9567714458639828
Classification Report for Training Data (SVM):
               precision    recall  f1-score   support

           0       0.93      0.98      0.96       393
           1       0.98      0.93      0.96       394

    accuracy                           0.96       787
   macro avg       0.96      0.96      0.96       787
weighted avg       0.96      0.96      0.96       787


Evaluation on Test Data (SVM):
Precision on Test Data (SVM): 0.9203171800611659
Recall on Test Data (SVM): 0.9187817258883249
F1-Score on Test Data (SVM): 0.9186937184705568
Classification Report for Test Data (SVM):
               precision    recall  f1-score   support

           0       0.90      0.95      0.92        99
           1       0.95      0.89      0.92        98

    accuracy                           0.92       197

# Analysis of SVM Model Performance

## Training Data Performance
- **Accuracy on Training Data:** 0.96
  - The model has classified 96% of the training samples correctly.
- **Precision on Training Data:** 0.957983703306819 (~95.80%)
  - Precision is the ratio of true positive predictions to the total predicted positives. A precision of ~95.80% indicates that the majority of positive predictions made by the model were correct.
- **Recall on Training Data:** 0.9567979669631512 (~95.68%)
  - Recall is the ratio of true positive predictions to the total actual positives. A recall of ~95.68% indicates that the model correctly identified most of the positive samples.
- **F1-Score on Training Data:** 0.9567714458639828 (~95.68%)
  - The F1-score, which is the harmonic mean of precision and recall, is ~95.68%, indicating a balanced performance between precision and recall.

### Classification Report for Training Data
- **Class 0:**
  - Precision: 0.93
  - Recall: 0.98
  - F1-Score: 0.96
  - The model performs well on Class 0 with high recall.
- **Class 1:**
  - Precision: 0.98
  - Recall: 0.93
  - F1-Score: 0.96
  - The model performs well on Class 1 with high precision.

## Test Data Performance
- **Accuracy on Test Data:** 0.9187817258883249 (~91.88%)
  - The model correctly classified approximately 91.88% of the test samples.
- **Precision on Test Data:** 0.9203171800611659 (~92.03%)
  - Precision indicates that about 92.03% of the positive predictions were correct.
- **Recall on Test Data:** 0.9187817258883249 (~91.88%)
  - Recall shows that the model correctly identified about 91.88% of the actual positives.
- **F1-Score on Test Data:** 0.9186937184705568 (~91.87%)
  - The F1-score balances the precision and recall, indicating overall strong performance.

### Classification Report for Test Data
- **Class 0:**
  - Precision: 0.90
  - Recall: 0.95
  - F1-Score: 0.92
  - The model performs well on Class 0, with high recall indicating most actual positives are identified.
- **Class 1:**
  - Precision: 0.95
  - Recall: 0.89
  - F1-Score: 0.92
  - The model also performs well on Class 1, with high precision indicating most predicted positives are correct.

## Insights and Observations
- **Good Generalization:**
  - The SVM model has a high accuracy of 96% on the training data and approximately 91.88% on the test data. This indicates that the model generalizes well to unseen data and does not overfit as severely as the Random Forest model.
- **Balanced Performance:**
  - The precision, recall, and F1-scores are consistently high for both the training and test datasets, indicating that the model maintains a balanced performance across different metrics.
- **Class Performance:**
  - The model performs slightly better on Class 1 in terms of precision and slightly better on Class 0 in terms of recall on the test data. This could indicate aslight bias towards predicting Class 0 correctly.

## Comparison with Random Forest
- **Training Performance:**
  - Random Forest shows perfect performance on the training data (indicating overfitting), while SVM has high but not perfect accuracy, suggesting better generalization.
- **Test Performance:**
  - Both models have similar test accuracy (~91.88%). However, SVM shows a more balanced performance between training and test data, indicating better generalization compared to Random Forest.
- **Precision and Recall:**
  - SVM maintains high precision and recall on both training and test data, while Random Forest shows a slight drop in test performance compared to its training performance.
