<a href="https://colab.research.google.com/github/Naman-27072004/E-commerce/blob/main/Credit_card_fraud_detection_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import numpy as np  # For numerical operations
import pandas as pd  # For data manipulation and analysis
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.linear_model import LogisticRegression  # For logistic regression model
from sklearn.metrics import accuracy_score  # For evaluating model accuracy
import warnings  # For handling warnings

# Silence the mixed-dtype notice pandas may emit while parsing the CSV
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# Read the credit card transactions (lines the parser rejects are skipped),
# then clean in one chained pass: rows with any missing value are removed
# first, followed by exact duplicate rows.
df = (
    pd.read_csv('/content/creditcard.csv', on_bad_lines='skip')
    .dropna()
    .drop_duplicates()
)

# Separate legitimate and fraudulent transactions by their Class label
legit = df[df.Class == 0]  # Transactions labeled as legitimate (Class 0)
fraud = df[df.Class == 1]  # Transactions labeled as fraud (Class 1)

# Under-sample both classes to the same size so the modelling set is balanced.
N_PER_CLASS = 450  # Requested number of rows per class
SAMPLE_SEED = 2  # Fixed seed: the same rows are drawn on every run

# Sampling is done without replacement, so the sample size can never exceed
# the smaller class. Cap it instead of letting DataFrame.sample fail with an
# opaque "larger sample than population" error.
n_per_class = min(N_PER_CLASS, len(legit), len(fraud))
if n_per_class == 0:
    raise ValueError(
        "Cannot build a balanced sample: found "
        f"{len(legit)} legitimate and {len(fraud)} fraudulent rows after cleaning."
    )
if n_per_class < N_PER_CLASS:
    # Make the reduction visible rather than silently training on less data
    warnings.warn(
        f"Requested {N_PER_CLASS} rows per class but only {n_per_class} are "
        "available in the smaller class; using that many instead."
    )

legit_sample = legit.sample(n=n_per_class, random_state=SAMPLE_SEED)
fraud_sample = fraud.sample(n=n_per_class, random_state=SAMPLE_SEED)

# Combine the samples to create the balanced dataset used for modelling
df = pd.concat([legit_sample, fraud_sample], axis=0)

# The label is the 'Class' column (0 or 1); every remaining column is a feature
Y = df['Class']
X = df.drop('Class', axis='columns')

# Partition into training and testing sets:
#   - test_size=0.2  -> 20% of the rows are held out for testing
#   - stratify=Y     -> class distribution is kept similar in both sets
#   - random_state=2 -> fixed seed for reproducibility of the split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

# Build the logistic regression classifier and fit it on the training data in
# one step (fit returns the estimator itself). max_iter is raised to address
# a potential ConvergenceWarning from the solver.
model = LogisticRegression(max_iter=10_000).fit(X_train, Y_train)

# Predict class labels for both the training and the testing features
X_train_prediction = model.predict(X_train)
X_test_prediction = model.predict(X_test)

# Evaluate model performance.
# accuracy_score expects (y_true, y_pred): ground-truth labels go first.
# Accuracy itself is symmetric, but keeping the documented order prevents
# silently wrong results if this is ever swapped for an asymmetric metric
# such as precision or recall.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)  # Training accuracy
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)  # Testing accuracy

# Print the accuracy scores
print('Accuracy on Training data : ', training_data_accuracy)
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Training data :  0.9333333333333333
Accuracy on Test data :  0.9611111111111111
