In [None]:
"""
 Build a machine learning model to identify fraudulent credit card
 transactions.
 Preprocess and normalize the transaction data, handle class
 imbalance issues, and split the dataset into training and testing sets.
 Train a classification algorithm, such as logistic regression or random
 forests, to classify transactions as fraudulent or genuine.
 Evaluate the model's performance using metrics like precision, recall,
 and F1-score, and consider techniques like oversampling or
 undersampling for improving results
 
"""

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the transaction data; the relative path is resolved against the kernel's working directory.
credit_data = pd.read_csv("creditcard.csv")

In [None]:
# Peek at the first rows to confirm the file parsed as expected.
credit_data.head()

In [None]:
# Show the last rows too, so both ends of the file are inspected.
credit_data.tail()

In [None]:
# (n_rows, n_columns) of the loaded frame.
credit_data.shape

In [None]:
# Per-column dtype and non-null count summary.
credit_data.info()

In [None]:
# Count the missing values in each column (isna is the canonical spelling of isnull).
credit_data.isna().sum()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the numeric columns.
credit_data.describe()

# data analysis 

In [None]:
# Column labels present in the dataset.
credit_data.columns

In [None]:
# Distinct labels that occur in the target column.
credit_data['Class'].unique()

In [None]:
#the Class column has two values, 0 and 1, where 1 is a fraud case and 0 is a valid case

In [None]:
# Split the transactions by label: Class == 1 is fraud, Class == 0 is valid.
class_labels = credit_data['Class']
fraud_case = credit_data.loc[class_labels == 1]
valid_case = credit_data.loc[class_labels == 0]

print("Number of fraud case:", fraud_case.shape[0])
print("Number of valid case:", valid_case.shape[0])

# Despite its name, `total` is the fraud-to-valid ratio; it is shown as the cell output.
total = fraud_case.shape[0] / valid_case.shape[0]
total

In [None]:
# Share of fraudulent rows among all transactions, expressed as a percentage.
fraud_fraction = len(fraud_case) / len(credit_data)
per = fraud_fraction * 100
per

In [None]:
#only about 0.17% of the transactions are fraudulent

# class distribution

In [None]:
# Tally the rows per label (0 = legitimate, 1 = fraudulent).
class_counts = credit_data['Class'].value_counts()

# Fraud share of all transactions, as a percentage.
percentage_fraudulent = (class_counts[1] / class_counts.sum())*100

# Bar chart of the two class sizes, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=credit_data, x='Class', ax=ax)
ax.set_title("distribution of legitimate vs fraudulent transactions")
ax.set_xlabel("Class (0: legitimate, 1: fraudulent)")
ax.set_ylabel("Count")

# Write the fraud percentage just above the legitimate (x = 0) bar.
annotation = 'percentage of fraudulent transactions: {:.2f}%'.format(percentage_fraudulent)
ax.text(0, class_counts[0] + 1000, annotation, fontsize=12, ha='center')

plt.show()

In [None]:
# Figure 1: overall distribution of the Time column across all transactions.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(credit_data['Time'], bins=48, kde=True, ax=ax)
ax.set_title("transaction time distribution")
ax.set_xlabel("Time in sec")
ax.set_ylabel("Count")

# Figure 2: compare *when* fraudulent and legitimate transactions occur.
# Raw counts would flatten the fraud histogram against the axis because the
# fraud class is a tiny fraction of the data, so each class is normalised to
# a density (area = 1) to make the two shapes directly comparable.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    credit_data.loc[credit_data['Class'] == 1, 'Time'],
    bins=48, kde=True, stat='density', color='red', label='Fraudulent', ax=ax,
)
sns.histplot(
    credit_data.loc[credit_data['Class'] == 0, 'Time'],
    bins=48, kde=True, stat='density', color='blue', label='Legitimate', ax=ax,
)
ax.set_title("fraud vs legitimate transaction time distribution")
ax.set_xlabel("Time in sec")
ax.set_ylabel("Density")
ax.legend()

# A single show() renders both figures created above.
plt.show()

# transaction amount analysis

In [None]:
# Describe every column separately for the legitimate (Class == 0) and
# fraudulent (Class == 1) subsets so the two groups can be compared.
legitimate_transactions = credit_data.query("Class == 0")
fraudulent_transactions = credit_data.query("Class == 1")

legitimate_summary = legitimate_transactions.describe()
fraudulent_summary = fraudulent_transactions.describe()

print("summary statistics for legitimate transactions: ", legitimate_summary, sep="\n")
print("\nsummary statistics for fraudulent transactions: ", fraudulent_summary, sep="\n")

In [None]:
# Amount values for legitimate and fraudulent transactions.
# NOTE: this rebinds legitimate_transactions / fraudulent_transactions from the
# full per-class frames built in the summary-statistics cell to Amount-only Series.
legitimate_transactions = credit_data.loc[credit_data['Class'] == 0, 'Amount']
fraudulent_transactions = credit_data.loc[credit_data['Class'] == 1, 'Amount']

# Normalise each histogram to a density: on raw counts the fraud class is so
# small next to the legitimate class that its bars are not visible at all.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    legitimate_transactions,
    color='blue', kde=True, stat='density', label='legitimate transactions', bins=50, ax=ax,
)
sns.histplot(
    fraudulent_transactions,
    color='red', kde=True, stat='density', label='fraudulent transactions', bins=50, ax=ax,
)

ax.set_title('transaction amount distribution comparison')
ax.set_xlabel('transaction amount')
ax.set_ylabel('density')
ax.legend()

plt.show()

In [None]:
# Pairwise correlation between all columns of the dataset.
correlation_matrix = credit_data.corr()
print(correlation_matrix)

# Keep only each feature's correlation with the target variable ('Class').
feature_correlations = correlation_matrix['Class'].drop('Class')
print(feature_correlations)

# Only features whose absolute correlation with the target exceeds this value are plotted.
correlation_threshold = 0.1

highly_correlated_features = feature_correlations[feature_correlations.abs() > correlation_threshold]

plt.figure(figsize=(12,6))
# Passing `palette` without `hue` is deprecated in seaborn 0.13 and slated for
# removal. Map hue to the same feature names, keep full-width bars with
# dodge=False, and drop the redundant legend if seaborn drew one. This form
# works on older and newer seaborn releases alike.
ax = sns.barplot(
    x=highly_correlated_features.index,
    y=highly_correlated_features.values,
    hue=highly_correlated_features.index,
    palette='magma',
    dodge=False,
)
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title('Feature Correlations with Fraud (Class)')
plt.xlabel('Features')
plt.ylabel('Correlation')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Scatter of transaction amount against time, coloured by class label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=credit_data,
    x='Time',
    y='Amount',
    hue='Class',
    palette='coolwarm',
    alpha=0.5,
    ax=ax,
)
ax.set_title('Transaction Amount vs Time')
ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Amount')
plt.show()

In [None]:
# Heatmap of the pairwise correlations between all columns.
correlation_matrix = credit_data.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=False,
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation heatmap between features')
plt.show()

# developing model

In [None]:
#import libraries
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,f1_score, classification_report, confusion_matrix)

In [None]:
# Feature matrix: every column except the target label.
X = credit_data.drop(columns=['Class'])
# Target vector: 1 = fraud, 0 = valid.
Y = credit_data['Class']

# Label each printout so the feature and target output can be told apart
# (plain strings: no f-string is needed because nothing is interpolated).
print("X values and shape:", [X.values, X.shape])
print("Y values and shape:", [Y.values, Y.shape])

In [None]:
# Hold out 20% of the rows for testing. Stratify on the label so the rare fraud
# class keeps the same proportion in the training and test sets; an unstratified
# split leaves the number of fraud cases in the test set to chance.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

In [None]:
# Random forest with a fixed seed for reproducibility. n_jobs=-1 builds the
# trees on all available cores, which shortens training considerably on a
# dataset of this size.
# NOTE(review): no class weighting or resampling is applied in this cell, so the
# forest is fit on the imbalanced labels as-is. Confirm whether that is intended.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, Y_train)

# Hard class predictions (0/1) for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Show the predicted labels followed by the true test labels, one per line.
print(y_pred, Y_test, sep="\n")