# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve, roc_auc_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')
# Load the transactions CSV from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/credit-card-prediction/creditcard.csv")

# Preview the first rows. Printed explicitly: a bare `data.head()` expression
# is only rendered when it is the last statement of a notebook cell and is
# silently discarded anywhere else.
print(data.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()

# Summary statistics and per-column count of missing values.
print(data.describe())
print("Missing values:\n", data.isnull().sum())
# Count the rows carrying each 'Class' label and report whether exactly two
# distinct labels occur.
class_distribution = data['Class'].value_counts()
if len(class_distribution) != 2:
    print("The dataset does not contain instances of both classes.")
else:
    print("Both classes are present in the dataset.")
    # Look the counts up by label value (0 and 1), one at a time, right
    # before each is reported.
    non_fraud_count = class_distribution[0]
    print("Class 0 (non-fraudulent transactions):", non_fraud_count)
    fraud_count = class_distribution[1]
    print("Class 1 (fraudulent transactions):", fraud_count)
# Bar chart with one bar per 'Class' value showing how many rows carry it.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=data, x='Class', ax=ax)
ax.set_title('Class Distribution')
plt.show()
# Side-by-side histograms (with KDE overlay) of 'Amount' and 'Time'; each
# panel draws the Class 0 rows in blue and the Class 1 rows in red.
class0_rows = data[data['Class'] == 0]
class1_rows = data[data['Class'] == 1]

fig = plt.figure(figsize=(12, 6))
for position, column in enumerate(('Amount', 'Time'), start=1):
    ax = fig.add_subplot(1, 2, position)
    sns.histplot(class0_rows[column], bins=50, kde=True, color='blue', label='Class 0', ax=ax)
    sns.histplot(class1_rows[column], bins=50, kde=True, color='red', label='Class 1', ax=ax)
    ax.set_title(f'Distribution of {column} for Class 0 and 1')
    ax.legend()
fig.tight_layout()
plt.show()
# Every column except 'Time', 'Amount' and the 'Class' label gets its own
# panel in a 7x4 grid, with the two classes overlaid (blue = 0, red = 1).
num_features = data.drop(['Time', 'Amount', 'Class'], axis=1).columns
class0_rows = data[data['Class'] == 0]
class1_rows = data[data['Class'] == 1]

fig = plt.figure(figsize=(16, 20))
for slot, column in enumerate(num_features, start=1):
    # Axes are added one at a time so only as many panels exist as there
    # are features to draw.
    ax = fig.add_subplot(7, 4, slot)
    sns.histplot(class0_rows[column], bins=30, kde=True, color='blue', label='Class 0', ax=ax)
    sns.histplot(class1_rows[column], bins=30, kde=True, color='red', label='Class 1', ax=ax)
    ax.set_title(f'Distribution of {column} for Class 0 and 1')
    ax.legend()
fig.tight_layout()
plt.show()
# One boxplot per selected column, split by 'Class', laid out in a single row.
boxplot_columns = ['Amount', 'V4', 'V9', 'V10']

fig = plt.figure(figsize=(16, 6))
for slot, column in enumerate(boxplot_columns, start=1):
    ax = fig.add_subplot(1, 4, slot)
    sns.boxplot(data=data, x='Class', y=column, ax=ax)
    ax.set_title(f'Boxplot of {column}')
fig.tight_layout()
plt.show()
# Pairwise scatter plots of a handful of columns, coloured by 'Class', with
# KDE curves on the diagonal.
subset_features = ['Time', 'Amount', 'V1', 'V2', 'V3', 'V4']
pairplot_columns = [*subset_features, 'Class']  # 'Class' is needed for the hue
pairplot_frame = data[pairplot_columns]
sns.pairplot(pairplot_frame, hue='Class', diag_kind='kde')
# y slightly above 1 lifts the title clear of the top row of panels.
plt.suptitle('Pairplot of Selected Features', y=1.02)
plt.show()
# Heatmap of the pairwise correlations between all columns of the frame.
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()
# Separate the feature columns from the 'Class' target.
X = data.drop('Class', axis=1)
y = data['Class']

# Hold out 20% of the rows for evaluation. stratify=y keeps the proportion of
# each class the same in the train and test splits; without it, how many rows
# of a rare class end up in the held-out set is left to chance, which makes
# the per-class metrics computed on it unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only and reuse those statistics for the
# test rows, so nothing about the held-out data leaks into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a random forest with default hyper-parameters and a fixed seed.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)
# Bare expression: rendered only when it ends a notebook cell.
y_pred
# Per-class precision / recall / F1 on the held-out rows.
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)

# Confusion matrix as an annotated heatmap with integer cell labels.
fig, ax = plt.subplots(figsize=(8, 6))
confusion_counts = confusion_matrix(y_test, y_pred)
sns.heatmap(confusion_counts, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

# Overall fraction of held-out rows predicted correctly.
overall_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score:", overall_accuracy)
# ROC curve built from the second column of predict_proba (index 1), compared
# against the diagonal of a random classifier.
class1_scores = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, class1_scores)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--', label='Random')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend()
plt.show()
# Horizontal bar chart of the fitted model's feature importances, largest
# first.
feature_importances = model.feature_importances_
# argsort is ascending; reversing it yields indices from most to least important.
sorted_idx = np.argsort(feature_importances)[::-1]
ranked_scores = feature_importances[sorted_idx]
ranked_names = X.columns[sorted_idx]

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=ranked_scores, y=ranked_names, palette='viridis', ax=ax)
ax.set_title('Feature Importances')
plt.show()