# Phase 1: Data Preparation & Baseline Model\n
\n
## Goals:\n
- Load and explore creditcard.csv dataset\n
- Analyze class imbalance\n
- Preprocess data\n
- Build baseline models (Logistic Regression, Random Forest)\n
- Evaluate baseline performance

## Step 1: Import Libraries

In [None]:
import numpy as np\n
import pandas as pd\n
import matplotlib.pyplot as plt\n
import seaborn as sns\n
from sklearn.model_selection import train_test_split\n
from sklearn.preprocessing import StandardScaler\n
from sklearn.linear_model import LogisticRegression\n
from sklearn.ensemble import RandomForestClassifier\n
from sklearn.metrics import (\n
    accuracy_score, precision_score, recall_score, f1_score,\n
    confusion_matrix, classification_report, roc_auc_score, roc_curve\n
)\n
import warnings\n
# Silence only deprecation-style noise from the plotting/ML libraries.
# A blanket "ignore" would also hide actionable warnings, e.g. scikit-learn's
# ConvergenceWarning when a model stops before converging.
for noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=noisy_category)

# Set style: white grid background and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print('All libraries imported successfully!')

## Step 2: Load Dataset

In [None]:
# Load the creditcard.csv dataset
# Download from: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

try:
    # Keep the try body minimal: only the read can raise FileNotFoundError.
    df = pd.read_csv('../../data/raw/creditcard.csv')
except FileNotFoundError:
    print('ERROR: creditcard.csv not found in data/raw/ folder')
    print('Please download from: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud')
    print('And place it in: data/raw/creditcard.csv')
    # Re-raise so execution stops here. Swallowing the error would leave `df`
    # undefined and make every later cell fail with a misleading NameError.
    raise
else:
    print('Dataset loaded successfully!')
    print(f'Shape: {df.shape}')
    print(f'\nColumns: {df.columns.tolist()}')

In [None]:
# Preview the first rows (pandas' head() defaults to 5) as a quick sanity
# check of the column names and value ranges that were just loaded.
df.head()

In [None]:
# Print the DataFrame summary: column names, non-null counts, dtypes and
# memory usage.
df.info()

In [None]:
# Count missing entries per column once, then report the per-column
# breakdown followed by the grand total.
missing_per_column = df.isnull().sum()

print('Missing Values:')
print(missing_per_column)
print(f'\nTotal missing values: {missing_per_column.sum()}')

## Step 3: Analyze Class Imbalance

In [None]:
# Absolute and relative frequency of each target class
class_counts = df['Class'].value_counts()
class_percentages = df['Class'].value_counts(normalize=True) * 100

print('Class Distribution:')
for class_label, description in ((0, 'Normal transactions'), (1, 'Fraud transactions')):
    count = class_counts[class_label]
    share = class_percentages[class_label]
    print(f'{description} ({class_label}): {count:,} ({share:.2f}%)')

# How many normal transactions exist for every fraudulent one
imbalance_ratio = class_counts[0] / class_counts[1]
print(f'\nImbalance Ratio: {imbalance_ratio:.2f}:1')

In [None]:
# Two side-by-side panels: absolute counts (left) and percentages (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
count_ax, percent_ax = axes
class_colors = ['#2ecc71', '#e74c3c']  # green, red

# Left panel: number of transactions per class
sns.countplot(data=df, x='Class', ax=count_ax, palette=class_colors)
count_ax.set_title('Class Distribution (Count)', fontsize=14, fontweight='bold')
count_ax.set_ylabel('Count', fontsize=12)

# Right panel: share of each class, reusing the percentages computed earlier
class_percentages.plot(kind='bar', ax=percent_ax, color=class_colors)
percent_ax.set_title('Class Distribution (Percentage)', fontsize=14, fontweight='bold')
percent_ax.set_ylabel('Percentage (%)', fontsize=12)
percent_ax.set_xticklabels(['Normal', 'Fraud'], rotation=0)

# Both panels carry the same x-axis caption
for panel in (count_ax, percent_ax):
    panel.set_xlabel('Class (0=Normal, 1=Fraud)', fontsize=12)

plt.tight_layout()
plt.show()

## Step 4: Exploratory Data Analysis

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the dataset.
df.describe()

In [None]:
# Analyze Amount distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Amount distribution by class.
# Both classes share one set of bin edges so their bars line up and can be
# compared directly. The y-axis is logarithmic because fraud is a tiny share
# of the data and its bars would be invisible on a linear scale.
amount_bins = np.histogram_bin_edges(df['Amount'].dropna(), bins=50)
df[df['Class'] == 0]['Amount'].hist(bins=amount_bins, ax=axes[0], alpha=0.7, label='Normal', color='#2ecc71')
df[df['Class'] == 1]['Amount'].hist(bins=amount_bins, ax=axes[0], alpha=0.7, label='Fraud', color='#e74c3c')
axes[0].set_yscale('log')
axes[0].set_title('Transaction Amount Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Amount', fontsize=12)
axes[0].set_ylabel('Frequency (log scale)', fontsize=12)
axes[0].legend()

# Boxplot
df.boxplot(column='Amount', by='Class', ax=axes[1], patch_artist=True)
# DataFrame.boxplot(by=...) adds its own figure-level "Boxplot grouped by ..."
# suptitle, which clashes with the per-panel titles; clear it.
fig.suptitle('')
axes[1].set_title('Transaction Amount by Class', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Class (0=Normal, 1=Fraud)', fontsize=12)
axes[1].set_ylabel('Amount', fontsize=12)

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap: the 15 features most strongly correlated with Class
plt.figure(figsize=(12, 10))
correlation = df.corr()

# Exclude the target's trivial self-correlation (always 1.0) and rank by
# absolute value, so strongly *negative* relationships are kept instead of
# being pushed out of the top 15 by weak positive ones.
class_corr = correlation['Class'].drop('Class')
top_features = class_corr.abs().sort_values(ascending=False).head(15).index

# center=0 keeps the diverging colormap symmetric around "no correlation"
sns.heatmap(class_corr.loc[top_features].to_frame(),
            annot=True, cmap='coolwarm', center=0, fmt='.2f', linewidths=0.5)
plt.title('Top 15 Features Correlated with Fraud Class', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## Step 5: Data Preprocessing

In [None]:
# Pull out the target column; everything else becomes the feature matrix
y = df['Class']
X = df.drop(columns='Class')

print(f'Features shape: {X.shape}')
print(f'Target shape: {y.shape}')

In [None]:
# Train-test split
# The split happens BEFORE scaling so that the scaler's mean/std are learned
# from the training rows only; fitting on the full dataset would leak
# test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Explicit copies so the column assignments below never write through to X
X_train = X_train.copy()
X_test = X_test.copy()

# Normalize Amount and Time
# A single StandardScaler is fit on both columns at once (each column is
# standardized independently), so the fitted `scaler` stays reusable for
# transforming new data. Re-fitting one scaler per column would discard the
# first column's statistics.
scaled_columns = ['Time', 'Amount']
scaler = StandardScaler()
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])

print('Amount and Time normalized!')
print('\nNormalized data sample:')
print(X_train[scaled_columns].head())

print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')
print('\nTraining set class distribution:')
print(y_train.value_counts())
print('\nTest set class distribution:')
print(y_test.value_counts())

In [None]:
# Write the four train/test splits to CSV files under data/processed
import os

processed_dir = '../../data/processed'
os.makedirs(processed_dir, exist_ok=True)

splits = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
# Each split is written as <name>.csv without the pandas index column
for split_name, split_data in splits.items():
    split_data.to_csv(f'{processed_dir}/{split_name}.csv', index=False)

print('Preprocessed data saved to data/processed/')

## Step 6: Baseline Model - Logistic Regression

In [None]:
# Fit the logistic-regression baseline with balanced class weights
print('Training Logistic Regression...')
lr_params = {'max_iter': 1000, 'random_state': 42, 'class_weight': 'balanced'}
lr_model = LogisticRegression(**lr_params)
lr_model.fit(X_train, y_train)

# Hard class labels, plus the predicted probability of class 1 (column 1)
lr_class_proba = lr_model.predict_proba(X_test)
y_pred_lr = lr_model.predict(X_test)
y_pred_proba_lr = lr_class_proba[:, 1]

print('\nLogistic Regression trained successfully!')

In [None]:
# Report test-set metrics for the logistic-regression baseline
print('=== LOGISTIC REGRESSION RESULTS ===')

# Metrics computed from hard label predictions
label_metrics = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1-Score', f1_score),
)
for metric_name, metric_fn in label_metrics:
    print(f'{metric_name}: {metric_fn(y_test, y_pred_lr):.4f}')

# ROC-AUC is computed from the predicted probabilities instead
print(f'ROC-AUC: {roc_auc_score(y_test, y_pred_proba_lr):.4f}')

lr_report = classification_report(y_test, y_pred_lr, target_names=['Normal', 'Fraud'])
print('\nClassification Report:')
print(lr_report)

lr_confusion = confusion_matrix(y_test, y_pred_lr)
print('\nConfusion Matrix:')
print(lr_confusion)

## Step 7: Baseline Model - Random Forest

In [None]:
# Fit the random-forest baseline: 100 trees, balanced class weights,
# n_jobs=-1 to use all available CPU cores
print('Training Random Forest...')
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

# Hard class labels, plus the predicted probability of class 1 (column 1)
y_pred_rf = rf_model.predict(X_test)
rf_class_proba = rf_model.predict_proba(X_test)
y_pred_proba_rf = rf_class_proba[:, 1]

print('\nRandom Forest trained successfully!')

In [None]:
# Report test-set metrics for the random-forest baseline
print('=== RANDOM FOREST RESULTS ===')

# Label-based scores first, then ROC-AUC from the predicted probabilities
rf_scores = {
    'Accuracy': accuracy_score(y_test, y_pred_rf),
    'Precision': precision_score(y_test, y_pred_rf),
    'Recall': recall_score(y_test, y_pred_rf),
    'F1-Score': f1_score(y_test, y_pred_rf),
    'ROC-AUC': roc_auc_score(y_test, y_pred_proba_rf),
}
for metric_name, score in rf_scores.items():
    print(f'{metric_name}: {score:.4f}')

rf_report = classification_report(y_test, y_pred_rf, target_names=['Normal', 'Fraud'])
print('\nClassification Report:')
print(rf_report)

rf_confusion = confusion_matrix(y_test, y_pred_rf)
print('\nConfusion Matrix:')
print(rf_confusion)

## Step 8: Visual Comparison

In [None]:
# Bar-chart comparison of both baselines on four label-based metrics
models = ['Logistic Regression', 'Random Forest']
model_predictions = (y_pred_lr, y_pred_rf)

# One score per model, in the same order as `models`
accuracy = [accuracy_score(y_test, preds) for preds in model_predictions]
precision = [precision_score(y_test, preds) for preds in model_predictions]
recall = [recall_score(y_test, preds) for preds in model_predictions]
f1 = [f1_score(y_test, preds) for preds in model_predictions]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# axes.flat walks the 2x2 grid row by row, so the panels appear as
# Accuracy | Precision on top and Recall | F1-Score below.
panels = (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1-Score', f1),
)
bar_colors = ['#3498db', '#e67e22']
for panel_ax, (metric_name, scores) in zip(axes.flat, panels):
    panel_ax.bar(models, scores, color=bar_colors)
    panel_ax.set_title(f'{metric_name} Comparison', fontsize=12, fontweight='bold')
    panel_ax.set_ylim([0, 1])  # all four metrics live in [0, 1]

plt.tight_layout()
plt.show()

In [None]:
# Overlay the ROC curves of both baselines on a single axis
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)

# (display name, false-positive rates, true-positive rates, AUC) per model
roc_curves = (
    ('Logistic Regression', fpr_lr, tpr_lr, roc_auc_score(y_test, y_pred_proba_lr)),
    ('Random Forest', fpr_rf, tpr_rf, roc_auc_score(y_test, y_pred_proba_rf)),
)

fig, ax = plt.subplots(figsize=(10, 7))
for model_name, fpr, tpr, auc_value in roc_curves:
    ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_value:.3f})', linewidth=2)

# Dashed diagonal as the chance-level reference line
ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')

ax.set_title('ROC Curve Comparison - Baseline Models', fontsize=14, fontweight='bold')
ax.set_xlabel('False Positive Rate', fontsize=12)
ax.set_ylabel('True Positive Rate', fontsize=12)
ax.legend(fontsize=10)
ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## Step 9: Save Baseline Models

In [None]:
# `os` is imported here as well so this cell does not depend on the
# data-saving cell having been run earlier in the session.
import os

import joblib

# Save models
model_dir = '../../models/saved_models'
os.makedirs(model_dir, exist_ok=True)

# Output file name -> fitted estimator
saved_models = {
    'logistic_regression.pkl': lr_model,
    'random_forest.pkl': rf_model,
}
for filename, model in saved_models.items():
    joblib.dump(model, f'{model_dir}/{filename}')

print('Baseline models saved successfully!')
for filename in saved_models:
    print(f'- {filename}')

## Phase 1 Summary\n
\n
### ✅ Completed:\n
1. Loaded creditcard.csv dataset\n
2. Analyzed class imbalance (~0.17% fraud)\n
3. Performed exploratory data analysis\n
4. Preprocessed and normalized data\n
5. Built baseline models (Logistic Regression & Random Forest)\n
6. Evaluated and compared baseline performance\n
7. Saved preprocessed data and models\n
\n
### 🔥 Key Findings:\n
- Dataset is highly imbalanced\n
- Baseline models provide initial benchmarks\n
- Ready for deep learning models in Phase 2\n
\n
### 📊 Next Steps:\n
- Move to notebook 02: LSTM Model\n
- Build sequential deep learning models\n
- Improve fraud detection performance