This notebook **FeatureEngineering_and_Train.ipynb** performs 
- preprocessing,
- feature engineering,
- train-test split,
- scaling,
- model training (Logistic Regression + Random Forest), and
- saves the models.

#### **🧾 1. Imports**

In [1]:
# Essential libraries
import pandas as pd
import numpy as np
import os
import joblib

# Scikit-learn modules for preprocessing, modeling, and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, average_precision_score, roc_auc_score

#### **📂 2. Load Cleaned Dataset**

In [2]:
# Load the pre-cleaned dataset
df = pd.read_csv('../../data/processed/cleaned_fraud_data.csv')

#### **🛠️ 3. Feature Engineering**

In [3]:
# Convert time fields to datetime objects
df['signup_time'] = pd.to_datetime(df['signup_time'])
df['purchase_time'] = pd.to_datetime(df['purchase_time'])

# Create a feature for time difference between signup and purchase
df['signup_to_purchase_sec'] = (df['purchase_time'] - df['signup_time']).dt.total_seconds()

# Weekend flag based on the day of the week
df['is_weekend'] = df['day_of_week'].isin(['Saturday', 'Sunday']).astype(int)

# Nighttime flag: 1 if hour is before 6 AM or after 10 PM
df['is_night'] = df['hour_of_day'].apply(lambda x: 1 if (x < 6 or x > 22) else 0)

# IP range (helps in detecting dynamic/static IPs)
df['ip_range'] = df['upper_bound_ip_address'] - df['lower_bound_ip_address']

# Average time since signup per user
avg_tx_time = df.groupby('user_id')['time_since_signup'].mean().rename("avg_tx_time_user")
df = df.merge(avg_tx_time, on='user_id', how='left')

# Transaction density = count / time
df['tx_density'] = df['transaction_count'] / (df['time_since_signup'] + 1e-6)

# Encode browser frequency (rare/unknown browsers may indicate fraud)
df['browser_freq'] = df['browser'].map(df['browser'].value_counts())

# One-hot encoding for categorical variables
df = pd.get_dummies(df, columns=['source', 'sex'], drop_first=True)

# Log transformations to reduce skew
df['log_purchase_value'] = np.log1p(df['purchase_value'])
df['log_tx_count'] = np.log1p(df['transaction_count'])

# Drop irrelevant or redundant columns
drop_cols = [
    'signup_time', 'purchase_time', 'user_id', 'device_id', 'ip_address',
    'country', 'browser', 'lower_bound_ip_address', 'upper_bound_ip_address',
    'purchase_value', 'transaction_count'
]
df.drop(columns=drop_cols, inplace=True)

# Shuffle the dataset to randomize the order
df = df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Save the feature-engineered dataset
os.makedirs("../../data/processed", exist_ok=True)
df.to_csv("../../data/processed/engineered_fraud_data.csv", index=False)

#### **🧮 4. Prepare Features and Labels**

In [4]:
# Define features and target
X = df.drop(columns=['class'])
y = df['class']

# Split into training and test sets (stratified for class balance)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

#### **⚖️ 5. Scale Features**

In [5]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the fitted scaler for later use (e.g., inference pipeline)
joblib.dump(scaler, "../../models/fraud_scaler.pkl")

['../../models/fraud_scaler.pkl']

#### **📊 6. Train Logistic Regression Model**

In [6]:
# Train a logistic regression model with class balancing
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Save the trained model
joblib.dump(lr_model, "../../models/logistic_regression_model.pkl")

['../../models/logistic_regression_model.pkl']

#### **🌲 7. Train Random Forest Model**

In [None]:
# Train a random forest classifier
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42
)
rf_model.fit(X_train_scaled, y_train)

# Save the trained model
joblib.dump(rf_model, "../../models/random_forest_model.pkl", compress=9)

#### **📈 8. Model Evaluation (Validation Set)**

In [17]:
# Evaluate both models using F1 Score, AUC-PR, and ROC AUC
models = {
    "Logistic Regression": lr_model,
    "Random Forest": rf_model
}

for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    y_prob = model.predict_proba(X_test_scaled)[:, 1]

    f1 = f1_score(y_test, y_pred)
    auc_pr = average_precision_score(y_test, y_prob)
    roc_auc = roc_auc_score(y_test, y_prob)

    print(f"📌 {name}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC-PR: {auc_pr:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print("-" * 30)

📌 Logistic Regression
F1 Score: 0.7043
AUC-PR: 0.6424
ROC AUC: 0.7787
------------------------------
📌 Random Forest
F1 Score: 0.7043
AUC-PR: 0.6319
ROC AUC: 0.7666
------------------------------
