In [1]:
import pandas as pd

# Load processed fraud data
fraud_df = pd.read_csv("../data/processed/fraud_processed.csv")

# Quick check
fraud_df.head()
fraud_df.info()
fraud_df['class'].value_counts(normalize=True)  # check imbalance

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129146 entries, 0 to 129145
Data columns (total 18 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   user_id                 129146 non-null  int64  
 1   signup_time             129146 non-null  object 
 2   purchase_time           129146 non-null  object 
 3   purchase_value          129146 non-null  int64  
 4   device_id               129146 non-null  object 
 5   source                  129146 non-null  object 
 6   browser                 129146 non-null  object 
 7   sex                     129146 non-null  object 
 8   age                     129146 non-null  int64  
 9   ip_address              129146 non-null  int64  
 10  class                   129146 non-null  int64  
 11  lower_bound_ip_address  129146 non-null  float64
 12  upper_bound_ip_address  129146 non-null  float64
 13  country                 129146 non-null  object 
 14  time_since_signup   

class
0    0.905007
1    0.094993
Name: proportion, dtype: float64

In [4]:
from sklearn.model_selection import train_test_split

# Features
X = fraud_df.drop(columns=['class'])  

# Target
y = fraud_df['class']

# Stratified train-test split (keep class distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# ---------------------------
# Train-test split
# ---------------------------
# Features
X = fraud_df.drop(columns=['class'])

# Target
y = fraud_df['class']

# Stratified train-test split (keep class distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# Column definitions
# ---------------------------
num_cols = ['purchase_value', 'age', 'time_since_signup', 'hour_of_day']  # corrected column name
cat_cols = ['sex', 'source', 'browser']

# ---------------------------
# Preprocessor
# ---------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)

# ---------------------------
# Pipeline with Logistic Regression
# ---------------------------
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# ---------------------------
# Train the model
# ---------------------------
clf.fit(X_train, y_train)

# ---------------------------
# Predict
# ---------------------------
y_pred = clf.predict(X_test)
y_prob = clf.predict_proba(X_test)[:,1]

# ---------------------------
# Evaluate
# ---------------------------
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[15003  8373]
 [  703  1751]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.64      0.77     23376
           1       0.17      0.71      0.28      2454

    accuracy                           0.65     25830
   macro avg       0.56      0.68      0.52     25830
weighted avg       0.88      0.65      0.72     25830



In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# ---------------------------
# Column definitions
# ---------------------------
num_cols = ['purchase_value', 'age', 'time_since_signup', 'hour_of_day']  # fixed
cat_cols = ['sex', 'source', 'browser']

# ---------------------------
# Preprocessor
# ---------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)

# ---------------------------
# Pipeline with Random Forest
# ---------------------------
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200, max_depth=10, class_weight='balanced', random_state=42
    ))
])

# ---------------------------
# Train the model
# ---------------------------
pipeline.fit(X_train, y_train)

# ---------------------------
# Predict
# ---------------------------
y_pred_rf = pipeline.predict(X_test)
y_prob_rf = pipeline.predict_proba(X_test)[:,1]

# ---------------------------
# Evaluate
# ---------------------------
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))


Confusion Matrix:
 [[23376     0]
 [ 1103  1351]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.98     23376
           1       1.00      0.55      0.71      2454

    accuracy                           0.96     25830
   macro avg       0.98      0.78      0.84     25830
weighted avg       0.96      0.96      0.95     25830



In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
import numpy as np

# ---------------------------
# Column definitions
# ---------------------------
num_cols = ['purchase_value', 'age', 'time_since_signup', 'hour_of_day']  # fixed
cat_cols = ['sex', 'source', 'browser']

# ---------------------------
# Preprocessor
# ---------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)

# ---------------------------
# Pipeline with Random Forest
# ---------------------------
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200, max_depth=10, class_weight='balanced', random_state=42
    ))
])

# ---------------------------
# Cross-validation
# ---------------------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=skf, scoring='f1')

print("F1 scores:", scores)
print("Mean F1:", np.mean(scores))


F1 scores: [0.69588313 0.70928703 0.68821801 0.70261697 0.71644909]
Mean F1: 0.7024908455251105


In [15]:
import joblib
import os

# Ensure models directory exists
os.makedirs("../models", exist_ok=True)

# pipeline is your trained Pipeline (preprocessing + RF)
joblib.dump(pipeline, r"../models/rf_fraud_pipeline.pkl")
print("Random Forest pipeline saved successfully!")

Random Forest pipeline saved successfully!


In [16]:


fraud_df['is_fraud'] = np.where(fraud_df['class'] == 'fraud', 1, 0)
fraud_df['is_fraud'].value_counts(normalize=True)



is_fraud
0    1.0
Name: proportion, dtype: float64