In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [10]:
# Read the pre-split training and hold-out transaction tables from the working directory.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("fraudTrain.csv", "fraudTest.csv")
)

In [13]:
# Columns excluded from modelling (judging by their names: a stray CSV index,
# identifiers, personal details and free-text fields).
drop_cols = ['Unnamed: 0', 'cc_num', 'first', 'last', 'gender', 'street', 'city', 'state',
             'zip', 'job', 'dob', 'trans_num', 'merchant']

# Rebind instead of mutating in place, and skip columns that are already gone,
# so re-running this cell is a harmless no-op rather than a KeyError.
train_df = train_df.drop(columns=drop_cols, errors="ignore")
test_df = test_df.drop(columns=drop_cols, errors="ignore")

In [23]:
# Quick look at the current schema and at how the transaction categories are distributed.
# NOTE(review): the saved output of this cell lists 'trans_date_ts' and
# 'category_index', which are only created further down, so it was captured
# after those cells had already run; a fresh top-to-bottom run will print a
# different column list.
category_series = train_df['category']
print("All Columns : ", list(train_df.columns))
print("Types of category : ", sorted(category_series.unique()))
print("Count of category : ", category_series.value_counts())

All Columns :  ['category', 'amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud', 'trans_date_ts', 'category_index']
Types of category :  ['entertainment', 'food_dining', 'gas_transport', 'grocery_net', 'grocery_pos', 'health_fitness', 'home', 'kids_pets', 'misc_net', 'misc_pos', 'personal_care', 'shopping_net', 'shopping_pos', 'travel']
Count of category :  category
gas_transport     131659
grocery_pos       123638
home              123115
shopping_pos      116672
kids_pets         113035
shopping_net       97543
entertainment      94014
food_dining        91461
personal_care      90758
health_fitness     85879
misc_pos           79655
misc_net           63287
grocery_net        45452
travel             40507
Name: count, dtype: int64


In [17]:
def _to_epoch_seconds(datetimes):
    """Return whole seconds since 1970-01-01 for a Series of datetime-like values.

    Subtracting the epoch and floor-dividing by one second does not depend on
    the datetime64 resolution pandas chooses when parsing, unlike reinterpreting
    the values as int64 and dividing by 10**9 (which silently assumes
    nanoseconds). It also avoids ``Series.view``, which is deprecated in recent
    pandas releases.
    """
    elapsed = pd.to_datetime(datetimes) - pd.Timestamp("1970-01-01")
    return elapsed // pd.Timedelta(seconds=1)


# Convert datetime to UNIX timestamp (in seconds)
train_df['trans_date_ts'] = _to_epoch_seconds(train_df['trans_date_trans_time'])
test_df['trans_date_ts'] = _to_epoch_seconds(test_df['trans_date_trans_time'])

# Drop the original datetime columns now that the numeric timestamp exists
train_df = train_df.drop(columns=['trans_date_trans_time'])
test_df = test_df.drop(columns=['trans_date_trans_time'])

In [18]:
# Give rows with a missing category an explicit 'unknown' label in both splits.
train_df['category'] = train_df['category'].where(train_df['category'].notna(), 'unknown')
test_df['category'] = test_df['category'].where(test_df['category'].notna(), 'unknown')

In [19]:
# Learn the category -> integer mapping from the training split only, then
# apply that same mapping to both splits.
le = LabelEncoder().fit(train_df['category'])
train_df['category_index'] = le.transform(train_df['category'])
test_df['category_index'] = le.transform(test_df['category'])

In [20]:
# Model input columns (in the order used for the feature matrix) and the
# `is_fraud` target, taken from each split.
features = [
    'amt', 'lat', 'long', 'city_pop', 'unix_time',
    'merch_lat', 'merch_long', 'trans_date_ts', 'category_index',
]
X_train, y_train = train_df[features], train_df['is_fraud']
X_test, y_test = test_df[features], test_df['is_fraud']

In [21]:
# Scale features.
# The scaler is fitted on the training split only and then reused for the test
# split (and later for ad-hoc inputs). Both matrices are derived from the
# source frames rather than from X_train / X_test themselves, so re-running
# this cell can never refit the scaler on data that has already been scaled.
scaler = StandardScaler().fit(train_df[features])
X_train = scaler.transform(train_df[features])
X_test = scaler.transform(test_df[features])

In [None]:
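# NOTE: this whole cell is a commented-out earlier draft of the model setup;
# the active `models` / `fitted_models` definitions live in the next cell.
# from sklearn.linear_model import LogisticRegression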

# log_reg = LogisticRegression(
#     max_iter=200,            # The maximum number of iterations taken for the solvers to converge
#     solver='lbfgs',          # Optimization algorithm. 'lbfgs' is efficient for large datasets
#     penalty='l2',            # Regularization technique ('l1', 'l2', 'elasticnet', or None; the 'lbfgs' solver supports only 'l2' or None)
#     C=1.0,                   # Inverse of regularization strength; smaller values specify stronger regularization
#     random_state=42
# )

# from sklearn.ensemble import RandomForestClassifier

# random_forest = RandomForestClassifier(
#     n_estimators=20,         # The number of trees in the forest
#     max_depth=10,            # Maximum depth of each tree (limits overfitting)
#     max_features='sqrt',     # Number of features considered when splitting a node ('sqrt', 'log2', an int/float, or None; 'auto' was removed in scikit-learn 1.3)
#     min_samples_split=2,     # The minimum number of samples required to split an internal node
#     min_samples_leaf=1,      # The minimum number of samples required to be at a leaf node
#     n_jobs=-1,               # Number of CPU cores to use (-1 means using all cores)
#     random_state=42
# )

# from sklearn.ensemble import GradientBoostingClassifier

# gradient_boost = GradientBoostingClassifier(
#     n_estimators=20,        # Number of boosting stages
#     learning_rate=0.1,      # Step size shrinking to prevent overfitting
#     max_depth=3,            # Maximum depth of individual trees (tree complexity)
#     min_samples_split=2,    # Minimum samples required to split a node
#     min_samples_leaf=1,     # Minimum samples required at leaf node
#     random_state=42
# )

# from sklearn.tree import DecisionTreeClassifier

# decision_tree = DecisionTreeClassifier(
#     criterion='gini',       # The function to measure the quality of a split ('gini' or 'entropy')
#     max_depth=10,           # Maximum depth of the tree
#     min_samples_split=2,    # Minimum number of samples required to split a node
#     min_samples_leaf=1,     # Minimum number of samples required at a leaf node
#     random_state=42
# )

# from sklearn.neural_network import MLPClassifier

# mlp = MLPClassifier(
#     hidden_layer_sizes=(50, 25),   # The number of neurons in each hidden layer
#     activation='relu',              # Activation function for the hidden layers ('relu', 'tanh', 'logistic')
#     solver='adam',                  # The solver for weight optimization ('adam', 'sgd', 'lbfgs')
#     max_iter=200,                   # Maximum number of iterations
#     alpha=0.0001,                   # L2 penalty (regularization term)
#     random_state=42
# )

# from sklearn.svm import SVC

# svc = SVC(
#     kernel='linear',             # Specifies the kernel type ('linear', 'poly', 'rbf', 'sigmoid')
#     C=1.0,                       # Regularization parameter. Higher values = less regularization
#     gamma='scale',               # Kernel coefficient for 'rbf', 'poly', and 'sigmoid' kernels
#     probability=True,            # Enable probability estimates for prediction
#     random_state=42
# )

# import xgboost as xgb

# xgboost_model = xgb.XGBClassifier(
#     n_estimators=50,          # Number of boosting rounds
#     max_depth=6,              # Maximum depth of each tree
#     learning_rate=0.1,        # Step size for each boosting round
#     subsample=0.8,            # Fraction of training samples to use for each tree
#     colsample_bytree=0.8,     # Fraction of features to use for each tree
#     random_state=42
# )

# models = {
#     "Logistic Regression": log_reg,
#     "Random Forest": random_forest,
#     "Gradient Boosted Trees": gradient_boost,
#     "Decision Tree": decision_tree,
#     "MLP Classifier": mlp,
#     "Linear SVM": svc,
#     "XGBoost": xgboost_model
# }

# fitted_models = {}
# for name, model in models.items():
#     print(f"\nTraining {name}...")
#     model.fit(X_train, y_train)
#     fitted_models[name] = model


In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC  # SVC import retained for any other cell that still uses it
import xgboost as xgb

# One shared seed so every stochastic estimator gives the same result on re-run.
RANDOM_STATE = 42

# Registry of candidate classifiers, keyed by the display name used in the
# training log and in the evaluation output.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Random Forest": RandomForestClassifier(
        n_estimators=20,
        max_depth=10,
        max_features='sqrt',
        n_jobs=-1,
        random_state=RANDOM_STATE
    ),
    "Gradient Boosted Trees": GradientBoostingClassifier(
        n_estimators=20,
        random_state=RANDOM_STATE
    ),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "MLP Classifier": MLPClassifier(
        hidden_layer_sizes=(10, 5),
        max_iter=300,
        random_state=RANDOM_STATE
    ),
    # SVC(kernel="linear", probability=True) scales at least quadratically with
    # the number of samples and did not get past this step in the saved run.
    # LinearSVC trains a linear SVM with a solver that scales to large data
    # (note: it optimises the squared hinge loss by default); wrapping it in
    # sigmoid calibration keeps predict_proba available for the evaluation and
    # ensembling cells. dual=False is the recommended setting when there are
    # more samples than features.
    "Linear SVM": CalibratedClassifierCV(
        LinearSVC(dual=False, random_state=RANDOM_STATE),
        method="sigmoid"
    ),
    # use_label_encoder is no longer an XGBClassifier parameter in current
    # XGBoost releases, so it is not passed.
    "XGBoost": xgb.XGBClassifier(
        n_estimators=50,
        eval_metric='logloss',
        random_state=RANDOM_STATE
    )
}

# Fit every model on the scaled training matrix and keep the fitted estimators
# under the same display names.
fitted_models = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    fitted_models[name] = model




Training Logistic Regression...

Training Random Forest...

Training Gradient Boosted Trees...

Training Decision Tree...

Training MLP Classifier...

Training Linear SVM...


In [None]:
def evaluate(name, model, X_test, y_test):
    """Print hold-out classification metrics for one fitted model.

    Parameters
    ----------
    name : str
        Display name used in the report header.
    model : fitted estimator
        Must implement ``predict``. For ROC AUC, ``predict_proba`` is used when
        available; otherwise ``decision_function`` scores are used; if the
        model has neither, ROC AUC is reported as "N/A".
    X_test : array-like
        Feature matrix to score, prepared the same way as the training matrix.
    y_test : array-like
        True labels aligned with ``X_test``.

    Returns
    -------
    None
        All results are written to stdout.
    """
    y_pred = model.predict(X_test)

    # ROC AUC needs a continuous score, not hard labels: prefer the
    # positive-class probability and fall back to the raw decision values,
    # which roc_auc_score accepts as well.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    print(f"\n📊 Evaluation for {name}")
    print("ROC AUC Score:", roc_auc_score(y_test, y_score) if y_score is not None else "N/A")
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# Report hold-out metrics for every trained model, in training order.
for name, model in fitted_models.items():
    evaluate(name=name, model=model, X_test=X_test, y_test=y_test)


In [None]:
# Example transaction to score. 'category' has to be a label the fitted
# encoder `le` already knows, because it is passed through le.transform below.
user_input = dict(
    amt=100.0,
    lat=37.7749,
    long=-122.4194,
    city_pop=50000,
    unix_time=1325376018,
    merch_lat=37.0,
    merch_long=-122.0,
    trans_date_ts=1577836800,
    category='misc_pos',
)

# Prepare user input: encode the category, select the model columns in the
# training order, and apply the scaler fitted on the training data.
(category_code,) = le.transform([user_input['category']])
user_input['category_index'] = category_code
user_X = scaler.transform(pd.DataFrame([user_input])[features])

# Predict with all models: collect each model's hard label and, where the
# model exposes predict_proba, its probability for the fraud class.
if not fitted_models:
    # Without this guard the vote below would fail with a ZeroDivisionError-style
    # surprise; say what is actually wrong instead.
    raise ValueError("No fitted models available; run the training cell first.")

preds = []
probs = []

for name, model in fitted_models.items():
    preds.append(int(model.predict(user_X)[0]))
    if hasattr(model, "predict_proba"):
        probs.append(float(model.predict_proba(user_X)[0][1]))

# Majority vote, stated explicitly: fraud only when strictly more than half of
# the models predict fraud. An exact tie resolves to legit, which is what the
# previous round()-based formula produced via round-half-to-even.
fraud_votes = sum(preds)
final_vote = int(2 * fraud_votes > len(preds))

# Mean fraud probability over the models that provide one; NaN if none do
# (avoids numpy's empty-mean warning).
final_prob = float(np.mean(probs)) if probs else float("nan")

print("\n🧠 Ensemble Prediction for User Input:")
print(f"Average Probability of Fraud: {final_prob:.4f}")
print(f"Ensembled (Majority) Prediction: {'Fraud' if final_vote == 1 else 'Legit'}")

