In [None]:
%pip install pandas numpy scikit-learn xgboost geopy


In [20]:
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from geopy.distance import geodesic
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier



In [21]:
# Load the raw transactions; point DATA_PATH at your actual file name.
DATA_PATH = "fraudTrain.csv"
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [22]:
# Parse the timestamp columns so the .dt accessor can be used below.
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['dob'] = pd.to_datetime(df['dob'])

trans_ts = df['trans_date_trans_time']
birth_date = df['dob']

# Age in completed years at the time of the transaction. A bare year
# difference over-counts by one whenever the transaction falls before that
# year's birthday, so subtract one in that case. Month and day are folded into
# a single MMDD integer so the "birthday not reached yet" test is one compare.
birthday_pending = (
    (trans_ts.dt.month * 100 + trans_ts.dt.day)
    < (birth_date.dt.month * 100 + birth_date.dt.day)
)
df['age'] = trans_ts.dt.year - birth_date.dt.year - birthday_pending.astype(int)

# Calendar features derived from the transaction timestamp.
df['trans_hour'] = trans_ts.dt.hour
df['trans_day'] = trans_ts.dt.day
df['trans_month'] = trans_ts.dt.month


In [None]:
def calc_distance(row):
    """Return the geodesic distance in kilometres for one transaction row.

    Parameters
    ----------
    row : mapping-like
        Must provide 'lat'/'long' and 'merch_lat'/'merch_long'; the first pair
        is passed to ``geodesic`` as the start point and the second as the end
        point.

    Returns
    -------
    float
        The ``.km`` value of the geodesic between the two points.
    """
    return geodesic((row['lat'], row['long']), (row['merch_lat'], row['merch_long'])).km


# Same per-row geodesic as calc_distance, but iterating the four coordinate
# columns directly instead of df.apply(axis=1): row-wise apply builds a full
# Series (all columns) for every row, which is pure overhead here.
df['distance'] = [
    geodesic((lat, lon), (m_lat, m_lon)).km
    for lat, lon, m_lat, m_lon in zip(
        df['lat'], df['long'], df['merch_lat'], df['merch_long']
    )
]


In [None]:
# Binning age
def age_bin(age):
    """Map a numeric age to its bracket label.

    Brackets are half-open on the upper side: an age goes into the first
    bracket whose upper bound it is strictly below. Anything that is not below
    50 (including values for which every comparison is false, such as NaN)
    falls through to '50+'.
    """
    upper_bounds = (
        (20, 'Under 20'),
        (30, '20-29'),
        (40, '30-39'),
        (50, '40-49'),
    )
    for limit, label in upper_bounds:
        if age < limit:
            return label
    return '50+'

df['age_bin'] = df['age'].apply(age_bin)

# Gender mapping: expand the single-letter codes to full labels.
# `replace` is used instead of `map` because `map` turns every value that is
# not a key of the dict into NaN. Re-running this cell after the codes have
# already been expanded would therefore wipe the whole column; `replace`
# leaves non-matching values untouched, so the cell is safe to re-run.
df['gender'] = df['gender'].replace({'M': 'Male', 'F': 'Female'})


In [None]:
# Columns fed to the model: numeric inputs plus the categorical columns that
# are one-hot encoded in a later cell. Order is kept as-is because it
# determines the column order of the design matrix.
features = [
    'amt',
    'city_pop',
    'category',
    'state',
    'gender',
    'age_bin',
    'distance',
    'trans_hour',
    'trans_day',
    'trans_month',
]

# Design matrix and binary target.
X = df.loc[:, features]
y = df.loc[:, 'is_fraud']


In [None]:
# One-hot encode the categorical inputs. drop_first=True removes the first
# level of each column, which then acts as the implicit baseline.
categorical_cols = ['category', 'state', 'gender', 'age_bin']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)


In [None]:
numeric_cols = ['amt', 'city_pop', 'distance', 'trans_hour', 'trans_day', 'trans_month']

# Fit on the raw values held in `df`, not on `X` itself. X is overwritten with
# the scaled values below, so fitting on X would make a second run of this
# cell refit the scaler on already-standardised data. The scaler would then no
# longer map raw inputs correctly when it is reused at prediction time.
# X was built from df without reordering rows, so the rows line up.
# NOTE(review): the scaler is fit on every row before the train/test split in
# the next cell, so test-set statistics leak into the scaling. Consider
# splitting first and fitting on the training rows only.
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(df[numeric_cols])


In [None]:
# Hold out 20% of the rows for evaluation; stratify on the target so both
# splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Handle class imbalance: weight the positive class by the ratio of negative
# to positive training examples.
class_counts = np.bincount(y_train)
neg, pos = class_counts
scale_pos_weight = neg / pos
print(f"Negative: {neg}, Positive: {pos}, scale_pos_weight: {scale_pos_weight:.2f}")


Negative: 1031335, Positive: 6005, scale_pos_weight: 171.75


In [None]:
# Gradient-boosted classifier. `scale_pos_weight` comes from the class counts
# computed on the training split. `use_label_encoder` is intentionally not
# passed: current XGBoost releases no longer use it and only emit a
# "parameters are not used" warning on every fit.
model = XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    max_depth=5,
    learning_rate=0.05,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Cross-validated AUC score on the training split (5 folds, run in parallel).
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1)
print(f"Cross-validated AUC: {cv_scores.mean():.4f}")




KeyboardInterrupt: 

In [None]:
# Train on the training split, then score the held-out split.
model.fit(X_train, y_train)

# Positive-class probabilities (second column) and hard class predictions.
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

report = classification_report(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(report)
print("ROC AUC Score:", auc)


Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    257834
           1       0.96      0.84      0.90      1501

    accuracy                           1.00    259335
   macro avg       0.98      0.92      0.95    259335
weighted avg       1.00      1.00      1.00    259335

ROC AUC Score: 0.9988923327781195


In [None]:
importances = model.feature_importances_
feat_names = X.columns

# X carries one column per one-hot level of category/state/gender/age_bin, so
# drawing a bar for every column crowds the axis. Rank the features and show
# only the TOP_N most important ones.
TOP_N = 20
top_importances = (
    pd.Series(importances, index=feat_names)
    .sort_values(ascending=False)
    .head(TOP_N)
)

# Explicit figure/axes so the plot does not depend on pyplot's global state.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_importances.to_numpy(),
    y=top_importances.index.to_numpy(),
    ax=ax,
)
ax.set_title(f"XGBoost Feature Importance (top {len(top_importances)})")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()
plt.show()


In [None]:
def preprocess_input(data, template_df, scaler):
    """Turn one raw transaction record into a single-row model input frame.

    Parameters
    ----------
    data : mapping
        Raw field values. Must contain 'category' and 'state'. The fields
        'trans_hour', 'trans_day', 'trans_month', 'gender', 'age_bin' and
        'distance' are optional; when absent the fallback values below are
        used. Keys that do not correspond to a column of ``template_df`` are
        ignored.
    template_df : pandas.DataFrame
        Frame whose columns define the exact feature set and order the result
        must have.
    scaler : object with ``transform``
        Scaler applied to the numeric columns.

    Returns
    -------
    pandas.DataFrame
        One row with the same columns, in the same order, as ``template_df``.
        One-hot columns missing from the input are 0. Categorical levels with
        no matching template column are dropped, so they are encoded the same
        way as the baseline level.
    """
    # Fallbacks only: values supplied by the caller take precedence.
    defaults = {
        'trans_hour': 12,
        'trans_day': 15,
        'trans_month': 6,
        'gender': 'Male',
        'age_bin': '30-39',
        'distance': 10,
    }
    categorical_cols = ['category', 'state', 'gender', 'age_bin']
    numeric_cols = ['amt', 'city_pop', 'distance', 'trans_hour', 'trans_day', 'trans_month']

    df_input = pd.DataFrame([{**defaults, **data}])

    # One-hot encode. drop_first must stay False here: with a single row each
    # categorical has exactly one level, and dropping the "first" level would
    # remove the only indicator column, erasing the categorical information.
    df_input = pd.get_dummies(df_input, columns=categorical_cols, drop_first=False)

    # Align with the template's features: add missing columns as 0, discard
    # columns the template does not have, and match its column order.
    df_input = df_input.reindex(columns=template_df.columns, fill_value=0)

    # Scale numeric values
    df_input[numeric_cols] = scaler.transform(df_input[numeric_cols])

    return df_input


Fraud Probability: 0.00
Prediction: NOT FRAUD


In [None]:
# Hand-written example transaction to score.
# NOTE(review): the category values visible in the loaded data are lowercase
# snake_case (e.g. misc_net, grocery_pos); "Shopping" presumably matches none
# of the training levels. Confirm against df['category'].unique().
custom_input = {
    "cc_num": 1234567890123456,
    "merchant": "Amazon",
    "category": "Shopping",
    "amt": 100.50,
    "city_pop": 50000,
    "state": "CA",
}

# Build the single-row feature frame and read off the positive-class probability.
processed = preprocess_input(custom_input, X_train, scaler)
prob = model.predict_proba(processed)[0][1]

# Default 0.5 decision threshold.
if prob > 0.5:
    prediction = "FRAUD"
else:
    prediction = "NOT FRAUD"

print(f"Fraud Probability: {prob:.2f} → Prediction: {prediction}")
