In [13]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np
import math
from hdfs import InsecureClient
import pandas as pd
from io import BytesIO

def haversine(lat, long, merch_lat, merch_long):
    R = 6371.0  # Earth radius in kilometers

    # Convert degrees to radians
    lat1_rad = math.radians(lat)
    lon1_rad = math.radians(long)
    lat2_rad = math.radians(merch_lat)
    lon2_rad = math.radians(merch_long)

    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    a = math.sin(dlat / 2) ** 2 + \
        math.cos(lat1_rad) * math.cos(lat2_rad) * \
        math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    distance = R * c
    return distance



In [None]:
client = InsecureClient('http://hadoop-namenode:9870', user='root')
with client.read('/data') as reader:
    df = pd.read_csv(reader)

In [None]:
current_date = pd.Timestamp('today').normalize()
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = ((current_date - df['dob']).dt.days / 365.25).astype(int)
df['distance'] = df.apply(lambda row: haversine(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)

df = df.drop(columns=["Unnamed: 0", "trans_date_trans_time", "trans_num", "dob", "unix_time","lat","long","merch_lat","merch_long"])



# Separate features and target
X = df.drop(columns=["is_fraud"])
y = df["is_fraud"]

# Detect categorical and numeric columns
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

In [14]:
categorical_cols,numeric_cols

(['merchant',
  'category',
  'first',
  'last',
  'gender',
  'street',
  'city',
  'state',
  'job'],
 ['cc_num', 'amt', 'zip', 'city_pop', 'age', 'distance'])

In [21]:
from imblearn.under_sampling import RandomUnderSampler
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols)
    ]
)

# Example: create pipeline for training with XGBoost
from xgboost import XGBClassifier

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(
        use_label_encoder=False,
        eval_metric="logloss",
        random_state=42
    ))
])
sampler = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = sampler.fit_resample(X, y)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Fit model
model.fit(X_train, y_train)

# Evaluate quickly
print("Train score:", model.score(X_train, y_train))
print("Test score:", model.score(X_test, y_test))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Train score: 0.9908402031809477
Test score: 0.968031968031968


In [22]:
from sklearn.metrics import precision_score, recall_score
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
print("Train precision:", precision_score(y_train, y_train_pred))
print("Train recall:", recall_score(y_train, y_train_pred))

print("Test precision:", precision_score(y_test, y_test_pred))
print("Test recall:", recall_score(y_test, y_test_pred))

Train precision: 0.9885629040278469
Train recall: 0.9931723563696919
Test precision: 0.9649238914626076
Test recall: 0.9713524317121919


In [23]:
import joblib

# Save
joblib.dump(model, "pipeline_xgb.pkl")

# # Load
# loaded_model = joblib.load("pipeline_xgb.pkl")

# # Predict
# loaded_model.predict(new_data)

['pipeline_xgb.pkl']