In [112]:
import sys
import os

from datetime import datetime

import polars as pl
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


src_path = os.path.abspath(os.path.join(os.getcwd(), "..", "src"))
sys.path.append(src_path)
from utils import lift_metric_plot

In [3]:
pl_data = pl.read_csv("/Users/kurmangazykarabekov/Desktop/собесы/12Go Asia/data.csv")

# Prepare data

**Conver date from object to datetime**

In [4]:
pl_data = (
    pl_data.with_columns(
        pl.col("createdon")
        .str.strptime(
            pl.Datetime,
            format="%Y-%m-%d %H:%M:%S%z",
        )
        .cast(pl.Datetime("us", time_zone="UTC"))
    )
    .with_columns(
        pl.col("date_of_birth")
        .str.strptime(
            pl.Datetime,
            format="%Y-%m-%d",
        )
        .cast(pl.Datetime("us", time_zone="UTC"))
    )
    .with_columns(
        pl.col("godate")
        .str.strptime(
            pl.Datetime,
            format="%Y-%m-%d %H:%M:%S%z",
        )
        .cast(pl.Datetime("us", time_zone="UTC"))
    )
    .with_columns(
        pl.col("paidon")
        .str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S.%f%z", strict=False)
        .cast(pl.Datetime("us", time_zone="UTC"))
    )
)

  .str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S.%f%z", strict=False)


**Let's sort all the events so that we can further split the data by booking creation**

In [5]:
pl_data = pl_data.sort("createdon", descending=False)

**Prepare null values**

In [6]:
pl_data = pl_data.with_columns(
    pl.col("channel").fill_null("unknown"),
    pl.col("cust_name").fill_null("unknown"),
    pl.col("nationality").fill_null("unknown"),
    pl.col("payer_country").fill_null("unknown"),
    pl.col("proxy").fill_null(0.0),
    pl.col("tor").fill_null(0.0),
    pl.col("vpn").fill_null(0.0),
    pl.col("recent_abuse").fill_null(0.0),
)

In [7]:
print(f"Min date of booking {pl_data['createdon'].min()}")
print(f"Max date of booking {pl_data['createdon'].max()}")

Min date of booking 2021-04-30 17:21:17+00:00
Max date of booking 2023-11-30 16:59:52+00:00


# Features engineering

In [8]:
# 1. Features based on email_score and other score
pl_data = pl_data.with_columns(
    [
        (pl.col("email_score") * pl.col("passenger_score")).alias("score_interaction"),
        ((pl.col("email_score") + pl.col("passenger_score")) / 2).alias(
            "avg_risk_score"
        ),
        (pl.max_horizontal(["email_score", "passenger_score"])).alias("max_risk_score"),
        # Deviation from the average score
        (pl.col("email_score") - pl.col("email_score").mean()).alias(
            "email_score_deviation"
        ),
    ]
)
# 2. Price-based features (net price_thb)
pl_data = pl_data.with_columns(
    [  # Price per seat
        (pl.col("netprice_thb") / pl.col("seats")).alias("price_per_seat"),
        # Price deviation from the average for this type of transport
        (
            pl.col("netprice_thb") - pl.col("netprice_thb").mean().over("vehclass_id")
        ).alias("price_deviation"),
        # Percentage deviation of the price
        (
            (pl.col("netprice_thb") - pl.col("netprice_thb").mean().over("vehclass_id"))
            / pl.col("netprice_thb").mean().over("vehclass_id")
            * 100
        ).alias("price_deviation_percent"),
    ]
)
# 3. Security Features
pl_data = pl_data.with_columns(
    [
        (
            pl.col("proxy") + pl.col("vpn") + pl.col("tor") + pl.col("recent_abuse")
        ).alias("security_flags_sum"),
        (
            (pl.col("proxy") + pl.col("vpn") + pl.col("tor") + pl.col("recent_abuse"))
            > 0
        ).alias("has_security_flag"),
    ]
)

# 4. Features based on payment attempts (p_attempts)
pl_data = pl_data.with_columns(
    [
        (pl.col("p_attempts") > 2).alias("multiple_attempts"),
        (pl.col("p_attempts").log1p()).alias("log_attempts"),
    ]
)

# 5. Channel aggregations
channel_stats = pl_data.group_by("channel").agg(
    [
        (pl.col("netprice_thb").mean()).alias("channel_avg_price"),
        (pl.col("seats").mean()).alias("channel_avg_seats"),
        (pl.col("email_score").mean()).alias("channel_avg_score"),
    ]
)

pl_data = pl_data.join(channel_stats, on="channel")

# 6. Combined features for the type of transport (vehclass_id)
pl_data = pl_data.with_columns(
    [  # Average price per seat for this type of transport
        (
            pl.col("netprice_thb").mean().over("vehclass_id")
            / pl.col("seats").mean().over("vehclass_id")
        ).alias("vehclass_avg_price_per_seat"),
        # Deviation from the average price per seat
        (
            (pl.col("netprice_thb") / pl.col("seats"))
            - (
                pl.col("netprice_thb").mean().over("vehclass_id")
                / pl.col("seats").mean().over("vehclass_id")
            )
        ).alias("price_per_seat_deviation"),
    ]
)

# 7. Interactions between important features
pl_data = pl_data.with_columns(
    [
        (pl.col("email_score") * pl.col("netprice_thb")).alias(
            "score_price_interaction"
        ),
        (pl.col("email_score") * pl.col("security_flags_sum")).alias(
            "score_security_interaction"
        ),
    ]
)

# 8. Features for the insurance flag (insurance_flg)
pl_data = pl_data.with_columns(
    [  # Price-to-average ratio with/without insurance
        (
            pl.col("netprice_thb") / pl.col("netprice_thb").mean().over("insurance_flg")
        ).alias("price_to_insurance_avg"),
    ]
)

# 9. Creating time-related features
pl_data = pl_data.with_columns(
    [  # The difference between the creation and payment time (in hours)
        ((pl.col("paidon") - pl.col("createdon")).dt.total_seconds() / 3600).alias(
            "hours_to_pay"
        ),
        # The difference between the payment and the trip (in days)
        ((pl.col("godate") - pl.col("paidon")).dt.total_seconds() / (3600 * 24)).alias(
            "days_to_trip"
        ),
        # Difference between creation and trip (in days)
        (
            (pl.col("godate") - pl.col("createdon")).dt.total_seconds() / (3600 * 24)
        ).alias("days_from_creation_to_trip"),
    ]
)

# 10. We add more complex features over time
pl_data = pl_data.with_columns(
    [
        (pl.col("hours_to_pay") < 1 / 60).alias("instant_payment"),
        (pl.col("hours_to_pay") < 5 / 60).alias("suspicious_fast_payment"),
        (pl.col("hours_to_pay") > 24).alias("long_payment"),
        (pl.col("days_to_trip") < 7).alias("near_trip"),
        (pl.col("days_to_trip") > 90).alias("far_future_trip"),
    ]
)

# 11. Basic age features
pl_data = pl_data.with_columns(
    [
        ((pl.col("createdon") - pl.col("date_of_birth")).dt.days() / 365.25).alias(
            "age"
        ),
        ((pl.col("godate") - pl.col("date_of_birth")).dt.days() / 365.25).alias(
            "age_at_trip"
        ),
    ]
)

  ((pl.col("createdon") - pl.col("date_of_birth")).dt.days() / 365.25).alias(
  ((pl.col("godate") - pl.col("date_of_birth")).dt.days() / 365.25).alias(


# Split data by time

**To separate the data, we will use the date 2023-08-01 since the distribution of the target event will be approximately the same for training and testing**

In [48]:
date_split_utc = pl.datetime(2023, 8, 1, 0, 0, 0).cast(
    pl.Datetime("us", time_zone="UTC")
)

In [49]:
pl_data.filter(pl.col("createdon") < date_split_utc)[
    "isFraud"
].to_pandas().value_counts(normalize=True)

isFraud
0    0.99788
1    0.00212
Name: proportion, dtype: float64

In [50]:
pl_data.filter(pl.col("createdon") >= date_split_utc)[
    "isFraud"
].to_pandas().value_counts(normalize=True)

isFraud
0    0.997435
1    0.002565
Name: proportion, dtype: float64

In [51]:
pl_data.filter(pl.col("createdon") < date_split_utc)[
    "isFraud"
].to_pandas().value_counts()

isFraud
0    2380558
1       5058
Name: count, dtype: int64

In [52]:
pl_data.filter(pl.col("createdon") >= date_split_utc)[
    "isFraud"
].to_pandas().value_counts()

isFraud
0    1075839
1       2767
Name: count, dtype: int64

# Train model and prepare cat features for Isolation Forest

In [53]:
cat_features = ["channel", "nationality", "payer_country", "role_id", "vehclass_id"]

In [54]:
train_data = pl_data.filter(pl.col("createdon") < date_split_utc)
test_data = pl_data.filter(pl.col("createdon") >= date_split_utc)

In [55]:
X_train_iso = train_data.clone()
X_test_iso = test_data.clone()

encoders = {}

for cat_col in cat_features:
    le = LabelEncoder()
    encoders[cat_col] = le

    train_values = X_train_iso[cat_col].cast(str).to_numpy()
    encoded_train = le.fit_transform(train_values)
    X_train_iso = X_train_iso.with_columns(
        pl.Series(name=f"{cat_col}_encoded", values=encoded_train)
    )

    test_values = X_test_iso[cat_col].cast(str).to_numpy()
    encoded_test = [
        le.transform([x])[0] if x in le.classes_ else -1 for x in test_values
    ]
    X_test_iso = X_test_iso.with_columns(
        pl.Series(name=f"{cat_col}_encoded", values=encoded_test)
    )

In [56]:
print("Train shape:", X_train_iso.shape)
print("Test shape:", X_test_iso.shape)

Train shape: (2385616, 63)
Test shape: (1078606, 63)


In [74]:
type_features = pd.DataFrame({'Columns': X_train_iso.columns, 'Type': X_train_iso.dtypes})

In [79]:
final_features = X_train_iso.drop(
    columns=[
        "bid",
        "channel",
        "createdon",
        "cust_name",
        "date_of_birth",
        "email",
        "godate",
        "ip",
        "nationality",
        "paidon",
        "payer_name",
        "payer_country",
        "role_id",
        "useragent",
        "usr_name",
        "vehclass_id",
    ]
).columns

In [100]:
X_train_final = X_train_iso.select(final_features).to_pandas().fillna(0)
X_test_final = X_test_iso.select(final_features).to_pandas().fillna(0)
y_test = test_data["isFraud"].to_pandas()

In [113]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)

In [114]:
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_final.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test_final.columns)

In [96]:
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.00212, 
    random_state=42,
    n_jobs=-1,
)

In [115]:
iso_forest.fit(X_train_scaled)

In [116]:
y_pred = (iso_forest.predict(X_test_scaled) == -1).astype(int)

In [117]:
anomaly_scores = -iso_forest.score_samples(X_test_scaled)

In [118]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1075839
           1       0.30      0.33      0.31      2767

    accuracy                           1.00   1078606
   macro avg       0.65      0.66      0.66   1078606
weighted avg       1.00      1.00      1.00   1078606



In [119]:
print(confusion_matrix(y_test, y_pred))

[[1073680    2159]
 [   1852     915]]


In [120]:
results_df = test_data.to_pandas()
results_df["anomaly_score"] = anomaly_scores
results_df["predicted_fraud"] = y_pred

In [121]:
false_positives = results_df[
    (results_df["predicted_fraud"] == 1) & (results_df["isFraud"] == 0)
]
false_negatives = results_df[
    (results_df["predicted_fraud"] == 0) & (results_df["isFraud"] == 1)
]

print("\nFalse Positive Rate:", len(false_positives) / len(results_df))
print(
    "False Negative Rate:",
    len(false_negatives) / len(results_df[results_df["isFraud"] == 1]),
)


False Positive Rate: 0.002001657695210299
False Negative Rate: 0.6693169497650886


In [122]:
print("\nTop 10 most anomalous transactions:")
print(
    results_df.nlargest(10, "anomaly_score")[
        ["anomaly_score", "isFraud", "predicted_fraud", "email_score", "netprice_thb"]
    ]
)


Top 10 most anomalous transactions:
        anomaly_score  isFraud  predicted_fraud  email_score  netprice_thb
241522       0.751966        1                1         10.0  14287.441406
343349       0.750572        1                1         10.0  15589.012695
125226       0.745960        1                1         10.0  10929.082031
213723       0.743714        1                1         10.0  13225.910156
187424       0.741908        1                1          5.0  17436.582031
269220       0.741186        1                1         10.0   8027.323242
241528       0.741137        1                1         10.0  14287.441406
88543        0.740667        1                1         10.0   8997.291992
538258       0.739042        1                1         10.0   8970.461914
187345       0.738417        1                1          5.0  28198.468750


In [123]:
temp_df = X_test_final.copy()
temp_df["anomaly_score"] = anomaly_scores

correlations = temp_df.corrwith(temp_df["anomaly_score"])

feature_importance = pd.DataFrame(
    {
        "feature": correlations.index[:-1],  
        "correlation": correlations.values[:-1],
    }
)

print("\nFeature correlations with anomaly scores (top 10):")
print(feature_importance.sort_values("correlation", key=abs, ascending=False).head(10))

  c /= stddev[:, None]
  c /= stddev[None, :]



Feature correlations with anomaly scores (top 10):
                        feature  correlation
20           security_flags_sum     0.702468
21            has_security_flag     0.610482
8                         proxy     0.610474
9                  recent_abuse     0.587580
12                          vpn     0.551721
30   score_security_interaction     0.394026
45              role_id_encoded    -0.355283
27  vehclass_avg_price_per_seat     0.347881
23                 log_attempts     0.315284
5                    p_attempts     0.277389
