In [81]:
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

In [315]:
og_data = pd.read_excel(rf'Files/TXS que el collector no ve NOV23 al final del mes.xlsx')
og_data = og_data.rename(columns={'Id_x':'Id'})

In [316]:
og_data = og_data.astype({'TransactionId':'str', 'SenderPhoneNumber':'str' })

In [3]:
df = og_data.copy()
df = df.set_index('CreatedAt')

X = pd.get_dummies(df[['Client', 'VendorCode', 
       'CollectMethod', 'TargetCountry',  'NetAmountUSD',
       'SourceCountry']])


from sklearn.ensemble import IsolationForest
IF = IsolationForest(random_state=0, contamination=0.1).fit(X)

df['Anomaly_scores']  = IF.decision_function(X)
df['is_anomaly']  = IF.predict(X)
df.sort_values(by='is_anomaly')
anomalias = df[df.is_anomaly==-1].sort_values(by='Anomaly_scores', ascending=True)

df.to_excel(rf'IF anomalies - NOV 23.xlsx', index=False)



# MSFT GUIDE

# Feature Engineering

In [317]:
og_data = og_data[og_data.Status=='COMPLETED']


In [318]:
og_data = og_data[['CreatedAt', 'Id', 'TransactionId', 'Client', 'VendorCode', 'TargetCountry', 'TargetCurrency', 'NetAmountUSD',
       'SourceCountry', 'SenderFirstName',
       'SenderLastName', 'SenderDocument', 'SenderPhoneNumber',
       'senderCountry', 'receiverFirstName', 'receiverLastName',
       'receiverDocument', 'receiverDocumentType', 'receiverPhoneNumber',
       'receiverCountry', 'receiverBankAccountNumber',
       'receiverBankAccountType']]

In [319]:
def get_zscore(value, mean, std):
    # calculate z-score or number of standard deviations from mean
    if (
        std == 0
        or std is None
        or str(std).lower() in ["nan", "none", "null"]
        or mean is None
    ):
        if value == 0.0:
            return 0.0
        elif value != 0:
            return value #np.log10(value + 1)
    ans = (value - mean) / std
    # only interested in increases
    #ans = max(0.0, ans)
    # take log to dampen numbers
    #ans = np.log10(ans + 1)
    return float(ans)

In [320]:
data = og_data.copy()
data['Date'] = data.CreatedAt.dt.date
zscore_columns = [
    "NetAmountUSD"
]

#usamos documento del enviador y phone number del recibidor porque son los que mas hay
ind = ['Date', 'Client', 'VendorCode', 'SourceCountry', 'TargetCountry', 'TargetCurrency', 
    'SenderDocument', 'receiverPhoneNumber', 'receiverBankAccountNumber']

means = [x + "_" + y + "_mean" for x in zscore_columns for y in ind]
stds = [x + "_" + y + "_std" for x in zscore_columns for y in ind]
zscores = [x + "_" + y + "_zscore" for x in zscore_columns for y in ind]

zscore = data[zscore_columns + ind]
zscore = zscore.fillna(0)


for metric, index in zip(means,ind):
    zscore[metric] = zscore.groupby([index])[zscore_columns].transform("mean")

for metric, index in zip(stds,ind):
    zscore[metric] = zscore.groupby([index])[zscore_columns].transform("std", ddof=1)


zscore = zscore.drop_duplicates(ind)
zscore = zscore[means + stds + ind]
data = data.merge(zscore, how="left", on=ind)


from itertools import product

for column,feature in product(zscore_columns, ind):
    data[f"{column}_{feature}_zscore"] = data.apply(
        lambda row: get_zscore(
            row[f"{column}"], row[f"{column}_{feature}_mean"], row[f"{column}_{feature}_std"]
        ),
        axis=1,
    )


### Counts by sender

In [321]:
sender_count_columns=['Id', 'Date', 'Client', 'VendorCode', 'TargetCountry', 'TargetCurrency',  'receiverPhoneNumber', 'receiverBankAccountNumber']
sender_count_ind = ['SenderDocument']

sender_counts = ["Sender_" + x + "_count" for x in sender_count_columns]

sender_count = data[sender_count_columns + sender_count_ind]
sender_count = sender_count.fillna(0)

for column  in sender_count_columns:
    #print(column)
    sender_count[f"Sender_{column}_count"] = sender_count.groupby('SenderDocument')[column].transform("nunique")

sender_count = sender_count.drop_duplicates(sender_count_ind)
sender_count = sender_count[sender_counts + sender_count_ind]

for att in sender_counts:
    if (sender_count[att].max()==1):
        sender_count.drop(columns={att}, inplace=True)


In [322]:
data = data.merge(sender_count, how="left", on=sender_count_ind)

### Counts by receiver

In [323]:
receiver_count_columns=['Id', 'Date', 'Client', 'VendorCode', 'SourceCountry', 'SenderDocument']
receiver_count_ind = ['receiverPhoneNumber']

receiver_counts = ["receiver_" + x + "_count" for x in receiver_count_columns]

receiver_count = data[receiver_count_columns + receiver_count_ind]
receiver_count = receiver_count.fillna(0)

for column  in receiver_count_columns:
    #print(column)
    receiver_count[f"receiver_{column}_count"] = receiver_count.groupby('receiverPhoneNumber')[column].transform("nunique")

receiver_count = receiver_count.drop_duplicates(receiver_count_ind)
receiver_count = receiver_count[receiver_counts + receiver_count_ind]

for att in receiver_counts:
    if (receiver_count[att].max()==1):
        receiver_count.drop(columns={att}, inplace=True)


In [324]:
data = data.merge(receiver_count, how="left", on=receiver_count_ind)

In [376]:
data.columns

Index(['CreatedAt', 'Id', 'TransactionId', 'Client', 'VendorCode',
       'TargetCountry', 'TargetCurrency', 'NetAmountUSD', 'SourceCountry',
       'SenderFirstName', 'SenderLastName', 'SenderDocument',
       'SenderPhoneNumber', 'senderCountry', 'receiverFirstName',
       'receiverLastName', 'receiverDocument', 'receiverDocumentType',
       'receiverPhoneNumber', 'receiverCountry', 'receiverBankAccountNumber',
       'receiverBankAccountType', 'Date', 'NetAmountUSD_Date_mean',
       'NetAmountUSD_Client_mean', 'NetAmountUSD_VendorCode_mean',
       'NetAmountUSD_SourceCountry_mean', 'NetAmountUSD_TargetCountry_mean',
       'NetAmountUSD_TargetCurrency_mean', 'NetAmountUSD_SenderDocument_mean',
       'NetAmountUSD_receiverPhoneNumber_mean',
       'NetAmountUSD_receiverBankAccountNumber_mean', 'NetAmountUSD_Date_std',
       'NetAmountUSD_Client_std', 'NetAmountUSD_VendorCode_std',
       'NetAmountUSD_SourceCountry_std', 'NetAmountUSD_TargetCountry_std',
       'NetAmountUSD_Ta

# Isolation Forest application

In [377]:
def apply_isolation_forest(df, n_estimators, contamination=0.01):
    """Applies Isolation Forest to a given dataset and returns the predicted anomalies."""
    clf = IsolationForest(
        n_estimators=n_estimators,
        max_samples="auto",
        contamination=contamination,
        max_features=6,
        bootstrap=False,
        n_jobs=-1,
        random_state=42,
        verbose=0,
    )
    clf.fit(df.values)
    pred = clf.predict(df.values)
    scores = clf.decision_function(df.values)
    return clf, pred, scores

# specify the metrics column names to be modelled
features = [
        'Client', 'VendorCode',
       'TargetCountry', 'TargetCurrency', 'NetAmountUSD', 'SourceCountry',
       'NetAmountUSD_Date_mean',
       'NetAmountUSD_Client_mean', 'NetAmountUSD_VendorCode_mean',
       'NetAmountUSD_SourceCountry_mean', 'NetAmountUSD_TargetCountry_mean',
       'NetAmountUSD_TargetCurrency_mean', 'NetAmountUSD_SenderDocument_mean',
       'NetAmountUSD_receiverPhoneNumber_mean',
       'NetAmountUSD_receiverBankAccountNumber_mean', 'NetAmountUSD_Date_std',
       'NetAmountUSD_Client_std', 'NetAmountUSD_VendorCode_std',
       'NetAmountUSD_SourceCountry_std', 'NetAmountUSD_TargetCountry_std',
       'NetAmountUSD_TargetCurrency_std', 'NetAmountUSD_SenderDocument_std',
       'NetAmountUSD_receiverPhoneNumber_std',
       'NetAmountUSD_receiverBankAccountNumber_std',
       'NetAmountUSD_Date_zscore', 'NetAmountUSD_Client_zscore',
       'NetAmountUSD_VendorCode_zscore', 'NetAmountUSD_SourceCountry_zscore',
       'NetAmountUSD_TargetCountry_zscore',
       'NetAmountUSD_TargetCurrency_zscore',
       'NetAmountUSD_SenderDocument_zscore',
       'NetAmountUSD_receiverPhoneNumber_zscore',
       'NetAmountUSD_receiverBankAccountNumber_zscore', 'Sender_Id_count',
       'Sender_Date_count', 'Sender_VendorCode_count',
       'Sender_receiverPhoneNumber_count',
       'Sender_receiverBankAccountNumber_count', 'receiver_Id_count',
       'receiver_Date_count', 'receiver_VendorCode_count',
       'receiver_SenderDocument_count'
]


In [378]:
X = data[features].copy()
X = X.fillna(0)

if X.shape[0] < 500:
    n_estimators = len(features) * 4 + X.shape[0] * 2
else:
    n_estimators = 100

# n_estimators = 100
print("Number of trees", n_estimators)

clf, pred, scores = apply_isolation_forest(X, n_estimators, contamination=0.01)
data["anomaly"] = pred
data["score"] = scores * -1
# excluding users who do not have any successful logon history.


Number of trees 100


ValueError: could not convert string to float: 'SANTANDERCHL'

In [None]:
data = data.loc[data["SuccessfulLogons"] > 0]
outliers = data.loc[data["anomaly"] == -1]
outlier_index = list(outliers.index)
print(f"Outliers at indexes: {outlier_index}")
# Find the number of anomalies and normal points here points classified -1 are anomalous
print(data["anomaly"].value_counts())