In [1]:
from datetime import datetime
from math import ceil
import numpy as np
import pandas as pd

In [2]:
pd.options.display.max_columns = None

# Meta Features

Here I extract meta features from the anonymised the synethic dataset (see `Anonymisation.ipynb`). This notebook demonstrates the techniques on a sample of the dataset. I have not made a file which applies all of these methods at once as it is likely some features won't be used and other will want to be tweaked.

## TOC
 * [Time](#Time)
   * [Hour of Day](#hr_of_day)
 * [Currency Conversion](#Currency-Conversion)
 * [Transcations per Entity](#Transcations-per-Entity)
 * [Time Since Last Transaction](#Time-Since-Last-Transaction)
 * [Transaction Value Statistics by Entity](#Transaction-Value-Statistics-by-Entity)
 * [Number of Transaction on Day by Entity](#Number-of-Transaction-on-Day-by-Entity)

In [3]:
# Load the data
clean_data=pd.read_csv("data/cleaned_synthetic_data.csv",index_col=0)

sample_df=clean_data.sample(n=10)
prepared_df=sample_df.copy()

  mask |= (ar1 == a)


In [4]:
sample_df.head(5)

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_cluster_id,merchant_id,merchant_category
791792,0,1354438136,3.67,662,884,1,-1,41,43,302,misc_net
1096245,0,1364740080,44.86,33,387,1,-1,22,63,205,food_dining
39160,0,1327343194,71.89,446,418,0,113,41,4,163,personal_care
869635,0,1355971211,68.61,726,384,1,-1,30,62,625,gas_transport
699470,0,1351218312,327.73,66,71,0,-1,59,3,27,shopping_net


## Time

In the real data set time is given in seconds since the first transaction. I implement the function `standardise_time` which calculates the number of seconds between mid-night on the day of the first transaction and each transaction.

In [5]:
def standardise_time(series) -> pd.Series:
    min_time=datetime.utcfromtimestamp(series.min())
    min_day=min_time.replace(second=0,minute=0,hour=0)
    return ((series-min_day.timestamp())).astype(int)

In [6]:
prepared_df["seconds_from_start"]=standardise_time(sample_df["unix_time"])

In [7]:
prepared_df.head(10)

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_cluster_id,merchant_id,merchant_category,seconds_from_start
791792,0,1354438136,3.67,662,884,1,-1,41,43,302,misc_net,27852536
1096245,0,1364740080,44.86,33,387,1,-1,22,63,205,food_dining,38154480
39160,0,1327343194,71.89,446,418,0,113,41,4,163,personal_care,757594
869635,0,1355971211,68.61,726,384,1,-1,30,62,625,gas_transport,29385611
699470,0,1351218312,327.73,66,71,0,-1,59,3,27,shopping_net,24632712
838455,0,1355375147,39.45,287,915,0,-1,35,18,158,grocery_net,28789547
1199912,0,1368748107,1.19,729,399,0,-1,24,6,408,personal_care,42162507
24309,0,1326594126,64.83,405,680,0,-1,38,81,691,gas_transport,8526
1101168,0,1364961914,8.42,750,653,0,349,90,21,224,shopping_pos,38376314
686171,0,1350695454,42.59,612,254,1,-1,35,41,291,grocery_net,24109854


<a id="hr_of_day"></a>
**Hour of Day**

In [8]:
prepared_df["hour_of_day"]=pd.to_datetime(sample_df["unix_time"],unit="s").dt.hour

## Currency Conversion
All `amt` values are in dollars, but the real data we have is from Europe so we need to account for the exchange rate. The functions below allow for the conversion between any currency.

In [9]:
# !pip install forex-python
import forex_python.converter as fx

In [10]:
def convert_currency(amount:float,date:datetime,cur_currency:str,tar_currency) -> float:
    """
    Determine the value of an amount of one currency in another currency at a specified point in time
    
    PARAMETERS
    amount (float) - amount of current currency
    date (datetime) - date of exchange rate to use
    cur_currency (str) - three character code for current currency
    tar_currency (str) - three character code for target currency
    
    RETURNS
    float - amount of target currency
    """
    exchange_rate=fx.get_rate(cur_currency,tar_currency,date)
    return round(amount*exchange_rate,2)

In [11]:
def prepare_amount(df,cur_label,cur_currency="USD",tar_currency="GBP") -> pd.Series:
    """
    Convert amounts in a dataframe between currencies, using the exchange rate at the start of the date on which transaction occurred
    NOTE - conversion rate taken at start of day for speed.
    
    PARAMETERS
    df (pd.Dataframe) - dataframe of transactions with at least ["data",cur_label] columns
    cur_label (str) - name of column which contains amounts to convert
    """
    df_local=df.copy(deep=True)
    df_local["date"]=pd.to_datetime(df["unix_time"],unit="s").dt.date
    
    # determine the exchange rate for each day
    exchange_rates=pd.DataFrame()
    exchange_rates["date"]=pd.to_datetime(df_local["date"].unique(),format="%Y-%m-%d")
    exchange_rates["rate"]=exchange_rates.apply(lambda x:convert_currency(1,x["date"],cur_currency,tar_currency),axis=1)
    
    # merge dataframes
    exchange_rates["date"]=exchange_rates["date"].dt.date
    df_merged=df_local[["date","amt"]].reset_index().merge(exchange_rates[["date","rate"]],on="date",how="left").set_index('index')

    # calculated exchanged amounts
    tar_label="amount_{}".format(tar_currency)
    df_merged[tar_label]=df_merged.apply(lambda x:round(x["amt"]*x["rate"],2),axis=1)

    return df_merged[tar_label]

In [12]:
prepared_df["amount_USD"]=sample_df["amt"].copy()
prepared_df["amount_GBP"]=prepare_amount(sample_df[["unix_time","amt"]],"amt","USD","GBP")

In [13]:
prepared_df.head(5)

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_cluster_id,merchant_id,merchant_category,seconds_from_start,hour_of_day,amount_USD,amount_GBP
791792,0,1354438136,3.67,662,884,1,-1,41,43,302,misc_net,27852536,8,3.67,2.28
1096245,0,1364740080,44.86,33,387,1,-1,22,63,205,food_dining,38154480,14,44.86,29.61
39160,0,1327343194,71.89,446,418,0,113,41,4,163,personal_care,757594,18,71.89,46.01
869635,0,1355971211,68.61,726,384,1,-1,30,62,625,gas_transport,29385611,2,68.61,41.85
699470,0,1351218312,327.73,66,71,0,-1,59,3,27,shopping_net,24632712,2,327.73,203.19


## Transcations per Entity

In [14]:
# Total number of transactions performed by each entity (person or merchant) in the dataset
def transactions_per_entity(ids) -> pd.Series:
    """ids - series of either `person_id` or `merchant_id`"""
    pp_trans=ids.value_counts()
    return ids.apply(lambda x: pp_trans[x])

prepared_df["transaction_by_person"]=transactions_per_entity(sample_df["person_id"])
prepared_df["transaction_by_merchant"]=transactions_per_entity(sample_df["merchant_id"])

## Time Since Last Transaction

In [15]:
# time since last transaction (merchant and customer)
def time_since_last_transaction(id_col,df) -> pd.Series:
    """
    id_col (str) - name of column which contain ids to group by
    df (pd.DataFrame) - dataframe containing `unix_time` and `id_col`
       NOTE -1 = first transaction on record
    """
    times=df[id_col].copy(deep=True)
    
    clean_data["time_since_last_transaction_person"]=-1
    for id_code in df[id_col].unique():
        trans_times=df[df[id_col]==id_code]["unix_time"]
        times.loc[df[id_col]==id_code]=trans_times.diff()

    return times.replace(np.nan,-1).astype(int)

In [16]:
# clean_data["time_since_last_transaction_person"]=time_since_last_transaction("person_id",clean_data[["person_id","unix_time"]])
# clean_data["time_since_last_transaction_merchant"]=time_since_last_transaction("merchant_id",clean_data[["merchant_id","unix_time"]])

prepared_df["time_since_last_transaction_person"]=time_since_last_transaction("person_id",sample_df[["person_id","unix_time"]])
prepared_df["time_since_last_transaction_merchant"]=time_since_last_transaction("merchant_id",sample_df[["merchant_id","unix_time"]])

## Transaction Value Statistics by Entity

In [17]:
# mean/min/max amt per merchant/customer
# NOTE this is USD val so maybe change
def entity_amount_statistic(id_col,df,agg_calc) -> pd.Series:
    
    group_by=df[[id_col,"amt"]].groupby([id_col])
    vals=group_by["amt"].agg(agg_calc)
    return df[id_col].transform(lambda x:vals[x])

prepared_df["mean_amt_person"]=entity_amount_statistic("person_id",sample_df[["person_id","amt"]],"mean")
prepared_df["max_amt_merchant"]=entity_amount_statistic("merchant_id",sample_df[["merchant_id","amt"]],"max")

## Transaction Value Statistics by Entity by Day

In [18]:
# mean/min/max amt per merchant/customer
# NOTE this is USD val so maybe change
def entity_amount_statistic_by_day(id_col,df,agg_calc) -> pd.Series:
    
    df_copy=df[[id_col,"amt","unix_time"]].copy()
    df_copy["date"]=pd.to_datetime(df_copy["unix_time"],unit="s").dt.date
    
    group_by=df_copy.groupby([id_col,"date"],as_index=False)
    vals=group_by["amt"].agg(agg_calc)
    
    df_copy=df_copy.reset_index().merge(vals,on=[id_col,"date"],how='inner',suffixes=("_orig","_group_by")).set_index("index")
    
    return df_copy["amt_group_by"]

prepared_df["mean_amt_merchant_on_day"]=entity_amount_statistic_by_day("merchant_id",sample_df[["merchant_id","amt","unix_time"]],"mean")
prepared_df["max_amt_person_on_day"]=entity_amount_statistic_by_day("person_id",sample_df[["person_id","amt","unix_time"]],"mean")

## Number of Transaction on Day by Entity

In [19]:
# Number of Transactions done on same say by Entity
def transaction_on_date(id_col,df) -> pd.Series:
    
    df_copy=df[[id_col,"unix_time"]].copy()
    df_copy["date"]=pd.to_datetime(df_copy["unix_time"],unit="s").dt.date
    
    group_by=df_copy[[id_col,"date"]].groupby(["person_id","date"],as_index=False)
    counts=group_by.size()
    
    df_copy=df_copy.reset_index().merge(counts,on=[id_col,"date"],how='inner',suffixes=("_orig","_group_by")).set_index("index")
    
    return df_copy["size"]
    
prepared_df["transactions_on_day_person"]=transaction_on_date("person_id",sample_df[["person_id","unix_time"]])

## Save File

In [20]:
prepared_df

Unnamed: 0,is_fraud,unix_time,amt,cc_id,person_id,gender_id,job_category,age,city_pop_cluster_id,merchant_id,merchant_category,seconds_from_start,hour_of_day,amount_USD,amount_GBP,transaction_by_person,transaction_by_merchant,time_since_last_transaction_person,time_since_last_transaction_merchant,mean_amt_person,max_amt_merchant,mean_amt_merchant_on_day,max_amt_person_on_day,transactions_on_day_person
791792,0,1354438136,3.67,662,884,1,-1,41,43,302,misc_net,27852536,8,3.67,2.28,1,1,-1,-1,3.67,3.67,3.67,3.67,1
1096245,0,1364740080,44.86,33,387,1,-1,22,63,205,food_dining,38154480,14,44.86,29.61,1,1,-1,-1,44.86,44.86,44.86,44.86,1
39160,0,1327343194,71.89,446,418,0,113,41,4,163,personal_care,757594,18,71.89,46.01,1,1,-1,-1,71.89,71.89,71.89,71.89,1
869635,0,1355971211,68.61,726,384,1,-1,30,62,625,gas_transport,29385611,2,68.61,41.85,1,1,-1,-1,68.61,68.61,68.61,68.61,1
699470,0,1351218312,327.73,66,71,0,-1,59,3,27,shopping_net,24632712,2,327.73,203.19,1,1,-1,-1,327.73,327.73,327.73,327.73,1
838455,0,1355375147,39.45,287,915,0,-1,35,18,158,grocery_net,28789547,5,39.45,24.46,1,1,-1,-1,39.45,39.45,39.45,39.45,1
1199912,0,1368748107,1.19,729,399,0,-1,24,6,408,personal_care,42162507,23,1.19,0.79,1,1,-1,-1,1.19,1.19,1.19,1.19,1
24309,0,1326594126,64.83,405,680,0,-1,38,81,691,gas_transport,8526,2,64.83,42.14,1,1,-1,-1,64.83,64.83,64.83,64.83,1
1101168,0,1364961914,8.42,750,653,0,349,90,21,224,shopping_pos,38376314,4,8.42,5.56,1,1,-1,-1,8.42,8.42,8.42,8.42,1
686171,0,1350695454,42.59,612,254,1,-1,35,41,291,grocery_net,24109854,1,42.59,26.41,1,1,-1,-1,42.59,42.59,42.59,42.59,1


In [21]:
def save_data(df:pd.DataFrame,file_path):
    df.to_csv(file_path)