In [1]:
import pandas as pd
import sys
import os

# Add the src directory to the path so we can import our modules
sys.path.append(os.path.abspath('..'))

In [2]:
import os
# os.getcwd()
# # set to 1 level up
os.chdir(os.path.dirname(os.getcwd()))
# os.getcwd()


In [3]:
merchants = pd.read_csv("data/merchants.csv")
merchants=merchants.rename(columns={'country': 'merchant_country'})
merchants.head()

Unnamed: 0,merchant_id,category,merchant_country,trust_score,number_of_alerts_last_6_months,avg_transaction_amount,account_age_months,has_fraud_history
0,M0001,travel,Austria,1.0,3,97.23,84,0
1,M0002,clothing,Poland,0.689695,2,142.71,93,1
2,M0003,electronics,Czech Republic,0.83461,5,67.83,65,0
3,M0004,electronics,Belgium,0.607229,1,16.77,9,1
4,M0005,grocery,Ireland,0.641901,2,38.56,100,0


In [4]:
users = pd.read_csv("data/users.csv")
users=users.rename(columns={'country': 'user_country'})
users.head()

Unnamed: 0,user_id,age,sex,education,primary_source_of_income,sum_of_monthly_installments,sum_of_monthly_expenses,user_country,signup_date,risk_score
0,U00001,56,Other,High School,Employment,477.69,243.18,Finland,2021-04-01,0.571079
1,U00002,36,Female,Bachelor,Business,31.6,737.76,France,2020-07-07,0.705268
2,U00003,70,Other,High School,Student Aid,275.72,1477.5,Finland,2022-08-23,0.429579
3,U00004,38,Other,High School,Savings,288.29,1135.03,Belgium,2022-05-21,0.581384
4,U00005,61,Male,High School,Employment,7.0,892.76,Italy,2022-03-15,0.420227


In [5]:
transactions = pd.read_json("data/transactions.json", lines=True)
transactions.head()

Unnamed: 0,transaction_id,timestamp,user_id,merchant_id,amount,channel,currency,device,location,payment_method,is_international,session_length_seconds,is_first_time_merchant,is_fraud
0,TX000000,2022-06-17 23:28:00,U14804,M0314,130.03,in-store,EUR,Android,"{'lat': 40.057938, 'long': 14.959737}",debit_card,1,145,0,0
1,TX000001,2022-01-04 15:39:00,U16634,M0675,132.0,online,EUR,Android,"{'lat': 42.05935, 'long': 1.596625}",debit_card,1,32,1,0
2,TX000002,2022-09-09 21:58:00,U18005,M0479,8.65,online,EUR,Android,"{'lat': 50.923432, 'long': 21.795262}",credit_card,1,604,1,0
3,TX000003,2023-11-20 06:40:00,U13690,M0538,19.82,mobile,EUR,iOS,"{'lat': 56.833396, 'long': -9.881566}",credit_card,0,1031,1,0
4,TX000004,2022-04-28 08:08:00,U04642,M0128,101.92,in-store,EUR,Android,"{'lat': 54.042996, 'long': 2.168516}",credit_card,1,330,0,0


In [6]:
trans_user = pd.merge(transactions, users, on='user_id', how='left')

In [7]:
df_merged = pd.merge(trans_user, merchants, on='merchant_id', how='left')

## Feature Engineering

Adding time-based features for transaction analysis

In [8]:
# Import the feature engineering function
from src.feature_engineering.time_features import calculate_time_since_last_transaction

# Apply the function to calculate time since last transaction
df_merged = calculate_time_since_last_transaction(df_merged)

## Location-based Feature Engineering

Extract country and city information from location coordinates using vectorized operations

In [9]:
# Import the location feature engineering functions
from src.feature_engineering.geo_features import extract_location_info

df_merged = extract_location_info(df_merged, location_column='location')

# Display the new columns
print("\nSample rows with extracted location data:")
df_merged[['location', 'country', 'city', 'administrative_region_1', 'administrative_region_2']].head()

Loading formatted geocoded file...

Sample rows with extracted location data:

Sample rows with extracted location data:


Unnamed: 0,location,country,city,administrative_region_1,administrative_region_2
290822,"{'lat': 69.291741, 'long': -7.72625}",NO,Vestbygd,Vest-Agder,Farsund
197951,"{'lat': 54.710715, 'long': 15.368695}",TR,Beloren,Adiyaman,
187979,"{'lat': 58.897717, 'long': 30.25543}",RU,Novolabinskaya,Krasnodarskiy,
137034,"{'lat': 42.684202, 'long': 3.843828}",RU,Ostrovnoy,Murmansk,
401318,"{'lat': 44.296089, 'long': 15.749937}",BG,Strelcha,Pazardzhik,Obshtina Strelcha


## Convert Country Codes to Full Names

Convert ISO 3166-1 alpha-2 country codes to full country names so they would match other features

In [11]:
# Import the country code conversion function
from src.feature_engineering.geo_features import convert_country_codes_to_names

# Convert transaction country codes to full country names
df_merged = convert_country_codes_to_names(
    df_merged,
    country_col='country',
    new_col_name='transaction_country_name',
    keep_original=True
)

# Display results with the full country names
print("Sample rows with converted country names:")
df_merged[[
    'country', 'transaction_country_name',
    'user_country', 'merchant_country',
]].head(10)

Sample rows with converted country names:


Unnamed: 0,country,transaction_country_name,user_country,merchant_country
290822,NO,Norway,Finland,Belgium
197951,TR,Türkiye,Finland,Spain
187979,RU,Russian Federation,Finland,Spain
137034,RU,Russian Federation,Finland,Belgium
401318,BG,Bulgaria,Finland,Germany
95555,RU,Russian Federation,Finland,Germany
488800,UA,Ukraine,Finland,Spain
432505,RO,Romania,Finland,Belgium
381637,RU,Russian Federation,Finland,Germany
74117,NO,Norway,Finland,France


## Country Match Feature

Add a binary feature indicating whether the user's country matches the transaction country

In [None]:
# Import the country match feature function
from src.feature_engineering.geo_features import add_country_match_feature

# Apply the function to check if user country matches transaction country
df_merged = add_country_match_feature(
    df_merged, 
    user_country_col='user_country',
    transaction_country_col='transaction_country_name',
    new_col_name='is_user_transaction_country_match'
)


# Display the distribution of the new feature
print("Distribution of user-transaction country matches:")
df_merged['is_user_transaction_country_match'].value_counts()

# Show a sample of rows with the new feature
df_merged[['user_country', 'country', 'is_user_transaction_country_match']].head(10)

Distribution of user-transaction country matches:


Unnamed: 0,user_country,country,is_user_transaction_country_match,is_merchant_transaction_country_match
290822,Finland,NO,0,0
197951,Finland,TR,0,0
187979,Finland,RU,0,0
137034,Finland,RU,0,0
401318,Finland,BG,0,0
95555,Finland,RU,0,0
488800,Finland,UA,0,0
432505,Finland,RO,0,0
381637,Finland,RU,0,0
74117,Finland,NO,0,0


In [14]:
# Count the number of transactions by country
country_counts = df_merged['country'].value_counts()
print("Transaction counts by country:")
country_counts.head(10)

Transaction counts by country:


country
RU    60837
NO    53581
GB    40057
ES    30211
FR    27169
SE    27153
IT    26358
TR    25974
UA    25352
FI    20642
Name: count, dtype: int64