In [79]:
import sys
import os

# Make the sibling ``scripts`` directory importable from this notebook.
# The path is resolved against the current working directory.
scripts_dir = os.path.abspath('../scripts')

# Guard against duplicates: re-running this cell would otherwise append the
# same entry to sys.path again on every execution.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

In [75]:
import pandas as pd

def _most_common_label(labels):
    """Return the first value reported by ``labels.mode()``, or None if it is empty."""
    # Compute the mode once per group instead of twice.
    modes = labels.mode()
    return modes.iloc[0] if not modes.empty else None


def _load_logs(csv_path):
    """Read one log CSV and add the derived columns used by the summaries.

    The returned frame has ``timestamp`` parsed to datetimes, new ``date`` and
    ``hour`` columns taken from it, ``duration_sec`` coerced to numeric
    (unparseable values become NaN) and missing ``fraud_label`` values replaced
    by the string 'Unknown'.
    """
    logs = pd.read_csv(csv_path)
    logs['timestamp'] = pd.to_datetime(logs['timestamp'])
    logs['date'] = logs['timestamp'].dt.date
    logs['hour'] = logs['timestamp'].dt.hour
    logs['duration_sec'] = pd.to_numeric(logs['duration_sec'], errors='coerce')
    logs['fraud_label'] = logs['fraud_label'].fillna('Unknown')
    return logs


def _hourly_summary(logs, duration_col, label_col):
    """Aggregate ``logs`` per (subscriber_id, date, hour).

    ``duration_col`` receives the summed ``duration_sec`` and ``label_col`` the
    most common ``fraud_label`` of each group. The group keys are returned as
    ordinary columns.
    """
    return (
        logs
        .groupby(['subscriber_id', 'date', 'hour'])
        .agg(**{
            duration_col: ('duration_sec', 'sum'),
            label_col: ('fraud_label', _most_common_label),
        })
        .reset_index()
    )


# Both log files go through the same preparation, so they share one loader.
df_rtm = _load_logs('../data/Simulated_RTM_Logs.csv')
df_ipdr = _load_logs('../data/Simulated_CDR_IPDR_Logs.csv')

# One row per subscriber, date and hour for each source.
rtm_summary = _hourly_summary(df_rtm, 'app_duration', 'rtm_fraud_label')
ipdr_summary = _hourly_summary(df_ipdr, 'cdr_ipdr_duration', 'ipdr_fraud_label')

In [80]:
# Fraud or not fraud

# Collapse the specific fraud categories into a single 'Fraud' flag.
# 'Unknown' is the placeholder written for records that had no label, so it is
# kept as its own value rather than being counted as fraud. Any remaining
# missing label is treated as 'Unknown' for the same reason.
ipdr_summary['is_fraud'] = (
    ipdr_summary['ipdr_fraud_label']
    .fillna('Unknown')
    .map(lambda label: label if label in ('Benign', 'Unknown') else 'Fraud')
)

In [81]:
# Count of fraud for each application

# Rows are apps and columns are fraud labels; app/label pairs that never occur
# are counted as 0. A 'total' column holds the row sum, and the apps are
# ordered from most to fewest records.
app_fraud_counts = (
    df_rtm
    .groupby(['app', 'fraud_label'])
    .size()
    .unstack(fill_value=0)
    .assign(total=lambda label_counts: label_counts.sum(axis=1))
    .sort_values(by='total', ascending=False)
)
app_fraud_counts

fraud_label,Benign,Financial,Identity,Investment,Psychological,total
app,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Telegram,134,9,11,7,14,175
Signal,139,6,12,8,5,170
WhatsApp,135,11,10,8,4,168
