In [None]:
# Development convenience: load IPython's autoreload extension and use mode 2,
# which re-imports modules before each cell executes so that edits to the
# project's source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os

# Point this kernel at a specific JDK and put its `bin` directory first on PATH.
# NOTE(review): the path below is a macOS-specific, machine-local install
# location; adjust it (or export JAVA_HOME before starting Jupyter and read it
# here) when running on another machine.
JAVA_HOME = '/Library/Java/JavaVirtualMachines/openjdk-17.jdk/Contents/Home'
os.environ['JAVA_HOME'] = JAVA_HOME

# Prepend the JDK's bin directory exactly once. Any earlier occurrence is
# removed first, so re-running this cell no longer grows PATH with duplicates,
# while the JDK still takes priority over other Java installations on PATH.
java_bin_dir = f"{JAVA_HOME}/bin"
current_path = os.environ.get('PATH', '')
path_entries = current_path.split(os.pathsep) if current_path else []
os.environ['PATH'] = os.pathsep.join(
    [java_bin_dir] + [entry for entry in path_entries if entry != java_bin_dir]
)

In [None]:
# Imports
import sys
import pandas as pd
from pathlib import Path

from cider.featurizer.dependencies import (
    filter_to_datetime,
    get_spammers_from_cdr_data,
    get_outlier_days_from_cdr_data, 
    get_static_diagnostic_statistics,
    get_timeseries_diagnostic_statistics)
from cider.featurizer.plotting import plot_timeseries_diagnostics
from cider.featurizer.inference import (
    identify_daytime,
    identify_weekend, 
    swap_caller_and_recipient,
    identify_and_tag_conversations,
    identify_active_days,
    get_number_of_contacts_per_caller,
    get_call_duration_stats,
    get_percentage_of_nocturnal_interactions,
    get_percentage_of_initiated_conversations,
    get_percentage_of_initiated_calls,
    get_text_response_time_delay_stats,
    get_text_response_rate,
    get_entropy_of_interactions_per_caller,
    get_outgoing_interaction_fraction_stats,
    get_interaction_stats_per_caller,
    get_inter_event_time_stats,
    get_pareto_principle_interaction_stats,
    get_pareto_principle_call_duration_stats,
    get_number_of_interactions_per_user,
    get_number_of_antennas,
    get_entropy_of_antennas_per_caller,
    get_radius_of_gyration,
    get_pareto_principle_antennas,
    get_average_num_of_interactions_from_home_antennas)
from cider.schemas import RechargeData, MobileDataUsageData, MobileMoneyTransactionData, CallDataRecordData


# Make the parent directory importable so the legacy `deprecated` package can
# be found. The path is relative, so this only works when the kernel's working
# directory is the folder containing this notebook. The insert must stay above
# the import that depends on it.
sys.path.insert(0, '..')
from deprecated.datastore import DataStore

In [None]:
# Get spark session
# The legacy DataStore is built from a YAML config; later cells only use its
# `.spark` attribute to create Spark DataFrames from pandas frames.
# NOTE(review): presumably the constructor starts (or attaches to) the Spark
# session using the Java environment configured above — confirm in DataStore.

config_file = '../configs/config_new.yml'

datastore = DataStore(config_file)

In [None]:
# Load data
DATA_DIR = Path('../synthetic_data/')

# --- Call detail records -----------------------------------------------------
# Built as one chain so every step works on a fresh frame: assigning columns on
# a boolean-mask slice (as the previous version did) is the chained-assignment
# pattern that makes pandas emit SettingWithCopyWarning.
cdr = (
    pd.read_csv(DATA_DIR / 'cdr.csv')
    # Align raw column names with the names the rest of the notebook uses.
    .rename(columns={
        "caller_antenna": "caller_antenna_id",
        "recipient_antenna": "recipient_antenna_id",
        "txn_type": "transaction_type",
        "international": "transaction_scope"})
    # Keep only records whose caller antenna is known.
    .dropna(subset=["caller_antenna_id"])
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    # NOTE(review): astype(str) turns missing values into the literal string
    # 'nan' (mobile money handles this for recipient_id below, CDR does not) —
    # confirm this is what CallDataRecordData expects.
    .astype({"caller_id": str, "recipient_id": str, "recipient_antenna_id": str})
)


# --- Antennas: rows with any missing field are discarded ---------------------
antennas = pd.read_csv(DATA_DIR / 'antennas.csv').dropna()


# --- Recharges ---------------------------------------------------------------
recharges = pd.read_csv(DATA_DIR / 'recharges.csv')
recharges["caller_id"] = recharges["caller_id"].astype(str)
# Parse timestamps here, like the CDR frame: the outlier-day cell further down
# uses `.timestamp.dt` on every dataset, which requires a datetime column.
recharges["timestamp"] = pd.to_datetime(recharges["timestamp"])


# --- Mobile money ------------------------------------------------------------
mobile_money = pd.read_csv(DATA_DIR / 'mobilemoney.csv')
mobile_money["timestamp"] = pd.to_datetime(mobile_money["timestamp"])
mobile_money["caller_id"] = mobile_money["caller_id"].astype(str)
mobile_money["recipient_id"] = mobile_money["recipient_id"].astype(str)
mobile_money = mobile_money.rename(columns={"txn_type": "transaction_type",
                                            "sender_balance_before": "caller_balance_before",
                                            "sender_balance_after": "caller_balance_after"})
# astype(str) above turned missing recipients into the string 'nan'; restore
# them to a real missing value.
mobile_money.loc[mobile_money.recipient_id == 'nan', 'recipient_id'] = None
# Non-numeric balances become NaN, and rows without both caller balances are dropped.
mobile_money['caller_balance_before'] = pd.to_numeric(mobile_money['caller_balance_before'], errors='coerce')
mobile_money['caller_balance_after'] = pd.to_numeric(mobile_money['caller_balance_after'], errors='coerce')
missing_caller_balance = (
    mobile_money.caller_balance_before.isna() | mobile_money.caller_balance_after.isna()
)
mobile_money.drop(mobile_money[missing_caller_balance].index, inplace=True)
# Cash-in / cash-out rows carry no recipient information: blank those fields.
mobile_money.loc[
    mobile_money.transaction_type.isin(['cashin', 'cashout']),
    ['recipient_id', 'recipient_balance_before', 'recipient_balance_after']] = None


# --- Mobile data usage -------------------------------------------------------
mobile_data = pd.read_csv(DATA_DIR / 'mobiledata.csv')
mobile_data["caller_id"] = mobile_data["caller_id"].astype(str)
mobile_data["timestamp"] = pd.to_datetime(mobile_data["timestamp"])

In [None]:
# Validate data
def validate_records(frame, schema, max_reported=5):
    """Validate every row of `frame` against the model class `schema`.

    Each row is converted to a dict and passed to `schema.model_validate`.
    Validated objects are discarded immediately instead of being collected in
    a throwaway list, and all rows are checked before reporting so a failure
    shows how widespread the problem is rather than only the first bad record.

    Args:
        frame: pandas DataFrame whose rows are the records to validate.
        schema: class exposing a `model_validate(dict)` classmethod.
        max_reported: how many individual errors to include in the message.

    Raises:
        ValueError: if at least one record fails validation. The message gives
            the failure count and the first `max_reported` errors, each with
            the row's position in `frame`.
    """
    failure_count = 0
    reported = []
    for position, record in enumerate(frame.to_dict(orient='records')):
        try:
            schema.model_validate(record)
        except ValueError as error:
            # pydantic's ValidationError derives from ValueError, so it is
            # caught here without importing pydantic in the notebook.
            failure_count += 1
            if len(reported) < max_reported:
                reported.append(f"row {position}: {error}")
    if failure_count:
        details = "\n".join(reported)
        raise ValueError(
            f"{failure_count} of {len(frame)} records failed {schema.__name__} "
            f"validation; first {len(reported)} error(s):\n{details}"
        )


validate_records(cdr, CallDataRecordData)
validate_records(recharges, RechargeData)
validate_records(mobile_data, MobileDataUsageData)
# TODO: the mobile-money validation is known to fail; cause not yet identified.
# NOTE(review): one candidate is NaN reaching the schema where it expects None
# (missing CSV cells, or the None assignments on numeric recipient columns in
# the load cell) — unverified; the error summary raised here should show it.
validate_records(mobile_money, MobileMoneyTransactionData)


In [None]:
# Filter to date range
filter_start_date = pd.to_datetime("2020-01-05")
filter_end_date = pd.to_datetime("2022-02-01")

# Apply the same start/end window to every dataset, in a fixed order.
(cdr_filtered,
 recharges_filtered,
 mobile_money_filtered,
 mobile_data_filtered) = (
    filter_to_datetime(frame, filter_start_date, filter_end_date)
    for frame in (cdr, recharges, mobile_money, mobile_data)
)

In [None]:
# Remove spammers
# Spammers are identified from the CDR data only, then excluded from every
# dataset by caller_id.
spammer_threshold = 1.75
spammer_ids = get_spammers_from_cdr_data(
    cdr_filtered,
    threshold_of_calls_per_day=spammer_threshold,
)

(cdr_filtered,
 recharges_filtered,
 mobile_money_filtered,
 mobile_data_filtered) = (
    frame[~frame.caller_id.isin(spammer_ids)]
    for frame in (
        cdr_filtered,
        recharges_filtered,
        mobile_money_filtered,
        mobile_data_filtered,
    )
)


In [None]:
# Remove outlier days
# Outlier days are detected on the CDR data only; records falling on those
# calendar dates are then removed from every dataset.
z_score_threshold = 2.0
outlier_days = get_outlier_days_from_cdr_data(cdr_filtered, z_score_threshold)

(cdr_filtered,
 recharges_filtered,
 mobile_money_filtered,
 mobile_data_filtered) = (
    frame[~frame.timestamp.dt.date.isin(outlier_days)]
    for frame in (
        cdr_filtered,
        recharges_filtered,
        mobile_money_filtered,
        mobile_data_filtered,
    )
)

In [None]:
# Get static diagnostics
# One diagnostics result per filtered dataset, computed in a fixed order.
(cdr_diagnostics,
 recharges_diagnostics,
 mobile_money_diagnostics,
 mobile_data_diagnostics) = (
    get_static_diagnostic_statistics(frame)
    for frame in (
        cdr_filtered,
        recharges_filtered,
        mobile_money_filtered,
        mobile_data_filtered,
    )
)

In [None]:
# Get timeseries diagnostics
# One time-series diagnostics result per filtered dataset, in a fixed order.
(cdr_timeseries_diagnostics,
 recharges_timeseries_diagnostics,
 mobile_money_timeseries_diagnostics,
 mobile_data_timeseries_diagnostics) = (
    get_timeseries_diagnostic_statistics(frame)
    for frame in (
        cdr_filtered,
        recharges_filtered,
        mobile_money_filtered,
        mobile_data_filtered,
    )
)

In [None]:
# Plot diagnostics
# Unique CDR callers over time, one series per transaction type.
fig = plot_timeseries_diagnostics(
    cdr_timeseries_diagnostics,
    plot_title="CDR: Number of Unique Callers by Transaction Type",
    groupby_column="transaction_type",
    value_column="num_unique_callers",
)
# Leave the figure as the cell's last expression so the notebook renders it.
fig

In [None]:
# Identify daytime and weekend calls
# The filtered pandas CDR frame is handed to Spark first; each helper below
# takes the previous step's output, so the order of these statements matters.
cdr_spark = datastore.spark.createDataFrame(cdr_filtered) 
cdr_spark_with_daytime = identify_daytime(cdr_spark)
cdr_spark_with_weekend = identify_weekend(cdr_spark_with_daytime)

# Swap caller and recipient to get recipient-centric view
# NOTE(review): confirm whether the helper replaces the original rows or adds
# the swapped rows alongside them — every feature below depends on this.
cdr_swapped_caller_recipient = swap_caller_and_recipient(cdr_spark_with_weekend) 

# Identify and tag conversations
# NOTE(review): max_wait=3600 is presumably in seconds (one hour between
# messages of the same conversation) — confirm the unit in the helper.
cdr_tagged_conversations = identify_and_tag_conversations(cdr_swapped_caller_recipient, max_wait=3600)

In [None]:
# Featurize data: each cell below computes one feature set from `cdr_tagged_conversations`

In [None]:
# Get number of active days
# NOTE(review): the helper is named `identify_active_days`, so the result may
# list active days rather than counts — confirm the output shape.
cdr_active_days = identify_active_days(cdr_tagged_conversations)

In [None]:
# Get number of contacts per caller
# NOTE(review): presumably counts distinct counterparts of each caller —
# confirm whether the count is split by transaction type or time of day.
cdr_number_of_contacts_per_caller = get_number_of_contacts_per_caller(cdr_tagged_conversations)

In [None]:
# Get call duration stats
# NOTE(review): presumably restricted to call records (texts have no
# duration) — confirm the filtering inside the helper.
cdr_call_duration_stats = get_call_duration_stats(cdr_tagged_conversations)

In [None]:
# Get percentage of nocturnal interactions
# The variable name says "calls", but the helper is defined over interactions.
# NOTE(review): presumably relies on the daytime flag added by
# `identify_daytime` in the preprocessing cell — confirm.
cdr_nocturnal_calls = get_percentage_of_nocturnal_interactions(cdr_tagged_conversations)

In [None]:
# Get percentage of initiated conversations
# NOTE(review): presumably uses the conversation tags added by
# `identify_and_tag_conversations` to decide who started each one — confirm.
cdr_percentage_initiated_conversations = get_percentage_of_initiated_conversations(cdr_tagged_conversations)

In [None]:
# Get percentage of initiated calls
# Counterpart of the initiated-conversations feature, computed for calls.
cdr_percentage_initiated_calls = get_percentage_of_initiated_calls(cdr_tagged_conversations)

In [None]:
# Get text response time statistics
# NOTE(review): presumably the delay between a received text and the reply
# within the same tagged conversation — confirm the definition and time unit.
cdr_text_response_time_delay_stats = get_text_response_time_delay_stats(cdr_tagged_conversations)

In [None]:
# Get text response rate
# NOTE(review): presumably the share of received texts that were answered —
# confirm the exact definition in the helper.
cdr_text_response_rate = get_text_response_rate(cdr_tagged_conversations)

In [None]:
# Get entropy of interactions per caller
# NOTE(review): presumably entropy over how a caller's interactions are
# distributed across contacts — confirm the distribution and log base used.
cdr_entropy_of_interactions = get_entropy_of_interactions_per_caller(cdr_tagged_conversations)

In [None]:
# Get fraction of outgoing interactions
# The helper returns "stats" of the outgoing fraction, so expect summary
# statistics rather than a single ratio — TODO confirm the grouping level.
cdr_fraction_of_outgoing_interactions = get_outgoing_interaction_fraction_stats(cdr_tagged_conversations)

In [None]:
# Get interaction stats per caller
# NOTE(review): which quantity is summarised (e.g. interactions per contact)
# is not visible from here — confirm in the helper.
cdr_interaction_stats_per_caller = get_interaction_stats_per_caller(cdr_tagged_conversations)

In [None]:
# Get inter-event time statistics
# NOTE(review): presumably the time between a caller's consecutive
# interactions — confirm the time unit of the result.
cdr_inter_event_time_stats = get_inter_event_time_stats(cdr_tagged_conversations)

In [None]:
# Get pareto principle interaction stats
# NOTE(review): percentage_threshold=0.8 is presumably the "80%" of the 80/20
# rule, i.e. the share of interactions the top contacts must account for —
# confirm how the helper interprets it.
cdr_pareto_stats = get_pareto_principle_interaction_stats(cdr_tagged_conversations, percentage_threshold=0.8)

In [None]:
# Get pareto principle statistics for call duration
# Uses the same 0.8 threshold as the interaction-based Pareto feature.
# NOTE(review): presumably the share of total call time the top contacts
# must account for — confirm.
cdr_pareto_call_stats = get_pareto_principle_call_duration_stats(cdr_tagged_conversations, percentage_threshold=0.8)

In [None]:
# Get number of interactions per user
# NOTE(review): other features are keyed "per caller" while this one says
# "per user" — confirm both refer to the same identifier column.
cdr_number_of_interactions = get_number_of_interactions_per_user(cdr_tagged_conversations)

In [None]:
# Get number of antennas per caller
# NOTE(review): presumably counts distinct `caller_antenna_id` values seen for
# each caller — confirm which antenna column the helper reads.
cdr_number_of_antennas = get_number_of_antennas(cdr_tagged_conversations)

In [None]:
# Get entropy of antennas per caller
# NOTE(review): presumably entropy over how a caller's interactions are
# spread across antennas — confirm the distribution and log base used.
cdr_entropy_of_antennas = get_entropy_of_antennas_per_caller(cdr_tagged_conversations)

In [None]:
# Get radius of gyration: a measure of each caller's typical movement range
# The antenna table's key is renamed to match the CDR column name before the
# table is handed to Spark together with the tagged CDR data.
antennas = antennas.rename(columns={"antenna_id": "caller_antenna_id"})
spark_antennas = datastore.spark.createDataFrame(antennas)
cdr_radius_of_gyration = get_radius_of_gyration(
    cdr_tagged_conversations,
    spark_antennas,
)

In [None]:
# Get pareto principle statistics for antennas
# Uses the same 0.8 threshold as the other Pareto features.
# NOTE(review): presumably the share of a caller's interactions that the
# most-used antennas must account for — confirm.
cdr_pareto_antennas = get_pareto_principle_antennas(cdr_tagged_conversations, percentage_threshold=0.8)

In [None]:
# Get average number of interactions from home antennas.
# A caller's home antenna is defined as the antenna with the most
# interactions for that caller.
# NOTE(review): what the average is taken over (e.g. per day) is not visible
# from here — confirm in the helper.
cdr_home_antenna_interactions = get_average_num_of_interactions_from_home_antennas(cdr_tagged_conversations)