
# **Mount** Google Drive
# ===============================


In [109]:
##from google.colab import drive
##drive.mount('/content/drive')




# Install
# ===============================


In [110]:

# Change into the project folder on Google Drive so that the relative paths
# used further down ("configs/...", "results/...") resolve against it.
# NOTE(review): this path lives under /content/drive, but the drive.mount()
# cell above is commented out — confirm the mount happens before this cell on
# a fresh kernel, otherwise %cd will fail.
#!ls /content/drive/MyDrive/Sem-6/coding/github/fraud_detection
project_path = "/content/drive/MyDrive/Sem-6/coding/github/fraud_detection/"
%cd $project_path
# Dependency install is disabled; presumably the environment is already set up.
#!pip install -r requirements.txt

/content/drive/MyDrive/Sem-6/coding/github/fraud_detection


## Imports

In [111]:
import os
import pandas as pd
import yaml
import logging
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np


#Helper Code

##Logging

In [112]:
# Make sure results directory exists (the FileHandler below writes into it)
os.makedirs("results", exist_ok=True)

# force=True replaces whatever handlers are already attached to the root
# logger. Without it, basicConfig() silently does nothing when the root logger
# is already configured (typical in Colab/Jupyter kernels, and on every re-run
# of this cell), so INFO records would reach neither the console nor the file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("results/data_prep.log")
    ],
    force=True,
)
logger = logging.getLogger(__name__)



##Load config

In [113]:
def load_config(config_path="configs/baseline.yaml"):
    """Load a YAML configuration file.

    Parameters
    ----------
    config_path : str
        Path to the YAML file to read.

    Returns
    -------
    dict
        The parsed configuration. An empty file yields an empty dict.

    Raises
    ------
    OSError
        If the file cannot be opened.
    yaml.YAMLError
        If the file is not valid YAML.
    """
    # Read as UTF-8 explicitly: the platform default encoding is not guaranteed
    # to be UTF-8 and would fail on non-ASCII values in the config.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # safe_load returns None for an empty document; normalise to {} so callers
    # can index / .get() the result without a None check.
    if config is None:
        config = {}
    logger.info(f"Loaded config from {config_path}")
    return config

## Load CDRs

In [114]:
def load_cdr(file_path, nrows=None):
    """Read a single CDR CSV file into a DataFrame.

    Parameters
    ----------
    file_path : str
        Location of the CSV file.
    nrows : int or None
        Maximum number of rows to read (sample mode); ``None`` reads them all.

    Returns
    -------
    pd.DataFrame
        The rows read from ``file_path``.
    """
    logger.info(f"Loading file: {file_path} with nrows={nrows}")
    cdr_frame = pd.read_csv(file_path, nrows=nrows)
    return cdr_frame


def load_all_data(config):
    """Load every CSV listed in the config into a dict of DataFrames.

    Parameters
    ----------
    config : dict
        Must contain ``config["dataset"]["base_path"]`` (directory) and
        ``config["dataset"]["files"]`` (mapping of dataset name -> file name).
        ``config["training"]["sample_size"]`` is optional; when set, only that
        many rows are read from each file.

    Returns
    -------
    dict
        Mapping of dataset name -> DataFrame, in the order given by the config.
    """
    dataset_cfg = config["dataset"]
    base = dataset_cfg["base_path"]
    files = dataset_cfg["files"]
    # A `training:` key that is present but left empty parses to None in YAML,
    # so fall back to {} before looking up sample_size.
    sample_size = (config.get("training") or {}).get("sample_size")

    data = {}
    for name, fname in files.items():
        path = os.path.join(base, fname)
        df = load_cdr(path, nrows=sample_size)
        data[name] = df
        logger.info(f"Loaded {name} -> {df.shape} from {path}")
    return data


##Build Features

In [115]:

def build_feature_table(data: dict):
    """
    Merge APP, SMS, USER, VOC datasets on phone_no_m into a single feature table.
    APP, SMS and VOC are aggregated per phone_no_m; USER is de-duplicated on
    phone_no_m. The parts are then outer-merged into one DataFrame.

    Parameters
    ----------
    data : dict
        Dictionary of raw DataFrames, e.g. {"app": df_app, "sms": df_sms, "user": df_user, "voc": df_voc}.
        Any of the four keys may be absent; only the datasets present are used.

    Returns
    -------
    feature_df : pd.DataFrame
        Merged feature table with one row per phone_no_m. Missing values left
        by the outer merge are filled with 0. If none of the expected keys is
        present, an empty DataFrame with only a ``phone_no_m`` column is returned.
    """
    from functools import reduce

    feature_parts = []

    # --------------------------
    # APP features
    # --------------------------
    if "app" in data:
        # groupby/agg builds a new frame, so the input needs no defensive copy
        app_feat = data["app"].groupby("phone_no_m").agg(
            app_count=("busi_name", "nunique"),
            total_flow=("flow", "sum"),
            avg_flow=("flow", "mean")
        ).reset_index()
        feature_parts.append(app_feat)

    # --------------------------
    # SMS features
    # --------------------------
    if "sms" in data:
        sms_feat = data["sms"].groupby("phone_no_m").agg(
            sms_count=("opposite_no_m", "count"),
            unique_contacts=("opposite_no_m", "nunique")
        ).reset_index()
        feature_parts.append(sms_feat)

    # --------------------------
    # USER features (static profile)
    # --------------------------
    if "user" in data:
        # keep the first row seen for each phone_no_m
        user_feat = data["user"].drop_duplicates(subset=["phone_no_m"])
        feature_parts.append(user_feat)

    # --------------------------
    # VOC features (calls)
    # --------------------------
    if "voc" in data:
        voc_feat = data["voc"].groupby("phone_no_m").agg(
            call_count=("opposite_no_m", "count"),
            unique_callers=("opposite_no_m", "nunique"),
            avg_call_dur=("call_dur", "mean"),
            total_call_dur=("call_dur", "sum")
        ).reset_index()
        feature_parts.append(voc_feat)

    # reduce() raises TypeError on an empty sequence, so handle the
    # "no recognised dataset" case explicitly.
    if not feature_parts:
        return pd.DataFrame(columns=["phone_no_m"])

    # --------------------------
    # Merge all features
    # --------------------------
    # Outer join keeps phone numbers that appear in only some of the datasets.
    feature_df = reduce(
        lambda left, right: pd.merge(left, right, on="phone_no_m", how="outer"),
        feature_parts
    )

    # --------------------------
    # Handle missing values
    # --------------------------
    feature_df = feature_df.fillna(0)

    return feature_df


## Generate profile

In [116]:

def generate_profile(df, output_file="results/profile.html"):
    """
    Write a ydata-profiling HTML report for ``df`` and return its path.

    Columns with more than 500 distinct values, or whose values average more
    than 50 characters when cast to string, are dropped before profiling.
    Word clouds, Cramér's V correlations, the missing-values heatmap and
    continuous interaction plots are switched off.
    """
    def _skip_for_profiling(series):
        # cardinality check first; the string-length check only runs if needed
        if series.nunique() > 500:
            return True
        return series.astype(str).str.len().mean() > 50

    skipped = [name for name in df.columns if _skip_for_profiling(df[name])]
    if skipped:
        print(f"⚠️ Skipping columns for profiling: {skipped}")
        df = df.drop(columns=skipped)

    report_options = {
        "title": "Fraud Detection EDA Report",
        "explorative": True,
        "plot": {"wordcloud": False},                       # no word clouds
        "correlations": {"cramers": {"calculate": False}},  # faster
        "missing_diagrams": {"heatmap": False},             # more stable
        "interactions": {"continuous": False},              # skip heavy plots
    }
    report = ProfileReport(df, **report_options)
    report.to_file(output_file)
    print(f"✅ Profiling report saved → {output_file}")
    return output_file


## Preprocess features

In [117]:
def preprocess_features(df, numeric_cols, categorical_cols):
    """Scale numeric columns and one-hot encode categorical columns of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Input rows. If it has a ``phone_no_m`` column, that column is carried
        through unchanged as the first output column.
    numeric_cols : list of str
        Columns to standardise with ``StandardScaler``; NaNs are filled with 0
        first. Names not present in ``df`` are ignored.
    categorical_cols : list of str
        Columns to one-hot encode; NaNs become the string ``"missing"`` and all
        values are cast to ``str`` first. Names not present in ``df`` are ignored.

    Returns
    -------
    pd.DataFrame
        One row per input row, indexed like ``df``. Empty DataFrame when there
        is no ``phone_no_m`` column and no valid numeric/categorical column.

    Notes
    -----
    The scaler and encoder are fitted on ``df`` itself and are not returned.
    """
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    import pandas as pd

    feature_parts = []

    # Always keep ID
    if "phone_no_m" in df.columns:
        feature_parts.append(df[["phone_no_m"]])

    # Determine valid columns
    valid_numeric = [col for col in numeric_cols if col in df.columns]
    valid_cats = [col for col in categorical_cols if col in df.columns]

    logger.info(
        f"Preprocessing {df.shape[0]} rows | "
        f"numeric: {valid_numeric} | categorical: {valid_cats}"
    )

    # Numeric features
    if valid_numeric:
        scaler = StandardScaler()
        scaled = scaler.fit_transform(df[valid_numeric].fillna(0))
        # Reuse df's index: fit_transform returns a bare array, and a default
        # RangeIndex would make the axis=1 concat below misalign rows (and
        # introduce NaNs) whenever df is filtered, shuffled or otherwise
        # non-default-indexed.
        scaled_df = pd.DataFrame(scaled, columns=valid_numeric, index=df.index)
        feature_parts.append(scaled_df)

    # Categorical features
    if valid_cats:
        cat_data = df[valid_cats].fillna("missing").astype(str)
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        encoded = encoder.fit_transform(cat_data)
        encoded_df = pd.DataFrame(
            encoded,
            columns=encoder.get_feature_names_out(valid_cats),
            index=df.index,  # same alignment reason as for the numeric block
        )
        feature_parts.append(encoded_df)

    # Merge all
    if feature_parts:
        result = pd.concat(feature_parts, axis=1)
    else:
        result = pd.DataFrame()

    return result


#


# Load config +  dataset
# ===============================


In [118]:
config = load_config("configs/baseline.yaml")
data = load_all_data(config)  # raw tables keyed by dataset name

# For every raw dataset (APP, SMS, USER, VOC): write a profiling report, then
# preprocess it if the config lists any feature columns for that dataset.
for name, df in data.items():
    print(f"\n=== {name.upper()} ===")

    # Profiling report, one HTML file per dataset
    output_file = f"results/{name}_profile.html"
    generate_profile(df, output_file=output_file)
    print(f"📊 Profiling report for {name} saved → {output_file}")

    # Feature columns configured for this dataset (empty list if none)
    preprocessing_cfg = config["preprocessing"]
    numeric = preprocessing_cfg["numeric_features"].get(name, [])
    categorical = preprocessing_cfg["categorical_features"].get(name, [])

    # Nothing configured for this dataset -> skip preprocessing
    if not (numeric or categorical):
        continue

    df_processed = preprocess_features(df.copy(), numeric, categorical)
    print(f"✅ Preprocessed {name}: {df_processed.shape}")
    display(df_processed.head())



=== APP ===
⚠️ Skipping columns for profiling: ['phone_no_m']


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|██████████| 3/3 [00:00<00:00, 42.73it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Profiling report saved → results/app_profile.html
📊 Profiling report for app saved → results/app_profile.html
✅ Preprocessed app: (15, 18)


Unnamed: 0,phone_no_m,flow,busi_name_Apple Siri,busi_name_Jabber,busi_name_QQ,busi_name_QQ阅读,busi_name_Sina_WeiBo_HTTP_Video,busi_name_iMessage,busi_name_missing,busi_name_qq空间,busi_name_天涯社区,busi_name_微信,busi_name_新浪微博,busi_name_旺信,busi_name_融云即时通讯云,busi_name_起点读书,busi_name_钉钉,month_id_2020-04
0,2abe25ed833883fcb81fb9f987596af936a7ef922c8d1d...,-0.089802,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2abe25ed833883fcb81fb9f987596af936a7ef922c8d1d...,-0.288092,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2abe25ed833883fcb81fb9f987596af936a7ef922c8d1d...,-0.459907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,2abe25ed833883fcb81fb9f987596af936a7ef922c8d1d...,2.981736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,2abe25ed833883fcb81fb9f987596af936a7ef922c8d1d...,-0.467958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0



=== SMS ===
⚠️ Skipping columns for profiling: ['phone_no_m', 'opposite_no_m']


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|██████████| 2/2 [00:00<00:00, 59.80it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Profiling report saved → results/sms_profile.html
📊 Profiling report for sms saved → results/sms_profile.html
✅ Preprocessed sms: (16, 2)


Unnamed: 0,phone_no_m,calltype_id_2
0,ce0dfda7499c30385ad400360a96476b2bbb16484e308f...,1.0
1,ce0dfda7499c30385ad400360a96476b2bbb16484e308f...,1.0
2,ce0dfda7499c30385ad400360a96476b2bbb16484e308f...,1.0
3,ce0dfda7499c30385ad400360a96476b2bbb16484e308f...,1.0
4,ce0dfda7499c30385ad400360a96476b2bbb16484e308f...,1.0



=== USER ===
⚠️ Skipping columns for profiling: ['phone_no_m']


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/4 [00:00<?, ?it/s][A
100%|██████████| 4/4 [00:00<00:00, 27.44it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(
  plt.savefig(


Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Profiling report saved → results/user_profile.html
📊 Profiling report for user saved → results/user_profile.html
✅ Preprocessed user: (16, 26)


Unnamed: 0,phone_no_m,arpu_202004,idcard_cnt,city_name_missing,city_name_乐山,city_name_天府新区,city_name_广安,city_name_成都,city_name_绵阳,city_name_自贡,...,county_name_成都直属部门,county_name_武侯分公司,county_name_江油分公司,county_name_游仙分公司,county_name_自井分公司,county_name_荣县分公司,county_name_金牛分公司,county_name_锦江分公司,county_name_青羊分公司,county_name_高新分公司
0,22d522340df77e2252c1a4d92b4bcb00d515e36f3ec6bf...,-0.773601,-0.625543,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,5220d4b8429bdba3971a7b46a088c6b8fa6710f4060759...,-0.914846,-0.625543,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7d19dd2b50ced56f03d23bf928cf34dc570a48525571a8...,-0.773601,-0.625543,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,134a4a591185c9d3788021896dcfc235e9e0a6a1e3f8a4...,1.261897,-0.625543,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2356dcd6759d50455ddaeed03c838843558e9182d5962f...,-0.302784,1.042572,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0



=== VOC ===
⚠️ Skipping columns for profiling: ['phone_no_m', 'opposite_no_m', 'imei_m']


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|██████████| 5/5 [00:00<00:00, 84.72it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Profiling report saved → results/voc_profile.html
📊 Profiling report for voc saved → results/voc_profile.html
✅ Preprocessed voc: (16, 6)


Unnamed: 0,phone_no_m,call_dur,calltype_id_1,calltype_id_2,city_name_missing,county_name_missing
0,b3dce36871f3e88164b18d4953b114163f008cb51c28fe...,2.086007,1.0,0.0,1.0,1.0
1,b3dce36871f3e88164b18d4953b114163f008cb51c28fe...,2.962599,1.0,0.0,1.0,1.0
2,b3dce36871f3e88164b18d4953b114163f008cb51c28fe...,-0.661911,1.0,0.0,1.0,1.0
3,7ec68a368fbbec3279a6a34847f7959623dbff4638351a...,-0.295813,1.0,0.0,1.0,1.0
4,7ec68a368fbbec3279a6a34847f7959623dbff4638351a...,0.087787,0.0,1.0,1.0,1.0
