# **Prerequisites**

##Mount google drive


In [16]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Install packages


In [17]:
# Install the packages listed in Extract_requirements.txt (quiet output, pip cache disabled).
%pip install -q -r /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/Extract_requirements.txt --no-cache-dir



In [18]:
# Project root on the mounted Drive.
project_path = "/content/drive/MyDrive/Sem-6/coding/github/fraud_detection/"
# Switch the working directory to the project root, then list its contents.
%cd $project_path
%ls /content/drive/MyDrive/Sem-6/coding/github/fraud_detection

/content/drive/MyDrive/Sem-6/coding/github/fraud_detection
[0m[01;34mconfigs[0m/                       [01;34mfeatures[0m/              requirements.txt   [01;34mtests[0m/
[01;34mdataset[0m/                       [01;34mnotebooks[0m/             [01;34mresults[0m/
Extract_requirements-lock.txt  README.md              run_experiment.py
Extract_requirements.txt       requirements-lock.txt  [01;34msrc[0m/


## Import libraries

In [19]:

import datetime
import os
import pandas as pd
import numpy as np
from scipy.stats import mode
import yaml
import logging
from tqdm import tqdm

# Enable Colab's data_table formatter for DataFrame display.
from google.colab import data_table
data_table.enable_dataframe_formatter()



# Redirect the output of `pip freeze` into Extract_requirements-lock.txt (relative to the current directory).
%pip freeze > Extract_requirements-lock.txt



#Basic Methods

## Logging

In [20]:

# Make sure results directory exists
os.makedirs("results", exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("results/data_extract.log")
    ]
)
logger = logging.getLogger(__name__)



##Config

In [21]:
def load_config(config_path="configs/extract_feature.yaml"):
    """Load a YAML config file.

    Args:
        config_path: Path to the YAML file. A relative path is resolved against
            the current working directory.

    Returns:
        The parsed document exactly as produced by ``yaml.safe_load``
        (``None`` if the file is empty).
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding (matters for non-ASCII values in the config).
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    logger.info(f"Loaded config from {config_path}")
    return config

## CDR dataset

In [22]:
def load_cdr(file_path, nrows=None):
    """Read one CSV file into a DataFrame and coerce its datetime columns.

    Args:
        file_path: Path of the CSV file to read.
        nrows: Maximum number of rows to read; ``None`` reads the whole file.

    Returns:
        DataFrame in which every column whose name contains "datetime"
        (case-insensitive) has been converted with ``pd.to_datetime``
        (unparseable values become NaT) and whose column names are stripped
        of surrounding whitespace.
    """
    logger.info(f"Loading file: {file_path} (nrows={nrows})")
    frame = pd.read_csv(file_path, nrows=nrows)

    # Pick out the datetime-like columns by name, then convert each in place.
    datetime_cols = [name for name in frame.columns if "datetime" in name.lower()]
    for name in datetime_cols:
        frame[name] = pd.to_datetime(frame[name], errors="coerce")

    # Header clean-up happens last, so the conversion above uses the raw names.
    frame.columns = frame.columns.str.strip()
    return frame



def load_all_data(config):
    """
    Read every CSV listed in the config and return them keyed by dataset name.

    Reads ``config["dataset"]["base_path"]`` (directory) and
    ``config["dataset"]["files"]`` (mapping of name -> file name). If
    ``config["training"]["sample_size"]`` is present it is passed to
    ``load_cdr`` as the row limit; otherwise all rows are read.

    Returns:
        dict mapping each name in ``files`` to its loaded DataFrame.
    """
    dataset_cfg = config["dataset"]
    base_dir = dataset_cfg["base_path"]
    file_map = dataset_cfg["files"]
    row_limit = config.get("training", {}).get("sample_size", None)

    loaded = {}
    for key, filename in file_map.items():
        csv_path = os.path.join(base_dir, filename)
        frame = load_cdr(csv_path, nrows=row_limit)
        loaded[key] = frame
        logger.info(f"Loaded {key} -> {frame.shape} from {csv_path}")
    return loaded


#Feature Extraction

##Voice

In [23]:
def get_voc_feat(df):
    """
    Build one row of voice-call (VOC) features per ``phone_no_m``.

    Required columns: phone_no_m, opposite_no_m, start_datetime.
    Optional columns: call_dur (adds duration statistics) and calltype_id
    (adds ``call_out_in_ratio``).

    Rows whose ``start_datetime`` cannot be parsed are discarded before any
    aggregation. The input frame is not modified.

    Raises:
        ValueError: if ``start_datetime`` is not a column of ``df``.
    """
    if "start_datetime" not in df.columns:
        raise ValueError("❌ Missing 'start_datetime' column in VOC data")

    calls = df.copy()
    calls["start_datetime"] = pd.to_datetime(calls["start_datetime"], errors="coerce")
    calls = calls.dropna(subset=["start_datetime"]).copy()
    calls["hour"] = calls["start_datetime"].dt.hour
    calls["day"] = calls["start_datetime"].dt.day  # day of month

    def _most_common(values):
        # Most frequent value of the group (scipy returns the smallest on ties).
        return mode(values, keepdims=True)[0][0] if len(values) > 0 else np.nan

    # One row per user, in order of first appearance; stat tables are joined onto it.
    feats = calls[["phone_no_m"]].drop_duplicates().copy()
    per_user = calls.groupby("phone_no_m")

    # Call volume and contact diversity
    stat_tables = [
        per_user["opposite_no_m"].agg(total_calls="count", unique_contacts="nunique")
    ]

    # Duration statistics (only when durations are available)
    if "call_dur" in calls.columns:
        stat_tables.append(
            per_user["call_dur"].agg(
                call_dur_mean="mean",
                call_dur_median="median",
                call_dur_max="max",
                call_dur_min="min",
                call_dur_sum="sum",
                call_dur_std="std",
            )
        )

    # Time-of-day and day-of-month preferences
    stat_tables.append(
        per_user["hour"].agg(voc_hour_mode=_most_common, voc_active_hours="nunique")
    )
    stat_tables.append(
        per_user["day"].agg(voc_day_mode=_most_common, voc_active_days="nunique")
    )

    # Call direction balance (optional); the code treats 1 as outgoing and 2 as incoming
    if "calltype_id" in calls.columns:
        by_type = calls.groupby(["phone_no_m", "calltype_id"]).size().unstack(fill_value=0)
        outgoing = by_type[1] if 1 in by_type.columns else pd.Series(0, index=by_type.index)
        incoming = by_type[2] if 2 in by_type.columns else pd.Series(0, index=by_type.index)
        # The small constant keeps the ratio finite when there are no incoming calls.
        by_type["call_out_in_ratio"] = outgoing.astype(float) / (incoming.astype(float) + 1e-5)
        stat_tables.append(by_type[["call_out_in_ratio"]])

    for table in stat_tables:
        feats = feats.merge(table, on="phone_no_m", how="left")

    return feats


##SMS

In [24]:
def get_sms_feats(df):
    """
    Build one row of SMS features per ``phone_no_m``.

    Required columns: phone_no_m, opposite_no_m, request_datetime.
    Optional column: calltype_id (adds ``sms_send_recv_ratio``).

    Rows whose ``request_datetime`` cannot be parsed are discarded before any
    aggregation. The input frame is not modified.

    Raises:
        ValueError: if ``request_datetime`` is not a column of ``df``.
    """
    if "request_datetime" not in df.columns:
        raise ValueError("❌ Missing 'request_datetime' column in SMS data")

    messages = df.copy()
    messages["request_datetime"] = pd.to_datetime(messages["request_datetime"], errors="coerce")
    messages = messages.dropna(subset=["request_datetime"]).copy()
    messages["hour"] = messages["request_datetime"].dt.hour
    messages["day"] = messages["request_datetime"].dt.day  # day of month

    def _peak(values):
        # Most frequent value of the group (scipy returns the smallest on ties).
        return mode(values, keepdims=True)[0][0] if len(values) > 0 else np.nan

    # One row per user, in order of first appearance; stat tables are joined onto it.
    feats = messages[["phone_no_m"]].drop_duplicates().copy()
    per_user = messages.groupby("phone_no_m")

    stat_tables = [
        # Message volume and contact diversity
        per_user["opposite_no_m"].agg(sms_total="count", sms_unique_contacts="nunique"),
        # Time-of-day behaviour
        per_user["hour"].agg(sms_active_hours="nunique", sms_peak_hour=_peak),
        # Day-of-month behaviour
        per_user["day"].agg(sms_active_days="nunique", sms_peak_day=_peak),
    ]

    # Send/receive balance (optional); the code treats 1 as send and 2 as receive
    if "calltype_id" in messages.columns:
        by_type = messages.groupby(["phone_no_m", "calltype_id"]).size().unstack(fill_value=0)
        sent = by_type[1] if 1 in by_type.columns else pd.Series(0, index=by_type.index)
        received = by_type[2] if 2 in by_type.columns else pd.Series(0, index=by_type.index)
        # The small constant keeps the ratio finite when nothing was received.
        by_type["sms_send_recv_ratio"] = sent.astype(float) / (received.astype(float) + 1e-5)
        stat_tables.append(by_type[["sms_send_recv_ratio"]])

    for table in stat_tables:
        feats = feats.merge(table, on="phone_no_m", how="left")

    return feats


##app

In [25]:
def get_app_feats(df):
    """Build per-user application-usage features from monthly APP records.

    Uses the columns phone_no_m, month_id, flow and busi_name. Rows whose
    ``month_id`` cannot be parsed as a date are dropped; non-numeric ``flow``
    values count as 0. Usage is first summed per (user, month) and then
    summarised per user. Missing values in the result (e.g. the standard
    deviation for a user with a single month) are replaced by 0.

    Raises:
        ValueError: if ``month_id`` is not a column of ``df``.
    """
    if "month_id" not in df.columns:
        raise ValueError("❌ APP dataset must contain a 'month_id' column.")

    usage = df.copy()

    # Parse the month and discard rows where that fails.
    usage["month_id"] = pd.to_datetime(usage["month_id"], errors="coerce")
    usage = usage.dropna(subset=["month_id"])

    # Traffic volume as a number; anything unparseable counts as zero.
    usage["flow"] = pd.to_numeric(usage["flow"], errors="coerce").fillna(0)

    # Level 1: one row per (user, month).
    per_month = usage.groupby(["phone_no_m", "month_id"]).agg(
        total_flow=pd.NamedAgg(column="flow", aggfunc="sum"),
        unique_apps=pd.NamedAgg(column="busi_name", aggfunc="nunique"),
    )
    per_month = per_month.reset_index()

    # Level 2: one row per user, summarising that user's months.
    per_user = per_month.groupby("phone_no_m").agg(
        app_months_active=pd.NamedAgg(column="month_id", aggfunc="nunique"),
        app_total_flow=pd.NamedAgg(column="total_flow", aggfunc="sum"),
        app_avg_flow=pd.NamedAgg(column="total_flow", aggfunc="mean"),
        app_std_flow=pd.NamedAgg(column="total_flow", aggfunc="std"),
        app_unique_apps_mean=pd.NamedAgg(column="unique_apps", aggfunc="mean"),
        app_unique_apps_max=pd.NamedAgg(column="unique_apps", aggfunc="max"),
    )
    per_user = per_user.reset_index()

    return per_user.fillna(0)


##User

In [26]:
def get_user_feats(df):
    """Extract per-user static and ARPU trend features (wide monthly format).

    Every column whose name starts with ``arpu_`` is treated as one monthly
    ARPU value; the columns are taken in the order they appear in ``df`` and
    that order is used as the time axis for the trend.

    Args:
        df: User table with phone_no_m, city_name, county_name, idcard_cnt and
            at least one ``arpu_*`` column. ``label`` is optional: it is kept
            when present and simply omitted when the table has no labels
            (previously an unlabelled table raised KeyError).

    Returns:
        DataFrame with the static columns, ``label`` if available, and
        arpu_mean / arpu_std / arpu_min / arpu_max / arpu_range / arpu_trend.
        ``arpu_trend`` is the slope of a degree-1 least-squares fit over the
        non-missing months: NaN when every month is missing, 0 when only one
        month is available.

    Raises:
        ValueError: if ``df`` has no ``arpu_*`` column.
        KeyError: if one of the required static columns is missing.
    """
    df = df.copy()

    # Identify ARPU columns dynamically
    arpu_cols = [c for c in df.columns if isinstance(c, str) and c.startswith("arpu_")]
    if not arpu_cols:
        raise ValueError("❌ No ARPU columns found in USER dataset.")

    # Convert ARPU columns to numbers; unparseable entries become NaN
    df[arpu_cols] = df[arpu_cols].apply(pd.to_numeric, errors="coerce")

    # Snapshot of the raw monthly values, so the derived arpu_* columns added
    # below can never leak into the statistics.
    arpu = df[arpu_cols]

    # --- Basic statistics (row-wise, NaN months are skipped) ---
    df["arpu_mean"] = arpu.mean(axis=1)
    df["arpu_std"] = arpu.std(axis=1)
    df["arpu_min"] = arpu.min(axis=1)
    df["arpu_max"] = arpu.max(axis=1)
    df["arpu_range"] = df["arpu_max"] - df["arpu_min"]

    # --- Trend: slope of a straight-line fit over the available months ---
    month_idx = np.arange(len(arpu_cols))
    slopes = []
    for row in arpu.to_numpy(dtype=float):
        observed = ~np.isnan(row)
        n_observed = int(observed.sum())
        if n_observed == 0:
            slopes.append(np.nan)   # nothing to fit
        elif n_observed == 1:
            slopes.append(0.0)      # a single point has no trend
        else:
            slopes.append(np.polyfit(month_idx[observed], row[observed], 1)[0])
    df["arpu_trend"] = slopes

    # --- Keep only relevant columns ---
    keep_cols = ["phone_no_m", "city_name", "county_name", "idcard_cnt"]
    if "label" in df.columns:
        keep_cols.append("label")
    keep_cols += ["arpu_mean", "arpu_std", "arpu_min", "arpu_max", "arpu_range", "arpu_trend"]

    return df[keep_cols]


#Build Aggregation

In [27]:
def aggregate_features(df, mode="full", window=10, time_col=None, extractor=None):
    """
    Feature aggregation per user.

    Supports:
      - full     → aggregate all data for each user once
      - n_events → per-user window over the last `window` events
      - timely   → per-user window over the last `window` days, counted back
                   from that user's most recent event

    Args:
        df: Event table containing ``phone_no_m`` (and ``time_col`` for the
            windowed modes).
        mode: One of "full", "n_events", "timely". (The parameter shadows
            scipy's ``mode`` only inside this function.)
        window: Number of events ("n_events") or days ("timely").
        time_col: Name of the timestamp column used for ordering/filtering.
            When falsy, the windowed modes fall back to one full aggregation.
        extractor: Callable taking a DataFrame and returning a feature
            DataFrame.

    Returns:
        The extractor output for "full" (or when no time column is given);
        otherwise the per-user extractor outputs concatenated into one frame,
        or an empty frame with a ``phone_no_m`` column if no user produced a
        window.

    Raises:
        TypeError: if ``extractor`` is not callable.
        ValueError: if ``mode`` is unknown or ``window`` is out of range for
            the selected mode.
    """

    logger.info(f"Aggregating with mode={mode}, window={window}, time_col={time_col}")

    if not callable(extractor):
        raise TypeError("extractor must be a callable that takes a DataFrame and returns a DataFrame")

    # Fail loudly on a misspelled mode instead of silently producing an empty result.
    if mode not in ("full", "n_events", "timely"):
        raise ValueError(f"Unknown aggregation mode {mode!r}; expected 'full', 'n_events' or 'timely'")

    if mode == "full":
        return extractor(df)

    if not time_col:
        # Fallback for datasets without time column (e.g., user, app)
        logger.info("No time column provided — running full aggregation only once.")
        return extractor(df)

    # iloc[-0:] would select the whole group, so a zero/negative event window is rejected.
    if mode == "n_events" and window < 1:
        raise ValueError(f"window must be >= 1 for mode='n_events', got {window!r}")
    if mode == "timely" and window < 0:
        raise ValueError(f"window must be >= 0 for mode='timely', got {window!r}")

    # Rows without a timestamp are removed up front: sort_values() places missing
    # values last, so they would otherwise occupy the "most recent N events" slots.
    # (In "timely" mode they never pass the cutoff comparison anyway.)
    # Sorting once here also orders each group, so no per-group re-sort is needed.
    events = df.dropna(subset=[time_col]).sort_values(by=["phone_no_m", time_col])

    snapshots = []
    for phone, group in tqdm(events.groupby("phone_no_m"), desc=f"{mode} aggregation"):
        if mode == "n_events":
            subset = group.iloc[-window:]
        else:  # "timely"
            cutoff = group[time_col].max() - pd.Timedelta(days=window)
            subset = group[group[time_col] >= cutoff]

        if subset.empty:
            continue

        feat = extractor(subset)
        # Make sure every snapshot can be attributed to its user.
        if "phone_no_m" not in feat.columns:
            feat["phone_no_m"] = phone
        snapshots.append(feat)

    if not snapshots:
        logger.warning(f"No snapshots generated for mode={mode}, window={window}. Returning empty DataFrame.")
        return pd.DataFrame(columns=["phone_no_m"])

    return pd.concat(snapshots, ignore_index=True)


#Extract Features

In [28]:
def extract_all_features(data, config, mode="full", window=10):
    """Run every feature extractor whose input is available.

    Args:
        data: Mapping of dataset name ("voc", "sms", "app", "user") to DataFrame.
        config: Accepted for interface compatibility; not read by this function.
        mode: Aggregation mode applied to the event-level sets (voc, sms).
            The app and user sets are always aggregated with mode "full".
        window: Window size forwarded to ``aggregate_features``.

    Returns:
        List of feature DataFrames in the order voc, sms, app, user. A set
        whose input is missing or empty is skipped (with a warning), so the
        list can be shorter than four; it is empty when nothing was produced.
    """
    logger.info(f"🚀 Starting feature extraction: mode={mode}, window={window}")

    # One entry per feature set:
    # (data key, start message, skip message, aggregation mode, time column, extractor)
    plan = (
        # Voice (VOC) - event-level, time-aware
        ("voc", "📞 Extracting VOC features...", "⚠️ VOC data not found or empty.",
         mode, "start_datetime", get_voc_feat),
        # SMS - event-level, time-aware
        ("sms", "💬 Extracting SMS features...", "⚠️ SMS data not found or empty.",
         mode, "request_datetime", get_sms_feats),
        # App - monthly contextual; always static aggregation, no time column
        ("app", "📱 Extracting APP (monthly) features...", "⚠️ APP data not found or empty.",
         "full", None, get_app_feats),
        # User - static (fees, trends); always static aggregation, no time column
        ("user", "👤 Extracting USER (static/trend) features...", "⚠️ USER data not found or empty.",
         "full", None, get_user_feats),
    )

    results = []
    for key, start_msg, skip_msg, agg_mode, time_col, extractor in plan:
        if key not in data or data[key].empty:
            logger.warning(skip_msg)
            continue

        logger.info(start_msg)
        results.append(
            aggregate_features(
                df=data[key],
                mode=agg_mode,
                window=window,
                time_col=time_col,
                extractor=extractor,
            )
        )

    if results:
        logger.info(f"✅ Feature extraction complete — {len(results)} feature sets created.")
        return results

    logger.warning("❌ No feature sets generated — returning empty list.")
    return []


#Load

In [29]:
# Read the extraction settings from the YAML config (path is relative to the working directory).
config = load_config("configs/extract_feature.yaml")

# Load raw data: one DataFrame per entry in the config's dataset file list
data = load_all_data(config)




#Run Extract

In [30]:
# Extract features
config_mode = config["aggregation"]["mode"]
config_window = config["aggregation"]["window"]
features = extract_all_features(
    data,
    config,
    mode=config_mode,
    window=config_window
)

# Create output directories if they don't exist
features_dir = config["features"]["features_dir"]
os.makedirs(features_dir, exist_ok=True)

results_dir = config["output"].get("results_dir", features_dir)
os.makedirs(results_dir, exist_ok=True)

# Add timestamp to filenames
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# extract_all_features() leaves out any feature set whose input is missing or
# empty, so the returned list cannot be zipped against a fixed list of four
# names (a skipped "voc" would make the SMS features be saved as "voc").
# Rebuild the list of names with the same presence check the extractor uses.
feature_names = [
    name for name in ["voc", "sms", "app", "user"]
    if name in data and not data[name].empty
]
if len(feature_names) != len(features):
    raise RuntimeError(
        f"Expected {len(feature_names)} feature sets ({feature_names}), got {len(features)}"
    )

# Save feature files
for name, df in zip(feature_names, features):
    output_filename = f"{name}_features_{config_mode}_{config_window}_{timestamp}.csv"
    output_path = os.path.join(features_dir, output_filename)
    df.to_csv(output_path, index=False)
    display(name)
    display(df.head())
    logger.info(f"✅ Saved {name} features → {output_path}")

'voc'

Unnamed: 0,phone_no_m,total_calls,unique_contacts,call_dur_mean,call_dur_median,call_dur_max,call_dur_min,call_dur_sum,call_dur_std,voc_hour_mode,voc_active_hours,voc_day_mode,voc_active_days,call_out_in_ratio
0,3b6bef554a4754fa9977b50d99e6777c9fb26e585961a4...,2,1,-0.450978,-0.450978,-0.443726,-0.45823,-0.901956,0.010256,17,1,17,1,0.0
1,ecd1b3f5a38576ef9872d1cb7a762f31869153de736e7a...,2,2,-0.409881,-0.409881,-0.385707,-0.434056,-0.819763,0.034188,9,2,27,2,200000.0
2,41a97bdfe77f3210deba78689baecb110922878be2f14c...,2,2,-0.38329,-0.38329,-0.356698,-0.409881,-0.766579,0.037607,16,1,18,1,0.99999
3,f9aa95c4d88f05adad8e9d1bc10f140f922c66fe533c81...,2,1,-0.281757,-0.281757,-0.255165,-0.308349,-0.563514,0.037607,9,2,12,2,0.0
4,be5f4f7a1a71352817c7432390b8ad51a45e09dc1efbb1...,1,1,-0.443726,-0.443726,-0.443726,-0.443726,-0.443726,,9,1,7,1,0.0


'sms'

Unnamed: 0,phone_no_m,sms_total,sms_unique_contacts,sms_active_hours,sms_peak_hour,sms_active_days,sms_peak_day,sms_send_recv_ratio
0,0251387744988114430181c3e680a3733001a26fe686bd...,3,1,1,12,1,1,0.0
1,296cfae1d838070c4dd05a125a85c3d29bbb95f713c2ea...,1,1,1,17,1,1,0.0
2,39de6ef3a87b8e660e42496450c54b731f3621ca708944...,25,9,10,14,1,1,0.086956
3,7d82b7b2a55085cbda35f38e2737cf174703f7cba65835...,9,6,5,14,1,1,0.0
4,be23b76c58dac6eaaaa1f997c7320db5112eaa2d266aeb...,1,1,1,9,1,1,0.0


'app'

Unnamed: 0,phone_no_m,app_months_active,app_total_flow,app_avg_flow,app_std_flow,app_unique_apps_mean,app_unique_apps_max
0,1a4a0f2b5ff1e911e21c8c079dcb85f8cdf721906dace6...,1,0.12125,0.12125,0.0,4.0,4
1,1a58c82eeefdb00ea6abf2e6010a8e808b27e1802b764d...,1,111.363424,111.363424,0.0,12.0,12
2,26fcb7c6f4125ee5445756d4ff5346c29f2aff7d0f2e31...,1,0.0,0.0,0.0,1.0,1
3,416cec0f25b93f08bfd9cff44382c5da3a6346beb16a2c...,1,0.0,0.0,0.0,1.0,1
4,5ca53dec0b2a9a35fdf8cca6d2ff78d51fe8df11b453ad...,1,0.404959,0.404959,0.0,11.0,11


'user'

Unnamed: 0,phone_no_m,city_name,county_name,idcard_cnt,label,arpu_mean,arpu_std,arpu_min,arpu_max,arpu_range,arpu_trend
0,672ddbf02a5544d32e4ecc9433b1981bffe23bf912273a...,绵阳,江油分公司,1,0,-0.081159,0.066998,-0.165342,0.025044,0.190387,-0.020405
1,5e1272273e041e82cb275ae877710be98cdaf5b0a8f34d...,德阳,旌阳分公司,1,0,0.204525,0.31204,-0.232995,0.547599,0.780594,-0.094934
2,eaab3472ec87b076e69e6e8bb62b14341638fc63661a6c...,成都,金堂分公司,2,0,-0.258075,0.241283,-0.466204,0.150882,0.617086,0.072254
3,0ce1bb415704178bf44e9c9b431a39b083a132c8e6d99f...,成都,高新分公司,2,0,0.072068,0.860228,-0.445708,1.686543,2.132251,-0.095681
4,28b87f35f63f65096a53e3a4c97eaffd4a6c43ffa7e92d...,德阳,旌阳分公司,1,0,-0.016199,0.073172,-0.120964,0.087548,0.208512,-0.021447
