# **Prerequisites**

## Mount Google Drive


In [1]:
### **Mount** Google Drive
# Colab-only step: exposes the user's Google Drive under /content/drive so the
# project files referenced by later cells can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Install packages


In [2]:
# Install packages
# Installs the dependencies listed in the project's Extract_requirements.txt
# into the kernel's environment (-q: quiet output, --no-cache-dir: skip pip's cache).
%pip install -q -r /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/Extract_requirements.txt --no-cache-dir



In [3]:
# Project root on the mounted Drive. Later cells use relative paths
# ("configs/...", "results/..."), so the working directory is switched here.
project_path = "/content/drive/MyDrive/Sem-6/coding/github/fraud_detection/"
%cd $project_path
# List the project directory as a quick check that the mount and path are right.
%ls /content/drive/MyDrive/Sem-6/coding/github/fraud_detection

/content/drive/MyDrive/Sem-6/coding/github/fraud_detection
[0m[01;34mconfigs[0m/                       [01;34mfeatures[0m/              requirements.txt   [01;34mtests[0m/
[01;34mdataset[0m/                       [01;34mnotebooks[0m/             [01;34mresults[0m/
Extract_requirements-lock.txt  README.md              run_experiment.py
Extract_requirements.txt       requirements-lock.txt  [01;34msrc[0m/


## Import libraries

In [4]:

import datetime
import os
import pandas as pd
import numpy as np
from scipy.stats import mode
import yaml
import logging
from google.colab import data_table
# Colab-only: switch on Colab's data-table formatter for displayed DataFrames.
data_table.enable_dataframe_formatter()



# Snapshot the installed package versions into a lock file in the current
# working directory (the project root, after the %cd above).
%pip freeze > Extract_requirements-lock.txt



#Basic Methods

## Logging

In [5]:

# Make sure the results directory exists before the file handler opens its log file.
os.makedirs("results", exist_ok=True)

# force=True (Python 3.8+) removes any handlers already attached to the root
# logger before applying this configuration. Without it, basicConfig() is a
# silent no-op whenever the root logger already has handlers, so neither the
# INFO level nor the file handler below would take effect. It also makes this
# cell safe to re-run without stacking duplicate handlers.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        # Explicit UTF-8: log messages in this notebook contain non-ASCII symbols,
        # which would fail to encode under a non-UTF-8 locale default.
        logging.FileHandler("results/data_extract.log", encoding="utf-8"),
    ],
    force=True,
)
logger = logging.getLogger(__name__)



##Config

In [6]:
def load_config(config_path="configs/extract_feature.yaml"):
    """Load the YAML configuration file and return it as a dict.

    Args:
        config_path: Path to the YAML file. Relative paths are resolved
            against the current working directory.

    Returns:
        dict: The parsed top-level mapping of the YAML document.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the document is empty or its top level is not a mapping.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale default.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # safe_load() returns None for an empty document; fail here with a clear
    # message rather than with a TypeError at the first config[...] lookup.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config file {config_path} must contain a YAML mapping at the top level, "
            f"got {type(config).__name__}"
        )

    logger.info(f"Loaded config from {config_path}")
    return config

## CDR dataset

In [7]:
def load_cdr(file_path, nrows=None):
    """Read one CSV file into a DataFrame and coerce datetime-like columns.

    Any column whose name contains "datetime" (case-insensitive) is parsed with
    ``pd.to_datetime(..., errors="coerce")``, so unparseable values become NaT.
    Header names are stripped of surrounding whitespace before returning.

    Args:
        file_path: Path of the CSV file to read.
        nrows: Optional maximum number of rows to read (None reads everything).

    Returns:
        pandas.DataFrame: The loaded table.
    """
    logger.info(f"Loading file: {file_path} (nrows={nrows})")
    frame = pd.read_csv(file_path, nrows=nrows)

    # Pick the datetime-like columns by name, then parse each one.
    datetime_columns = [name for name in frame.columns if "datetime" in name.lower()]
    for name in datetime_columns:
        frame[name] = pd.to_datetime(frame[name], errors="coerce")

    # Normalise headers last, exactly as read_csv delivered them otherwise.
    frame.columns = frame.columns.str.strip()
    return frame



def load_all_data(config):
    """Load every CSV listed in the config into a name -> DataFrame dict.

    Reads ``config["dataset"]["base_path"]`` and ``config["dataset"]["files"]``
    (a mapping of dataset name to file name). If ``config["training"]["sample_size"]``
    is present it is passed to the loader as the row limit; otherwise all rows are read.

    Args:
        config: Parsed configuration mapping.

    Returns:
        dict: Dataset name mapped to the DataFrame loaded by ``load_cdr``.
    """
    dataset_cfg = config["dataset"]
    base_dir = dataset_cfg["base_path"]
    file_map = dataset_cfg["files"]
    row_limit = config.get("training", {}).get("sample_size", None)

    frames = {}
    for key, filename in file_map.items():
        csv_path = os.path.join(base_dir, filename)
        frames[key] = load_cdr(csv_path, nrows=row_limit)
        logger.info(f"Loaded {key} -> {frames[key].shape} from {csv_path}")
    return frames


#Feature Extraction

##Voice

In [8]:
import numpy as np
import pandas as pd
from scipy.stats import mode

def get_voc_feat(df):
    """Extract call-based (voice) features per user (``phone_no_m``).

    Rows whose ``start_datetime`` cannot be parsed are dropped before any
    statistic is computed. The input frame is never modified.

    Args:
        df: Call records with columns ``phone_no_m``, ``opposite_no_m``,
            ``call_dur`` and ``start_datetime`` (header whitespace is tolerated).

    Returns:
        pandas.DataFrame: One row per ``phone_no_m`` with ``total_calls``,
        ``unique_contacts``, ``call_dur_{mean,median,max,min,sum}``,
        ``voc_hour_{mode,unique}`` and ``voc_day_{mode,unique}``. Remaining
        missing values are filled with 0.

    Raises:
        KeyError: If ``start_datetime`` is missing after header clean-up.
    """

    def _first_mode(values):
        # Most frequent value (smallest on ties); NaN when there is no mode.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    calls = df.copy()

    # Strip headers BEFORE looking columns up, so a padded header such as
    # " start_datetime" is found instead of raising KeyError.
    calls.columns = calls.columns.str.strip()
    if "start_datetime" not in calls.columns:
        raise KeyError("Column 'start_datetime' not found in VOC dataset!")

    # Replace the column outright. Assigning through .loc[:, col] writes into
    # the existing (possibly object-dtype) column on pandas >= 2.0, which
    # leaves the dtype unchanged and makes the .dt accessor below fail.
    calls["start_datetime"] = pd.to_datetime(calls["start_datetime"], errors="coerce")

    invalid_dt = int(calls["start_datetime"].isna().sum())
    if invalid_dt > 0:
        print(f"⚠️ Found {invalid_dt} invalid 'start_datetime' entries — dropping them.")
        calls = calls.dropna(subset=["start_datetime"]).copy()

    # --- Hour of day and day of month of each call
    calls["hour"] = calls["start_datetime"].dt.hour
    calls["day"] = calls["start_datetime"].dt.day

    # --- Base per-user table (first-appearance order)
    features = calls[["phone_no_m"]].drop_duplicates().copy()
    by_user = calls.groupby("phone_no_m")

    # --- Call count & unique contacts
    contact_stats = by_user["opposite_no_m"].agg(
        total_calls="count",
        unique_contacts="nunique"
    )
    features = features.merge(contact_stats, on="phone_no_m", how="left")

    # --- Call duration stats
    duration_stats = by_user["call_dur"].agg(
        call_dur_mean="mean",
        call_dur_median="median",
        call_dur_max="max",
        call_dur_min="min",
        call_dur_sum="sum"
    )
    features = features.merge(duration_stats, on="phone_no_m", how="left")

    # --- Time preference: most frequent call hour and day
    hour_stats = by_user["hour"].agg(
        voc_hour_mode=_first_mode,
        voc_hour_unique="nunique"
    )
    features = features.merge(hour_stats, on="phone_no_m", how="left")

    day_stats = by_user["day"].agg(
        voc_day_mode=_first_mode,
        voc_day_unique="nunique"
    )
    features = features.merge(day_stats, on="phone_no_m", how="left")

    # --- Fill whatever is still missing (e.g. all-NaN durations) with 0
    features = features.fillna(0)

    print(f"✅ VOC features extracted for {len(features)} users.")
    return features


##SMS

In [9]:
def get_sms_feats(df):
    """Extract SMS-based behavioural features per user (``phone_no_m``).

    Rows whose ``request_datetime`` cannot be parsed are dropped before any
    statistic is computed, matching how ``get_voc_feat`` treats invalid call
    timestamps. The input frame is never modified.

    Args:
        df: SMS records with columns ``phone_no_m``, ``opposite_no_m`` and
            ``request_datetime`` (header whitespace is tolerated).

    Returns:
        pandas.DataFrame: One row per ``phone_no_m`` with ``sms_count``,
        ``sms_unique_contacts``, ``sms_hour_{mode,unique}`` and
        ``sms_day_{mode,unique}``. Remaining missing values are filled with 0.

    Raises:
        KeyError: If ``request_datetime`` is missing after header clean-up.
    """

    def _first_mode(values):
        # Most frequent value (smallest on ties); NaN when there is no mode.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    messages = df.copy()

    # Strip headers BEFORE looking columns up, so a padded header such as
    # " request_datetime" is found instead of raising KeyError.
    messages.columns = messages.columns.str.strip()
    if "request_datetime" not in messages.columns:
        raise KeyError("Column 'request_datetime' not found in SMS dataset!")

    # Replace the column outright. Assigning through .loc[:, col] writes into
    # the existing (possibly object-dtype) column on pandas >= 2.0, which
    # leaves the dtype unchanged and makes the .dt accessor below fail.
    messages["request_datetime"] = pd.to_datetime(
        messages["request_datetime"], errors="coerce"
    )

    # Unparseable timestamps were coerced to NaT above; drop those rows so they
    # do not contribute NaN hours/days or inflate the message counts.
    bad_rows = int(messages["request_datetime"].isna().sum())
    if bad_rows > 0:
        print(f"⚠️ Found {bad_rows} invalid datetime entries — dropping them.")
        messages = messages.dropna(subset=["request_datetime"]).copy()

    # --- Hour of day and day of month of each message
    messages["hour"] = messages["request_datetime"].dt.hour
    messages["day"] = messages["request_datetime"].dt.day

    # --- Base unique user set (first-appearance order)
    features = messages[["phone_no_m"]].drop_duplicates().copy()
    by_user = messages.groupby("phone_no_m")

    # --- Message counts & unique contacts
    contact_stats = by_user["opposite_no_m"].agg(
        sms_count="count",
        sms_unique_contacts="nunique"
    )
    features = features.merge(contact_stats, on="phone_no_m", how="left")

    # --- Hourly and daily messaging patterns
    hour_stats = by_user["hour"].agg(
        sms_hour_mode=_first_mode,
        sms_hour_unique="nunique"
    )
    features = features.merge(hour_stats, on="phone_no_m", how="left")

    day_stats = by_user["day"].agg(
        sms_day_mode=_first_mode,
        sms_day_unique="nunique"
    )
    features = features.merge(day_stats, on="phone_no_m", how="left")

    features = features.fillna(0)

    print(f"✅ SMS features extracted for {len(features)} users.")
    return features


## App

In [10]:
def get_app_feats(df):
    """Summarise app usage per user (``phone_no_m``).

    Args:
        df: App records with columns ``phone_no_m``, ``busi_name`` and ``flow``.

    Returns:
        pandas.DataFrame: One row per ``phone_no_m`` (first-appearance order)
        with ``unique_apps`` (distinct ``busi_name`` values), ``flow_mean`` and
        ``flow_sum``.
    """
    # Compute all three statistics in a single grouped pass.
    per_user = df.groupby("phone_no_m").agg(
        unique_apps=("busi_name", "nunique"),
        flow_mean=("flow", "mean"),
        flow_sum=("flow", "sum"),
    )

    # Left-join onto the distinct user list to keep first-appearance order.
    users = df[["phone_no_m"]].drop_duplicates().copy()
    return users.merge(per_user, on="phone_no_m", how="left")

##User

In [11]:
def get_user_feats(df):
    """Build per-user features from the user table: label plus ARPU statistics.

    The row-wise mean, variance (pandas default, ddof=1), max and min are taken
    over every column whose name starts with ``arpu_``. The input frame is not
    modified, so calling the function repeatedly on the same frame gives the
    same result.

    Args:
        df: User table with columns ``phone_no_m``, ``label`` and one or more
            ``arpu_*`` columns.

    Returns:
        pandas.DataFrame: Columns ``phone_no_m``, ``label``, ``arpu_mean``,
        ``arpu_var``, ``arpu_max``, ``arpu_min``.
    """
    derived_cols = ("arpu_mean", "arpu_var", "arpu_max", "arpu_min")

    # Skip the derived names themselves: they also match the "arpu_" prefix, so
    # a frame that already carries them (e.g. one mutated by an earlier run of
    # this cell) would otherwise feed its own statistics back into the inputs.
    # NOTE(review): assumes no raw source column uses one of these four names.
    arpu_cols = [
        c for c in df.columns
        if c.startswith("arpu_") and c not in derived_cols
    ]
    arpu = df[arpu_cols]

    # Build the statistics on a separate frame instead of writing new columns
    # onto the caller's DataFrame.
    stats = df[["phone_no_m"]].assign(
        arpu_mean=arpu.mean(axis=1),
        arpu_var=arpu.var(axis=1),
        arpu_max=arpu.max(axis=1),
        arpu_min=arpu.min(axis=1),
    )

    users = df[["phone_no_m", "label"]].drop_duplicates().copy()
    return users.merge(stats, on="phone_no_m", how="left")

#Build Aggregation

In [12]:
def aggregate_features(df, mode="full", window=10, time_col=None, extractor=None):
    """Run a feature extractor over the whole frame or over per-user snapshots.

    Args:
        df: Event table containing a ``phone_no_m`` column.
        mode: ``"full"`` applies ``extractor`` once to ``df``.
            ``"n_events"`` applies it to every sliding window of ``window``
            consecutive rows of each user. ``"timely"`` applies it to the
            cumulative rows of each user up to successive cut-off times spaced
            ``window`` days apart, starting ``window`` days after the user's
            first event.
        window: Number of events (``"n_events"``) or days (``"timely"``).
        time_col: Timestamp column. When given, rows are ordered by user and
            time first; it is required for ``"timely"``.
        extractor: Callable taking a DataFrame and returning a feature
            DataFrame (e.g. ``get_voc_feat``).

    Returns:
        pandas.DataFrame: The extractor output for ``"full"``; otherwise all
        snapshots concatenated, each tagged with ``phone_no_m`` and either
        ``snapshot_index`` (``"n_events"``) or ``snapshot_time`` (``"timely"``).
        An empty DataFrame is returned when no user yields a snapshot.

    Raises:
        TypeError: If ``extractor`` is not provided.
        ValueError: If ``mode`` is unknown, ``window`` is not positive, or
            ``time_col`` is missing in ``"timely"`` mode.
    """
    if extractor is None:
        raise TypeError("extractor must be a callable feature function")

    if mode == "full":
        return extractor(df)

    # Validate up front so a bad call fails with a clear message instead of an
    # unrelated "No objects to concatenate" error at the end.
    if mode not in ("n_events", "timely"):
        raise ValueError(f"Unknown aggregation mode: {mode!r}")
    if window <= 0:
        raise ValueError(f"window must be positive, got {window!r}")
    if mode == "timely" and not time_col:
        raise ValueError("time_col must be provided for timely mode")

    # Sort for time-based order
    if time_col:
        df = df.sort_values(by=["phone_no_m", time_col])

    snapshots = []
    for phone, group in df.groupby("phone_no_m"):
        if mode == "n_events":
            # Users with fewer than `window` events produce no snapshot.
            for i in range(window, len(group) + 1):
                feat = extractor(group.iloc[i - window:i])
                feat["snapshot_index"] = i
                feat["phone_no_m"] = phone
                snapshots.append(feat)
        else:  # mode == "timely"
            first_seen = group[time_col].min()
            last_seen = group[time_col].max()
            if pd.isna(first_seen):
                # No valid timestamp for this user, so no cut-off can be built.
                continue
            cutoffs = pd.date_range(
                start=first_seen + pd.Timedelta(days=window),
                end=last_seen,
                freq=f"{window}D"
            )
            for end_time in cutoffs:
                # Cumulative snapshot: everything up to and including the cut-off.
                feat = extractor(group[group[time_col] <= end_time])
                feat["snapshot_time"] = end_time
                feat["phone_no_m"] = phone
                snapshots.append(feat)

    # pd.concat() raises on an empty list; report "no snapshots" as an empty frame.
    if not snapshots:
        return pd.DataFrame()
    return pd.concat(snapshots, ignore_index=True)


#Extract Features

In [13]:
def extract_all_features(data, config, mode="full", window=10):
    """Extract and aggregate every available feature type.

    Datasets are processed in the fixed order voc, sms, app, user; a dataset
    that is not a key of ``data`` is skipped. The feature sets are returned
    separately and are not merged here.

    Args:
        data: Mapping of dataset name to DataFrame.
        config: Parsed configuration (accepted for interface compatibility;
            not read by this function).
        mode: Aggregation mode forwarded to ``aggregate_features``.
        window: Window size forwarded to ``aggregate_features``.

    Returns:
        list: One feature DataFrame per dataset present in ``data``, in the
        order voc, sms, app, user.
    """
    logger.info(f"Starting feature extraction: mode={mode}, window={window}")

    collected = []

    def _run(key, extractor, time_col=None):
        # Log, aggregate one dataset, and append its feature frame.
        logger.info(f"Extracting {key.upper()} features...")
        collected.append(
            aggregate_features(
                data[key],
                mode=mode,
                window=window,
                time_col=time_col,
                extractor=extractor,
            )
        )

    # Voice and SMS carry an event timestamp; app and user tables do not.
    if "voc" in data:
        _run("voc", get_voc_feat, time_col="start_datetime")
    if "sms" in data:
        _run("sms", get_sms_feats, time_col="request_datetime")
    if "app" in data:
        _run("app", get_app_feats)
    if "user" in data:
        _run("user", get_user_feats)

    return collected



#Save new features

#Load

In [14]:
# Read the extraction settings (dataset paths, aggregation mode, output dirs).
config = load_config("configs/extract_feature.yaml")

# Load raw data: one DataFrame per entry of config["dataset"]["files"].
data = load_all_data(config)




#Run Extract

In [15]:
# Extract features
config_mode = config["aggregation"]["mode"]
config_window = config["aggregation"]["window"]

# Show the timestamp dtypes of the event tables that were actually loaded.
for dataset_name, datetime_col in (("sms", "request_datetime"), ("voc", "start_datetime")):
    if dataset_name in data:
        print(f"📅 {dataset_name.upper()} datetime dtype:", data[dataset_name][datetime_col].dtype)


features = extract_all_features(
    data,
    config,
    mode=config_mode,
    window=config_window
)

# extract_all_features() skips datasets that are missing from `data`, so the
# names must be filtered the same way. Zipping against the fixed four-name list
# would shift the labels and save e.g. user features under the "app" name.
feature_names = [name for name in ("voc", "sms", "app", "user") if name in data]
if len(feature_names) != len(features):
    raise RuntimeError(
        f"Expected {len(feature_names)} feature sets ({feature_names}), got {len(features)}"
    )

# Create output directories if they don't exist
features_dir = config["features"]["features_dir"]
os.makedirs(features_dir, exist_ok=True)

# The "output" section is optional; fall back to the features directory.
results_dir = (config.get("output") or {}).get("results_dir", features_dir)
os.makedirs(results_dir, exist_ok=True)

# Add timestamp to filenames
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Save feature files
for name, df in zip(feature_names, features):
    output_filename = f"{name}_features_{config_mode}_{config_window}_{timestamp}.csv"
    output_path = os.path.join(features_dir, output_filename)
    df.to_csv(output_path, index=False)
    display(df.head())
    logger.info(f"✅ Saved {name} features → {output_path}")

📅 SMS datetime dtype: datetime64[ns]
📅 VOC datetime dtype: datetime64[ns]
✅ VOC features extracted for 165 users.
✅ SMS features extracted for 495 users.


Unnamed: 0,phone_no_m,total_calls,unique_contacts,call_dur_mean,call_dur_median,call_dur_max,call_dur_min,call_dur_sum,voc_hour_mode,voc_hour_unique,voc_day_mode,voc_day_unique
0,3b6bef554a4754fa9977b50d99e6777c9fb26e585961a4...,8,6,-0.290461,-0.284513,-0.184069,-0.400818,-2.323687,17,7,19,4
1,ecd1b3f5a38576ef9872d1cb7a762f31869153de736e7a...,7,7,-0.178783,-0.226361,0.29172,-0.369098,-1.251478,13,4,28,3
2,41a97bdfe77f3210deba78689baecb110922878be2f14c...,4,4,-0.255437,-0.284513,-0.125917,-0.326806,-1.02175,16,3,18,3
3,f9aa95c4d88f05adad8e9d1bc10f140f922c66fe533c81...,6,2,-0.154112,-0.168209,-0.025473,-0.2898,-0.924672,15,5,12,3
4,be5f4f7a1a71352817c7432390b8ad51a45e09dc1efbb1...,5,3,-0.307774,-0.363812,-0.088911,-0.416677,-1.538872,7,5,28,4


Unnamed: 0,phone_no_m,sms_count,sms_unique_contacts,sms_hour_mode,sms_hour_unique,sms_day_mode,sms_day_unique
0,0251387744988114430181c3e680a3733001a26fe686bd...,35,5,8,8,9,9
1,296cfae1d838070c4dd05a125a85c3d29bbb95f713c2ea...,1,1,17,1,1,1
2,39de6ef3a87b8e660e42496450c54b731f3621ca708944...,25,9,14,10,1,1
3,7d82b7b2a55085cbda35f38e2737cf174703f7cba65835...,9,6,14,5,1,1
4,be23b76c58dac6eaaaa1f997c7320db5112eaa2d266aeb...,1,1,9,1,1,1


Unnamed: 0,phone_no_m,unique_apps,flow_mean,flow_sum
0,416cec0f25b93f08bfd9cff44382c5da3a6346beb16a2c...,1,0.0,0.0
1,26fcb7c6f4125ee5445756d4ff5346c29f2aff7d0f2e31...,1,0.0,0.0
2,1a58c82eeefdb00ea6abf2e6010a8e808b27e1802b764d...,12,9.280285,111.363424
3,747fb64152de320b22ee724cafcef121b3dfefb3b6e153...,14,4.780563,66.927876
4,650aa8b1d78ec10578f04aacbea64943279e65f10e608d...,15,0.043621,0.654315


Unnamed: 0,phone_no_m,label,arpu_mean,arpu_var,arpu_max,arpu_min
0,672ddbf02a5544d32e4ecc9433b1981bffe23bf912273a...,0,-0.044756,0.005198,0.059162,-0.145352
1,5e1272273e041e82cb275ae877710be98cdaf5b0a8f34d...,0,0.212189,0.114297,0.613593,-0.218675
2,eaab3472ec87b076e69e6e8bb62b14341638fc63661a6c...,0,-0.256875,0.039926,0.06427,-0.396704
3,0ce1bb415704178bf44e9c9b431a39b083a132c8e6d99f...,0,0.091713,0.568479,1.350638,-0.424354
4,28b87f35f63f65096a53e3a4c97eaffd4a6c43ffa7e92d...,0,0.013576,0.006913,0.125172,-0.099927
