# **Prerequisites**

## Mount Google Drive


In [1]:
# Mount Google Drive at /content/drive so the project files stored there are reachable.
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Install packages


In [2]:
# Project root on the mounted Drive. Keep the trailing slash: the freeze cell
# builds its output path by appending directly to this string.
project_path = "/content/drive/MyDrive/Sem-6/coding/github/fraud_detection/"
# Make the project root the working directory so relative paths such as
# "configs/baseline.yaml" resolve in the cells below.
%cd $project_path
# List the project contents as a quick check that the path is correct.
%ls $project_path

/content/drive/MyDrive/Sem-6/coding/github/fraud_detection
[0m[01;34mconfigs[0m/  [01;34mnotebooks[0m/  [01;34mrequirement[0m/  run_experiment.py  [01;34msrc[0m/
[01;34mdataset[0m/  README.md   [01;34mresults[0m/      [01;34msplits[0m/            [01;34mtests[0m/


## Import libraries

In [3]:

import datetime
import os
import pandas as pd
import numpy as np
from scipy.stats import mode
import yaml
import logging
from tqdm import tqdm
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import altair as alt
from google.colab import data_table
data_table.enable_dataframe_formatter()
import seaborn as sns
import matplotlib.pyplot as plt







# Basic methods

## Config

In [4]:

def load_config(config_path="configs/baseline.yaml"):
    """Load a YAML config file and expand ``${root_path}`` placeholders.

    Every string value in the config (at any nesting depth inside dicts and
    lists) has the literal text ``${root_path}`` replaced by the value of the
    top-level ``root_path`` key. Dictionary keys and non-string values are
    left untouched.

    Args:
        config_path: Path to the YAML file.

    Returns:
        dict: The parsed config with placeholders expanded. An empty YAML
        file yields an empty dict.

    Raises:
        ValueError: If the top level of the YAML document is not a mapping.
    """
    # YAML is defined as UTF-8; do not depend on the platform's locale default.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # safe_load returns None for an empty document.
    if config is None:
        return {}
    if not isinstance(config, dict):
        raise ValueError(
            f"Config {config_path!r} must contain a mapping at the top level, "
            f"got {type(config).__name__}"
        )

    # --- Expand ${root_path} placeholders ---
    # A missing or null root_path expands to ""; a non-string value (e.g. a
    # number) is converted so str.replace() cannot fail on it.
    root_value = config.get("root_path")
    root = "" if root_value is None else str(root_value)

    def expand_paths(obj):
        if isinstance(obj, dict):
            return {k: expand_paths(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [expand_paths(i) for i in obj]
        if isinstance(obj, str) and "${root_path}" in obj:
            return obj.replace("${root_path}", root)
        return obj

    return expand_paths(config)

## CDR dataset

In [5]:
def load_cdr(file_path, nrows=None):
    """Read one CSV file into a DataFrame, parsing its datetime columns.

    Any column whose name contains "datetime" (case-insensitive) is converted
    with ``pd.to_datetime``; values that cannot be parsed become ``NaT``.
    Leading/trailing whitespace is removed from the column names afterwards.

    Args:
        file_path: Path of the CSV file.
        nrows: Optional maximum number of rows to read (``None`` reads all).

    Returns:
        pandas.DataFrame: The loaded table.
    """
    frame = pd.read_csv(file_path, nrows=nrows)

    # Pick the datetime-like columns by name, then coerce each one.
    datetime_columns = [name for name in frame.columns if "datetime" in name.lower()]
    for name in datetime_columns:
        frame[name] = pd.to_datetime(frame[name], errors="coerce")

    # Header clean-up happens last so the lookups above use the raw names.
    return frame.rename(columns=str.strip)



def load_all_data(config):
    """
    Load every CSV listed in ``config["dataset"]["files"]``.

    Each file name is joined onto ``config["dataset"]["base_path"]`` and read
    with ``load_cdr``. When ``config["sample_data"]["size"]`` is present it is
    passed as the row limit for every file; otherwise whole files are read.

    Returns:
        dict: ``{name: DataFrame}`` keyed like the ``files`` mapping.
    """
    dataset_cfg = config["dataset"]
    base_dir, file_map = dataset_cfg["base_path"], dataset_cfg["files"]
    row_limit = config.get("sample_data", {}).get("size", None)

    return {
        key: load_cdr(os.path.join(base_dir, filename), nrows=row_limit)
        for key, filename in file_map.items()
    }


## Create sample

In [6]:
def create_balanced_sample(config, random_state=42):
    """
    Create a class-stratified user sample across the user, app, sms and voc files.

    Picks ``int(size * fraud_ratio)`` fraud users (label 1) and the remaining
    ``size - n_fraud`` as legit users (label 0), then extracts *all records*
    of the picked users from every dataset. The sample is only 50/50
    "balanced" when ``fraud_ratio`` is left at its default of 0.5. If a class
    has fewer users than requested, every user of that class is taken and the
    total is smaller than ``size``.

    Reads the ``sample_data`` section of ``config``:
        full_data   : directory holding the full CSV files
        files       : mapping with keys "user", "app", "sms", "voc" -> file name
        size        : total number of users to pick (default 100)
        fraud_ratio : fraction of picked users that are fraud (default 0.5)
        save_path   : output directory (default ``<full_data>/sampled``)

    Side effects:
        Prints class statistics and progress, creates ``save_path`` if needed
        and writes one ``sample_train_<name>.csv`` per dataset into it.

    Args:
        config: Parsed config dict containing a ``sample_data`` section.
        random_state: Seed passed to ``DataFrame.sample`` for both classes.

    Returns:
        dict: ``{"user": df, "app": df, "sms": df, "voc": df}`` restricted to
        the picked users.

    Raises:
        ValueError: If ``fraud_ratio`` is outside ``[0, 1]``.
    """

    # ✅ Read from 'sample_data' section instead of 'dataset'
    sample_cfg = config["sample_data"]
    base_path = sample_cfg["full_data"]
    files = sample_cfg["files"]
    save_path = sample_cfg.get(
        "save_path",
        os.path.join(base_path, "sampled")
    )

    total_users = sample_cfg.get("size", 100)
    fraud_ratio = sample_cfg.get("fraud_ratio", 0.5)
    # Fail early with a clear message; otherwise a ratio > 1 only surfaces
    # later as a "negative number of rows" error from DataFrame.sample.
    if not 0 <= fraud_ratio <= 1:
        raise ValueError(f"fraud_ratio must be within [0, 1], got {fraud_ratio!r}")

    # --- Load user dataset first (must contain label)
    user_path = os.path.join(base_path, files["user"])
    user_df = (
        pd.read_csv(user_path, low_memory=False)
        .dropna(subset=["label"])
        .assign(label=lambda frame: frame["label"].astype(int))
    )

    # Count per class
    class_counts = user_df["label"].value_counts()
    print("Class counts:\n", class_counts)

    # Percentage per class
    class_percent = user_df["label"].value_counts(normalize=True) * 100
    print("\nClass percentage (%):\n", class_percent)

    fraud_users = user_df[user_df["label"] == 1]
    legit_users = user_df[user_df["label"] == 0]

    # --- Pick the requested number from each class, capped at what exists
    n_fraud = int(total_users * fraud_ratio)
    n_legit = total_users - n_fraud

    n_fraud = min(n_fraud, len(fraud_users))
    n_legit = min(n_legit, len(legit_users))

    fraud_sample = fraud_users.sample(n_fraud, random_state=random_state)
    legit_sample = legit_users.sample(n_legit, random_state=random_state)

    selected_users = pd.concat([fraud_sample, legit_sample])
    selected_user_ids = set(selected_users["phone_no_m"])

    print(f"✅ Selected {n_fraud} fraud + {n_legit} legit users → {len(selected_user_ids)} total users.")

    sampled_data = {"user": selected_users.copy()}

    # --- Load and filter other datasets by selected users
    for key in ["app", "sms", "voc"]:
        path = os.path.join(base_path, files[key])
        # low_memory=False infers each column's dtype from the whole file, so
        # large files do not end up with mixed-type columns (DtypeWarning).
        df = pd.read_csv(path, low_memory=False)
        before = len(df)
        df = df[df["phone_no_m"].isin(selected_user_ids)]
        after = len(df)
        pct = after / before * 100 if before > 0 else 0
        print(f"📂 {key.upper()} → {after}/{before} rows kept ({pct:.1f}%)")
        sampled_data[key] = df.copy()

    # --- Save to configured path in `sample_train_<name>.csv` format
    os.makedirs(save_path, exist_ok=True)

    for name, df in sampled_data.items():
        output_filename = f"sample_train_{name}.csv"
        output_path = os.path.join(save_path, output_filename)
        df.to_csv(output_path, index=False)
        print(f"💾 Saved {output_filename} → {len(df)} rows at {output_path}")

    return sampled_data


# Run

In [7]:
config = load_config("configs/baseline.yaml")

# Extract paths from config
sample_cfg = config["sample_data"]
full_data_path = sample_cfg["full_data"]
save_path = sample_cfg["save_path"]

# Run the sampling process. create_balanced_sample() creates `save_path` and
# writes sample_train_<name>.csv into it, so the files are NOT written a
# second time here (doing so re-serialised millions of rows to Drive).
sampled_data = create_balanced_sample(config)

# Per-file summary, with a check that each expected output really exists.
for name, df in sampled_data.items():
    out_file = os.path.join(save_path, f"sample_train_{name}.csv")
    assert os.path.isfile(out_file), f"expected sample file is missing: {out_file}"
    print(f"💾 Saved {out_file} → {len(df)} rows ({df['phone_no_m'].nunique()} users)")


Class counts:
 label
0    4144
1    1962
Name: count, dtype: int64

Class percentage (%):
 label
0    67.867671
1    32.132329
Name: proportion, dtype: float64
✅ Selected 300 fraud + 700 legit users → 1000 total users.
📂 APP → 557010/3283602 rows kept (17.0%)
📂 SMS → 1169295/6848509 rows kept (17.1%)


  df = pd.read_csv(path)


📂 VOC → 815751/5015430 rows kept (16.3%)
💾 Saved sample_train_user.csv → 1000 rows at /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_user.csv
💾 Saved sample_train_app.csv → 557010 rows at /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_app.csv
💾 Saved sample_train_sms.csv → 1169295 rows at /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_sms.csv
💾 Saved sample_train_voc.csv → 815751 rows at /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_voc.csv
💾 Saved /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_user.csv → 1000 rows (1000 users)
💾 Saved /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample

# Freeze requirements

In [8]:
# Write the installed package versions to a lock file inside the project.
# The target path is built by appending to `project_path`, which must end in "/".
%pip freeze > "{project_path}requirement/freez/sample_extract_requirements-lock.txt"
