#**Pre-request**

##Mount google drive


In [17]:
### **Mount** Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##Install pakages


In [18]:
project_path = "/content/drive/MyDrive/Sem-6/coding/github/fraud_detection/"
%cd $project_path
%ls /content/drive/MyDrive/Sem-6/coding/github/fraud_detection

/content/drive/MyDrive/Sem-6/coding/github/fraud_detection
[0m[01;34mconfigs[0m/                       requirements-lock.txt
[01;34mdataset[0m/                       requirements.txt
Extract_requirements-lock.txt  [01;34mresults[0m/
Extract_requirements.txt       run_experiment.py
[01;34mfeatures[0m/                      sample_extract_requirements-lock.txt
[01;34mnotebooks[0m/                     [01;34msrc[0m/
README.md                      [01;34mtests[0m/


##Import  libs

In [19]:

import datetime
import os
import pandas as pd
import numpy as np
from scipy.stats import mode
import yaml
import logging
from tqdm import tqdm
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import altair as alt
from google.colab import data_table
data_table.enable_dataframe_formatter()
import seaborn as sns
import matplotlib.pyplot as plt




%pip freeze > sample_extract_requirements-lock.txt



#Basic Methods

##Loging

In [20]:

# Make sure results directory exists
os.makedirs("results", exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("results/sample_data_extract.log")
    ]
)
logger = logging.getLogger(__name__)



##Config

In [21]:
def load_config(config_path="configs/baseline.yaml"):
    """Load YAML config file"""
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    logger.info(f"Loaded config from {config_path}")
    return config

## CDR dataset

In [22]:
def load_cdr(file_path, nrows=None):
    """Load a CSV file and safely parse datetime columns."""
    logger.info(f"Loading file: {file_path} (nrows={nrows})")
    df = pd.read_csv(file_path, nrows=nrows)

    # Auto-detect and parse datetime columns
    for col in df.columns:
        if "datetime" in col.lower():
            df[col] = pd.to_datetime(df[col], errors="coerce")

    # Optional: strip extra spaces in headers
    df.columns = df.columns.str.strip()
    return df



def load_all_data(config):
    """
    Load all CSVs defined in config into a dict of DataFrames.
    Uses sample_data.size if available.
    """
    base = config["dataset"]["base_path"]
    files = config["dataset"]["files"]
    sample_size = config.get("sample_data", {}).get("size", None)

    data = {}
    for name, fname in files.items():
        path = os.path.join(base, fname)
        df = load_cdr(path, nrows=sample_size)
        data[name] = df
        logger.info(f"Loaded {name} -> {df.shape} from {path}")
    return data


In [23]:
def create_balanced_sample(config, random_state=42):
    """
    Create a balanced sample dataset across app, sms, voc, user files.
    Selects N fraud users and N legit users, and extracts *all their records*
    from every dataset.
    """

    # ✅ Read from 'sample_data' section instead of 'dataset'
    sample_cfg = config["sample_data"]
    base_path = sample_cfg["full_data"]
    files = sample_cfg["files"]
    sample_size = sample_cfg.get("size", 100)
    save_path = sample_cfg.get(
        "save_path",
        os.path.join(base_path, "sampled")
    )

    # --- Load user dataset first (must contain label)
    user_path = os.path.join(base_path, files["user"])
    user_df = pd.read_csv(user_path)
    user_df = user_df.dropna(subset=["label"])
    user_df["label"] = user_df["label"].astype(int)

    fraud_users = user_df[user_df["label"] == 1]
    legit_users = user_df[user_df["label"] == 0]

    # --- Pick N from each class (balanced)
    n = min(sample_size, len(fraud_users), len(legit_users))
    fraud_sample = fraud_users.sample(n, random_state=random_state)
    legit_sample = legit_users.sample(n, random_state=random_state)

    selected_users = pd.concat([fraud_sample, legit_sample])
    selected_user_ids = set(selected_users["phone_no_m"])

    print(f"✅ Selected {n} fraud + {n} legit users → {len(selected_user_ids)} total users.")

    sampled_data = {"user": selected_users.copy()}

    # --- Load and filter other datasets by selected users
    for key in ["app", "sms", "voc"]:
        path = os.path.join(base_path, files[key])
        df = pd.read_csv(path)
        before = len(df)
        df = df[df["phone_no_m"].isin(selected_user_ids)]
        after = len(df)
        pct = after / before * 100 if before > 0 else 0
        print(f"📂 {key.upper()} → {after}/{before} rows kept ({pct:.1f}%)")
        sampled_data[key] = df.copy()

    # --- Save to configured path in `sample_train_<name>.csv` format
    os.makedirs(save_path, exist_ok=True)

    for name, df in sampled_data.items():
        output_filename = f"sample_train_{name}.csv"
        output_path = os.path.join(save_path, output_filename)
        df.to_csv(output_path, index=False)
        print(f"💾 Saved {output_filename} → {len(df)} rows at {output_path}")

    return sampled_data


In [24]:
config = load_config("configs/baseline.yaml")

# Extract paths from config
sample_cfg = config["sample_data"]
full_data_path = sample_cfg["full_data"]
save_path = sample_cfg["save_path"]

# Create directories if needed
os.makedirs(save_path, exist_ok=True)

# Run the sampling process
sampled_data = create_balanced_sample(config)

# Save outputs
for name, df in sampled_data.items():
    out_file = os.path.join(save_path, f"sample_train_{name}.csv")
    df.to_csv(out_file, index=False)
    print(f"💾 Saved {out_file} → {len(df)} rows ({df['phone_no_m'].nunique()} users)")


✅ Selected 30 fraud + 30 legit users → 60 total users.
📂 APP → 26142/3283602 rows kept (0.8%)
📂 SMS → 64225/6848509 rows kept (0.9%)


  df = pd.read_csv(path)


📂 VOC → 48190/5015430 rows kept (1.0%)
💾 Saved sample_train_user.csv → 60 rows at /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_user.csv
💾 Saved sample_train_app.csv → 26142 rows at /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_app.csv
💾 Saved sample_train_sms.csv → 64225 rows at /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_sms.csv
💾 Saved sample_train_voc.csv → 48190 rows at /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_voc.csv
💾 Saved /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_user.csv → 60 rows (60 users)
💾 Saved /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/dataset/CallChinses/raw/sample/sampled/sample_train_app.csv → 26142 rows (60 users)
💾 Save