
# **Mount** Google Drive
# ===============================


In [11]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).



# Install
# ===============================


In [12]:

#!ls /content/drive/MyDrive/Sem-6/coding/github/fraud_detection
project_path = "/content/drive/MyDrive/Sem-6/coding/github/fraud_detection/"
%cd $project_path
#!pip install -r requirements.txt
!pip install -r requirements.txt --upgrade


/content/drive/MyDrive/Sem-6/coding/github/fraud_detection
Collecting pandas<3.0.0,>=2.0.0 (from -r requirements.txt (line 1))
  Using cached pandas-2.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting tsfresh (from -r requirements.txt (line 9))
  Using cached tsfresh-0.21.1-py2.py3-none-any.whl.metadata (2.6 kB)
INFO: pip is looking at multiple versions of tsfresh to determine which version is compatible with other requirements. This could take a while.
  Using cached tsfresh-0.21.0-py2.py3-none-any.whl.metadata (2.6 kB)
  Using cached tsfresh-0.20.3-py2.py3-none-any.whl.metadata (2.6 kB)


## Imports

In [13]:
import os
import pandas as pd
import yaml
import logging
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np


# Helper Code

# Logging

In [14]:
# Make sure results directory exists
os.makedirs("results", exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("results/data_prep.log")
    ]
)
logger = logging.getLogger(__name__)



## Load config

In [15]:
def load_config(config_path="configs/baseline.yaml"):
    """Load a YAML config file and return its parsed contents.

    Parameters
    ----------
    config_path : str
        Path to the YAML file. Defaults to "configs/baseline.yaml".

    Returns
    -------
    dict
        The parsed configuration (a dict for a mapping-style YAML document).
        An empty file, which ``yaml.safe_load`` parses as ``None``, yields an
        empty dict so callers can always use ``.get`` / indexing on the result.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's locale encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # An empty document parses to None; normalise it to an empty mapping.
    if config is None:
        config = {}

    logger.info(f"Loaded config from {config_path}")
    return config

## Load CDRs

In [16]:
def load_cdr(file_path, nrows=None):
    """
    Read a single CDR CSV file into a DataFrame.

    ``nrows`` caps how many rows are read (sample mode); ``None`` reads the
    whole file.
    """
    logger.info(f"Loading file: {file_path} with nrows={nrows}")
    frame = pd.read_csv(file_path, nrows=nrows)
    return frame


def load_all_data(config):
    """
    Load all CSVs defined in config into a dict of DataFrames.

    Parameters
    ----------
    config : dict
        Must contain ``config["dataset"]["base_path"]`` (directory holding the
        CSVs) and ``config["dataset"]["files"]`` (mapping of dataset name ->
        file name). ``config["training"]["sample_size"]`` is optional; when
        set, it is passed as the row limit for every file.

    Returns
    -------
    dict
        Mapping of dataset name -> loaded DataFrame, in the order given by
        ``config["dataset"]["files"]``.

    Raises
    ------
    KeyError
        If the ``dataset`` section or its ``base_path`` / ``files`` keys are missing.
    """
    dataset_cfg = config["dataset"]
    base = dataset_cfg["base_path"]
    files = dataset_cfg["files"]

    # The `training` section may be absent, or present but empty (which YAML
    # parses as None); treat both as "no sample limit" instead of crashing.
    sample_size = (config.get("training") or {}).get("sample_size")

    data = {}
    for name, fname in files.items():
        path = os.path.join(base, fname)
        df = load_cdr(path, nrows=sample_size)
        data[name] = df
        logger.info(f"Loaded {name} -> {df.shape} from {path}")
    return data


## Build features

In [17]:

def build_feature_table(data: dict):
    """
    Merge APP, SMS, USER, VOC datasets on phone_no_m into a single feature table.

    The APP, SMS and VOC tables are aggregated per phone_no_m; the USER table
    is de-duplicated on phone_no_m and carried over as-is. The parts are then
    outer-merged on phone_no_m and all missing values are filled with 0.

    Parameters
    ----------
    data : dict
        Dictionary of raw DataFrames, e.g.
        {"app": df_app, "sms": df_sms, "user": df_user, "voc": df_voc}.
        Any of the four keys may be absent; other keys are ignored.
        The input frames are not modified.

    Returns
    -------
    feature_df : pd.DataFrame
        Merged feature table with one row per phone_no_m. If none of the four
        known keys is present, an empty frame with only a ``phone_no_m``
        column is returned.
    """
    key = "phone_no_m"

    # Named aggregations for the event-level tables: output column -> (source column, func).
    agg_specs = {
        "app": {
            "app_count": ("busi_name", "nunique"),
            "total_flow": ("flow", "sum"),
            "avg_flow": ("flow", "mean"),
        },
        "sms": {
            "sms_count": ("opposite_no_m", "count"),
            "unique_contacts": ("opposite_no_m", "nunique"),
        },
        "voc": {
            "call_count": ("opposite_no_m", "count"),
            "unique_callers": ("opposite_no_m", "nunique"),
            "avg_call_dur": ("call_dur", "mean"),
            "total_call_dur": ("call_dur", "sum"),
        },
    }

    # Fixed processing order so the column order of the result is stable.
    feature_parts = []
    for name in ("app", "sms", "user", "voc"):
        if name not in data:
            continue
        raw = data[name]
        if name == "user":
            # Static profile: keep the first row seen for each phone number.
            part = raw.drop_duplicates(subset=[key])
        else:
            # groupby/agg builds a new frame, so no defensive copy of `raw` is needed.
            part = raw.groupby(key).agg(**agg_specs[name]).reset_index()
        feature_parts.append(part)

    # Nothing to merge: return an empty table rather than failing on an empty reduction.
    if not feature_parts:
        return pd.DataFrame(columns=[key])

    # Outer-merge so a phone number present in any table is kept.
    feature_df = feature_parts[0]
    for part in feature_parts[1:]:
        feature_df = pd.merge(feature_df, part, on=key, how="outer")

    # Phones missing from a table get 0 for that table's features.
    # NOTE(review): this also zero-fills any non-numeric USER profile columns —
    # confirm that is intended.
    feature_df = feature_df.fillna(0)

    return feature_df


## Generate profile

In [18]:
def generate_profile(df, output_file="results/profile.html",
                     max_unique=500, max_mean_len=50):
    """
    Safe profiling report generator without correlation/statistical crashes.

    Columns that are high-cardinality or text-heavy are dropped before
    profiling, and all correlation, interaction, word-cloud and
    missing-value-heatmap computations are disabled.

    Parameters
    ----------
    df : pd.DataFrame
        Table to profile. It is not modified; dropped columns are removed
        from a derived frame only.
    output_file : str
        Path of the HTML report to write. Its parent directory is created
        if it does not exist.
    max_unique : int
        Columns with more than this many distinct values are skipped.
    max_mean_len : int or float
        Columns whose values, rendered as strings, have a mean length above
        this are skipped.

    Returns
    -------
    str
        ``output_file``, unchanged.
    """
    # Drop high-cardinality or text-heavy columns
    drop_cols = [
        col for col in df.columns
        if df[col].nunique() > max_unique
        or df[col].astype(str).str.len().mean() > max_mean_len
    ]
    if drop_cols:
        print(f"⚠️ Skipping columns for profiling: {drop_cols}")
        df = df.drop(columns=drop_cols)

    # Make sure the destination directory exists before the report is written.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    profile = ProfileReport(
        df,
        title="Fraud Detection EDA Report",
        explorative=True,
        plot={"wordcloud": False},
        # Disable every correlation method
        correlations={
            "pearson": {"calculate": False},
            "spearman": {"calculate": False},
            "kendall": {"calculate": False},
            "phi_k": {"calculate": False},
            "cramers": {"calculate": False},
            "auto": {"calculate": False},
        },
        interactions={"continuous": False},
        missing_diagrams={"heatmap": False}
    )

    profile.to_file(output_file)
    print(f"✅ Profiling report saved → {output_file}")
    return output_file


## Preprocess features

In [19]:
def preprocess_features(df, numeric_cols, categorical_cols):
    """
    Scale numeric features and one-hot encode categorical features.

    Ensures categorical columns are converted to string type before encoding.
    Compatible with sklearn <1.2 and >=1.2.

    Parameters
    ----------
    df : pd.DataFrame
        Input feature table. It is NOT modified; a copy is transformed.
    numeric_cols : list of str or None
        Columns to standardise with ``StandardScaler``. Empty/None skips scaling.
    categorical_cols : list of str or None
        Columns to one-hot encode. Empty/None skips encoding.

    Returns
    -------
    pd.DataFrame
        New frame with the numeric columns scaled, the categorical columns
        removed, and the one-hot encoded columns appended on the right.
    """
    numeric_cols = list(numeric_cols or [])
    categorical_cols = list(categorical_cols or [])

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Scale numeric (skipped when there is nothing to scale, since fitting on
    # zero columns is an error)
    if numeric_cols:
        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    if not categorical_cols:
        return df

    # Ensure categorical cols are all strings
    df[categorical_cols] = df[categorical_cols].astype(str)

    # OneHotEncoder compatibility: the dense-output keyword was renamed
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")  # sklearn >=1.2
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")         # sklearn <1.2

    encoded = encoder.fit_transform(df[categorical_cols])
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_cols),
        index=df.index  # keep row alignment with the source frame for the concat below
    )

    # Combine: original columns minus the raw categoricals, plus the encoded ones
    return pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

#


# Load config +  dataset
# ===============================


In [21]:
# Read the pipeline configuration (dataset paths, optional sample size, preprocessing lists)
config = load_config("configs/baseline.yaml")

# Load raw data: one DataFrame per file listed under config["dataset"]["files"]
data = load_all_data(config)

# Build feature table: per-phone aggregates merged on phone_no_m
feature_df = build_feature_table(data)

# Write an HTML profiling report of the merged feature table
generate_profile(feature_df, output_file="results/feature_profile.html")

# Column lists for preprocessing, taken from the config
numeric = config["preprocessing"]["numeric_features"]
categorical = config["preprocessing"]["categorical_features"]

# NOTE: the preprocessing step below is currently disabled (left commented out).
# if numeric or categorical:
#     feature_df = preprocess_features(feature_df, numeric, categorical)
#     print(f"✅ Preprocessed features: {feature_df.shape}")
#     display(feature_df.head())


⚠️ Skipping columns for profiling: ['phone_no_m']


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/13 [00:00<?, ?it/s][A
100%|██████████| 13/13 [00:00<00:00, 57.27it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Profiling report saved → results/feature_profile.html
