# **Prerequisites**

## Mount Google Drive


In [28]:
# Mount Google Drive at /content/drive so the project files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Install packages


In [29]:
# Install the packages listed in the project's requirements.txt
# (-q: quiet output, --no-cache-dir: do not use pip's download cache).
%pip install -q -r /content/drive/MyDrive/Sem-6/coding/github/fraud_detection/requirements.txt --no-cache-dir
# Optional: uncomment the next line to terminate the kernel process so the
# runtime restarts and picks up the freshly installed libraries.
#import os; os._exit(0)   #  restart automatically

###### Restart the session after this cell on the first run so that the newly installed libraries are used

In [30]:
project_path = "/content/drive/MyDrive/Sem-6/coding/github/fraud_detection/"
%cd $project_path
%ls /content/drive/MyDrive/Sem-6/coding/github/fraud_detection

/content/drive/MyDrive/Sem-6/coding/github/fraud_detection
[0m[01;34mconfigs[0m/  [01;34mnotebooks[0m/  requirements-lock.txt  [01;34mresults[0m/           [01;34msrc[0m/
[01;34mdataset[0m/  README.md   requirements.txt       run_experiment.py  [01;34mtests[0m/


## Import libraries

In [44]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import font_manager as fm
import os
import pandas as pd
import yaml
import logging
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from ydata_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pandas as pd
import numpy as np
import datetime
import warnings
import re
import sweetviz as sv
import datetime
import datetime
import os


from sklearn.preprocessing import StandardScaler
# Turn on Colab's data-table formatter for DataFrame output.
from google.colab import data_table
data_table.enable_dataframe_formatter()

# Silence UserWarnings whose message starts with "Glyph ... missing from font"
# (presumably raised by matplotlib when the active font lacks a character,
# e.g. the Chinese labels in this dataset - confirm).
warnings.filterwarnings(
    "ignore",
    message="Glyph .* missing from font",  # regex pattern, passed as a string
    category=UserWarning
)
# Snapshot the currently installed package versions into requirements-lock.txt
# (written relative to the current working directory).

%pip freeze > requirements-lock.txt



#Basic Methods

## Logging

In [45]:

# Make sure the results directory exists before the FileHandler opens its log file.
os.makedirs("results", exist_ok=True)

# force=True matters in notebooks: basicConfig() silently does nothing when the
# root logger already has handlers, which is the usual state in a notebook
# kernel and always the case when this cell is re-run. Forcing removes and
# closes the existing root handlers first, so this configuration takes effect.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.StreamHandler(),
        # Explicit encoding so non-ASCII log content is written consistently
        # regardless of the platform locale.
        logging.FileHandler("results/data_prep.log", encoding="utf-8"),
    ],
    force=True,
)
logger = logging.getLogger(__name__)



##Config

In [46]:
def load_config(config_path="configs/baseline.yaml"):
    """Load a YAML configuration file and return its parsed content.

    Args:
        config_path: Path to the YAML file (relative paths resolve against
            the current working directory).

    Returns:
        Whatever ``yaml.safe_load`` produces for the document: a dict for the
        mapping-style configs this notebook reads, ``None`` for an empty file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(config_path, "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
    logger.info(f"Loaded config from {config_path}")
    return config

## CDR dataset

In [47]:
def load_cdr(file_path, nrows=None):
    """Read one CDR CSV file into a DataFrame.

    Args:
        file_path: Path of the CSV file to read.
        nrows: Maximum number of rows to read (sample mode); ``None`` reads
            the whole file.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    logger.info(f"Loading file: {file_path} with nrows={nrows}")
    frame = pd.read_csv(file_path, nrows=nrows)
    return frame


def load_all_data(config):
    """
    Read every CSV listed in the config and return them keyed by dataset name.

    Reads ``config["dataset"]["base_path"]`` and ``config["dataset"]["files"]``
    (a name -> filename mapping). When ``training.sample_size`` is present it
    is passed on as the row limit for each file; otherwise files are read in
    full.
    """
    dataset_cfg = config["dataset"]
    root_dir = dataset_cfg["base_path"]
    file_map = dataset_cfg["files"]
    row_limit = config.get("training", {}).get("sample_size", None)

    frames = {}
    for key in file_map:
        csv_path = os.path.join(root_dir, file_map[key])
        frames[key] = load_cdr(csv_path, nrows=row_limit)
        logger.info(f"Loaded {key} -> {frames[key].shape} from {csv_path}")
    return frames


#EDA

## Profiling methods

###ydata

In [48]:

def ydata_generate_profile(df, output_file="results/profile.html", mode="minimal", overwrite=True):
    """
    Generate a ydata-profiling HTML report for ``df``.

    Args:
        df: DataFrame to profile. It is not modified; columns are only dropped
            on a derived frame.
        output_file: Path of the HTML report to write.
        mode: "minimal" skips hard-to-profile columns and leaves the expensive
            analyses off. "full" profiles every column and enables
            correlations, interactions, the missing-values heatmap and
            duplicate detection. Any other value keeps every column with the
            expensive analyses off.
        overwrite: If True, an existing ``output_file`` is deleted first.

    Returns:
        ``output_file``.
    """
    full = mode == "full"

    # Initialised unconditionally so that non-minimal modes do not reach the
    # check below with an undefined name.
    drop_cols = []
    if mode == "minimal":
        for col in df.columns:
            series = df[col]
            if series.dtype == "object":
                # Text/categorical: skip very high-cardinality or long-text columns.
                if series.nunique() > 5000 or series.astype(str).str.len().mean() > 100:
                    drop_cols.append(col)
            elif series.nunique() <= 1:
                # Non-object column holding at most one distinct value.
                drop_cols.append(col)

    if drop_cols:
        print(f"Skipping columns for profiling: {drop_cols}")
        df = df.drop(columns=drop_cols)

    # The report writer needs the parent directory to exist.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Handle overwrite manually
    if overwrite and os.path.exists(output_file):
        os.remove(output_file)

    profile = ProfileReport(
        df,
        title="Fraud Detection EDA Report",
        explorative=True,
        plot={"wordcloud": False},
        correlations={
            "pearson": {"calculate": full},
            "spearman": {"calculate": full},
            "kendall": {"calculate": full},
            "phi_k": {"calculate": full},
            "cramers": {"calculate": full},
            "auto": {"calculate": full},
        },
        interactions={"continuous": full},
        missing_diagrams={"heatmap": full},
        duplicates={"calculate": full},
    )

    profile.to_file(output_file)  # no overwrite argument
    print(f"Profiling report saved → {output_file}")
    return output_file


###sweetviz

In [49]:
def generate_profile_sweetviz(df, output_file="results/profile_sweetviz.html", target=None):
    """
    Generate a Sweetviz HTML report for ``df``.

    Before analysis, object columns with more than 1000 distinct values and
    all datetime columns are dropped on a derived frame (``df`` itself is not
    modified). The target column is never dropped.

    Args:
        df: DataFrame to analyse.
        output_file: Path of the HTML report to write.
        target: Optional name of the target column, passed to Sweetviz as
            ``target_feat``.

    Returns:
        ``output_file``.
    """
    drop_cols = []
    for col in df.columns:
        if target is not None and col == target:
            # Dropping the target would make the analysis fail on a missing column.
            continue
        series = df[col]
        if series.dtype == "object":  # only check high-cardinality categoricals
            if series.nunique() > 1000:
                drop_cols.append(col)
        elif pd.api.types.is_datetime64_any_dtype(series):
            # Datetime may break plots. The check covers tz-aware and
            # non-nanosecond datetimes too, not just "datetime64[ns]".
            drop_cols.append(col)

    if drop_cols:
        print(f" Dropping columns for Sweetviz: {drop_cols}")
        df = df.drop(columns=drop_cols)

    report = sv.analyze(df, target_feat=target)
    report.show_html(output_file)
    print(f" Sweetviz report generated at {output_file}")
    return output_file


# Preprocessing

##App

In [50]:

def preprocess_app(df: pd.DataFrame) -> pd.DataFrame:
    """
    Preprocess the 'app' dataset and return a cleaned copy.

    The input frame is left untouched, so the function can be re-run on the
    same raw data. Each step is skipped when its column is absent.

    Fields:
      - phone_no_m: anonymized phone number (kept as ID, no transformation)
      - busi_name: application name; missing values become "Unknown"
      - flow: consumption; missing -> 0, negatives clipped to 0, and a
        log1p-transformed copy is added as ``flow_norm``
      - month_id: parsed with format "%Y-%m" into datetime (unparseable values
        become missing); gaps are filled per user (forward, then backward),
        then with the global median month; ``month_str`` holds the "%Y-%m"
        string form

    Returns:
        A new DataFrame with the cleaned columns, plus ``flow_norm`` and
        ``month_str`` when their source columns exist.
    """
    out = df.copy()

    # --- phone_no_m ---
    # Kept as is, will be used later for joins/merges

    # --- busi_name ---
    if "busi_name" in out.columns:
        out["busi_name"] = out["busi_name"].fillna("Unknown").astype(str)

    # --- flow ---
    if "flow" in out.columns:
        out["flow"] = out["flow"].fillna(0).clip(lower=0)  # no negative consumption
        # log1p tames the skew; applied to the whole column at once.
        out["flow_norm"] = np.log1p(out["flow"])

    # --- month_id ---
    # All month handling lives under this guard so a frame without the column
    # passes through instead of raising a KeyError.
    if "month_id" in out.columns:
        month = pd.to_datetime(out["month_id"], errors="coerce", format="%Y-%m")

        # Forward/backward fill per user (if phone_no_m exists). Group results
        # are used only to fill gaps: rows whose phone_no_m is missing belong
        # to no group and come back empty, so assigning the group result
        # directly would erase their valid months.
        if "phone_no_m" in out.columns:
            per_user = month.groupby(out["phone_no_m"])
            month = month.fillna(per_user.ffill()).fillna(per_user.bfill())

        # If still missing, fill with the global median month.
        if month.isna().any():
            month = month.fillna(month.dropna().median())

        out["month_id"] = month
        # Also keep a string version for categorical use.
        out["month_str"] = month.dt.strftime("%Y-%m")

    return out


##sms

In [57]:
def preprocess_sms(df):
    """
    Preprocess the 'sms' dataset and return a cleaned copy.

    ``calltype_id`` is converted to the pandas ``category`` dtype when the
    column is present. The input frame is not modified, so the raw data stays
    available for re-runs.
    """
    out = df.copy()
    if "calltype_id" in out.columns:
        out["calltype_id"] = out["calltype_id"].astype("category")
    return out


##user

In [52]:
def preprocess_user(df):
    """
    Preprocess the 'user' dataset and return a cleaned copy.

    Steps (each skipped when its column is absent):
      - city_name / county_name: missing values become "Unknown"
      - idcard_cnt: missing values filled with the column median
      - arpu_* columns: missing values filled with each column's median, then
        standardized with ``StandardScaler``
      - label: rows with a missing label are dropped

    Medians and scaling statistics are computed before the label filter, i.e.
    over all rows of the input. The input frame is not modified.

    Returns:
        A new DataFrame containing only rows with a label.
    """
    out = df.copy()

    for col in ("city_name", "county_name"):
        if col in out.columns:
            out[col] = out[col].fillna("Unknown")

    if "idcard_cnt" in out.columns:
        out["idcard_cnt"] = out["idcard_cnt"].fillna(out["idcard_cnt"].median())

    # Handle ARPU columns
    arpu_cols = [col for col in out.columns if col.startswith("arpu_")]
    if arpu_cols:
        out[arpu_cols] = out[arpu_cols].fillna(out[arpu_cols].median())
        # The scaler rejects input with zero rows, so only scale when there is data.
        if not out.empty:
            out[arpu_cols] = StandardScaler().fit_transform(out[arpu_cols])

    # Drop if label missing
    if "label" in out.columns:
        out = out.dropna(subset=["label"])
    return out


##Voice

In [53]:
def preprocess_voc(df):
    """
    Preprocess the 'voc' (voice call) dataset and return a cleaned copy.

    Steps (each skipped when its column is absent):
      - start_datetime: rows with a missing value are dropped
      - opposite_no_m, calltype_id, city_name, county_name, imei_m: missing
        values become "Unknown"
      - call_dur: missing values become 0, then the column is standardized
        with ``StandardScaler`` over the remaining rows

    The input frame is not modified.

    Returns:
        A new DataFrame with the cleaned columns.
    """
    # Drop missing datetime first. The explicit copy gives an independent
    # frame, so the assignments below neither touch the caller's data nor
    # trigger SettingWithCopyWarning.
    if "start_datetime" in df.columns:
        out = df.dropna(subset=["start_datetime"]).copy()
    else:
        out = df.copy()

    for col in ("opposite_no_m", "calltype_id", "city_name", "county_name", "imei_m"):
        if col in out.columns:
            out[col] = out[col].fillna("Unknown")

    # Handle call duration
    if "call_dur" in out.columns:
        out["call_dur"] = out["call_dur"].fillna(0)
        # The scaler rejects input with zero rows, so only scale when there is data.
        if not out.empty:
            scaled = StandardScaler().fit_transform(out[["call_dur"]])
            # fit_transform returns an (n, 1) array; flatten it for a single column.
            out["call_dur"] = np.asarray(scaled).ravel()

    return out


#Load data

In [54]:
# Parse the experiment configuration; later cells read its "dataset",
# "training" and "preprocessing" sections.
config = load_config("configs/baseline.yaml")

# Load raw data: a dict mapping each dataset name to its DataFrame.
data = load_all_data(config)

# One run timestamp, reused in the report and CSV filenames written by the
# cells below.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")



# Execute EDA
# ===============================


In [55]:

# Read once: these settings are the same for every dataset. Both sections are
# treated as optional, matching how load_all_data reads "training".
profiling_mode = config.get("training", {}).get("profiling_mode", "minimal")
preprocessing_cfg = config.get("preprocessing", {})

for name, df in data.items():

    # -------------------------------
    # Dataset-specific feature lists from the config (printed for reference;
    # only `target` is passed on, to Sweetviz)
    # -------------------------------
    dataset_cfg = preprocessing_cfg.get(name, {})

    numeric_features = dataset_cfg.get("numeric_features", [])
    categorical_features = dataset_cfg.get("categorical_features", [])
    datetime_features = dataset_cfg.get("datetime_features", [])
    target = dataset_cfg.get("target", None)

    # This cell only profiles the raw data, so the banner says so.
    print(f"\nProfiling {name}")
    print(f"   numeric: {numeric_features}")
    print(f"   categorical: {categorical_features}")
    print(f"   datetime: {datetime_features}")
    print(f"   target: {target}")

    # -------------------------------
    # Profiling with ydata_profiling
    # -------------------------------
    ydata_output = f"results/profile_{name}_{timestamp}.html"
    ydata_generate_profile(df, output_file=ydata_output, mode=profiling_mode)

    # -------------------------------
    # Profiling with Sweetviz
    # -------------------------------
    sweetviz_output = f"results/profile_{name}_sweetviz_{timestamp}.html"
    generate_profile_sweetviz(df, output_file=sweetviz_output, target=target)




Preprocessing app
   numeric: ['flow', 'month_id']
   categorical: ['busi_name']
   datetime: []
   target: None
Skipping columns for profiling: ['phone_no_m']


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/3 [00:00<?, ?it/s][A
100%|██████████| 3/3 [00:00<00:00, 28.84it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling report saved → results/profile_app_20250930_190126.html


                                             |          | [  0%]   00:00 -> (? left)

Report results/profile_app_sweetviz_20250930_190126.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
 Sweetviz report generated at results/profile_app_sweetviz_20250930_190126.html

Preprocessing sms
   numeric: []
   categorical: ['opposite_no_m', 'calltype_id']
   datetime: ['request_datetime']
   target: None
Skipping columns for profiling: ['phone_no_m', 'opposite_no_m']


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|██████████| 2/2 [00:00<00:00, 38.26it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling report saved → results/profile_sms_20250930_190126.html
 Dropping columns for Sweetviz: ['request_datetime']


                                             |          | [  0%]   00:00 -> (? left)

Report results/profile_sms_sweetviz_20250930_190126.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
 Sweetviz report generated at results/profile_sms_sweetviz_20250930_190126.html

Preprocessing user
   numeric: ['idcard_cnt', 'arpu_201908', 'arpu_201909', 'arpu_201910', 'arpu_201911', 'arpu_201912', 'arpu_202001', 'arpu_202002', 'arpu_202003']
   categorical: ['city_name', 'county_name']
   datetime: []
   target: label
Skipping columns for profiling: ['phone_no_m']


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/12 [00:00<?, ?it/s][A
 25%|██▌       | 3/12 [00:00<00:00, 25.03it/s][A
100%|██████████| 12/12 [00:00<00:00, 37.00it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling report saved → results/profile_user_20250930_190126.html
 Dropping columns for Sweetviz: ['phone_no_m']


                                             |          | [  0%]   00:00 -> (? left)

Report results/profile_user_sweetviz_20250930_190126.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
 Sweetviz report generated at results/profile_user_sweetviz_20250930_190126.html

Preprocessing voc
   numeric: ['call_dur']
   categorical: ['calltype_id', 'city_name', 'county_name', 'imei_m']
   datetime: ['start_datetime']
   target: None
Skipping columns for profiling: ['phone_no_m', 'opposite_no_m', 'city_name', 'county_name', 'imei_m']


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|██████████| 3/3 [00:00<00:00, 42.20it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling report saved → results/profile_voc_20250930_190126.html
 Dropping columns for Sweetviz: ['opposite_no_m', 'start_datetime']


                                             |          | [  0%]   00:00 -> (? left)

Report results/profile_voc_sweetviz_20250930_190126.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
 Sweetviz report generated at results/profile_voc_sweetviz_20250930_190126.html


# Execute preprocessing

In [58]:
# Dataset name -> preprocessing function. Only "app" and "sms" are enabled;
# add "user": preprocess_user and "voc": preprocess_voc here to include them.
preprocessors = {
    "app": preprocess_app,
    "sms": preprocess_sms,
}

# to_csv does not create missing directories, so make sure the target exists.
processed_dir = "dataset/CallChinses/processed"
os.makedirs(processed_dir, exist_ok=True)

for name, df in data.items():
    preprocess = preprocessors.get(name)
    if preprocess is None:
        continue  # no preprocessing enabled for this dataset

    print(f"\nPreprocessing {name}")
    cleaned = preprocess(df)
    cleaned.to_csv(f"{processed_dir}/cleaned_{name}_{timestamp}.csv", index=False)
    display(cleaned.head())





Preprocessing app


Unnamed: 0,phone_no_m,busi_name,flow,month_id,flow_norm,month_str
0,416cec0f25b93f08bfd9cff44382c5da3a6346beb16a2c...,Unknown,0.0,2019-10-01,0.0,2019-10
1,26fcb7c6f4125ee5445756d4ff5346c29f2aff7d0f2e31...,Unknown,0.0,2019-10-01,0.0,2019-10
2,1a58c82eeefdb00ea6abf2e6010a8e808b27e1802b764d...,Unknown,0.021529,2019-12-01,0.021301,2019-12
3,1a58c82eeefdb00ea6abf2e6010a8e808b27e1802b764d...,微信,0.001846,2019-12-01,0.001845,2019-12
4,1a58c82eeefdb00ea6abf2e6010a8e808b27e1802b764d...,高德导航,0.002288,2019-12-01,0.002285,2019-12



Preprocessing sms


Unnamed: 0,phone_no_m,opposite_no_m,calltype_id,request_datetime
0,0251387744988114430181c3e680a3733001a26fe686bd...,df22edbc0e3dd6bc4f2f453e687b743e8442a54834b64f...,2,2019-08-01 12:13:08
1,0251387744988114430181c3e680a3733001a26fe686bd...,df22edbc0e3dd6bc4f2f453e687b743e8442a54834b64f...,2,2019-08-01 12:13:08
2,0251387744988114430181c3e680a3733001a26fe686bd...,df22edbc0e3dd6bc4f2f453e687b743e8442a54834b64f...,2,2019-08-01 12:13:08
3,296cfae1d838070c4dd05a125a85c3d29bbb95f713c2ea...,1205bb229d750a6bcb3f9c33893b5d68c8fc8a6443b0a7...,2,2019-08-01 17:46:44
4,39de6ef3a87b8e660e42496450c54b731f3621ca708944...,972affd4be9e7596420379b7e7910843759cdfbd81315e...,2,2019-08-01 16:46:30
