# How to use target_merging.py

In [2]:
import pandas as pd
import numpy as np


1. Load the target dataframe

In [3]:
# Read the target table from target.csv (path is relative to the working directory).
target = pd.read_csv(filepath_or_buffer='target.csv')
# Bare last expression so the notebook renders the frame.
target

Unnamed: 0,APPROVAL_ID_INFO,APPLICATION_ID_INFO,FIRST_APPROVAL_ID_INFO,DECISION_CHAIN_ID_INFO,target
0,138217860,119743699,138217860,27253019,0.0
1,129199921,114818219,129199921,18154634,0.0
2,148414823,123437264,148414823,37436298,
3,135256719,118197979,135256719,24269570,0.0
4,145333119,122483029,145333119,34372022,
...,...,...,...,...,...
314252,126061074,113021896,126061074,14999720,0.0
314253,132364258,116610529,132364258,21345488,1.0
314254,148080435,123327742,148080435,37102622,0.0
314255,133950290,117468580,133950290,22946379,0.0


2. Suppose you have built a feature X.
3. Of course, you want feature X to align with the target. We do this by merging on ONE of the four ID columns ('APPROVAL_ID_INFO', 'APPLICATION_ID_INFO', 'FIRST_APPROVAL_ID_INFO', 'DECISION_CHAIN_ID_INFO'), whichever is available in your feature dataset.

In [6]:
def merge_on_column(
    result_df: pd.DataFrame,
    feature_df: pd.DataFrame,
    merge_col: str,
    target_col: str = 'target',
) -> pd.DataFrame:
    """
    Inner-merge result_df with feature_df on one shared ID column, then
    drop the rows that have no target value.

    Parameters
    ----------
    result_df : pd.DataFrame
        The main dataframe with ID columns and the target column.
    feature_df : pd.DataFrame
        The dataframe containing the same ID column plus feature column(s).
    merge_col : str
        One of ['APPROVAL_ID_INFO', 'APPLICATION_ID_INFO',
                'FIRST_APPROVAL_ID_INFO', 'DECISION_CHAIN_ID_INFO'].
        Must be present in both dataframes.
    target_col : str, default 'target'
        Name of the label column. Rows where this column is NaN are removed
        from the merged result.

    Returns
    -------
    pd.DataFrame
        The merged dataframe restricted to rows whose ``target_col`` is not
        NaN, with a fresh 0..n-1 index. NaN values in any other column
        (including feature columns) are kept.

    Raises
    ------
    ValueError
        If ``merge_col`` is not one of the four supported ID columns.
    KeyError
        If ``merge_col`` is missing from either dataframe, or ``target_col``
        is missing from the merged result.

    Notes
    -----
    This is a plain ``pd.merge``: an ID value that occurs several times in
    either dataframe produces one output row per matching pair.
    """
    valid_cols = [
        'APPROVAL_ID_INFO',
        'APPLICATION_ID_INFO',
        'FIRST_APPROVAL_ID_INFO',
        'DECISION_CHAIN_ID_INFO'
    ]

    if merge_col not in valid_cols:
        raise ValueError(f"merge_col must be one of {valid_cols}")

    # Fail early with a message that names the offending frame; pandas would
    # raise a bare KeyError that does not say which side lacks the key.
    # Index level names are accepted too, because pd.merge(on=...) allows them.
    for frame_name, frame in (('result_df', result_df), ('feature_df', feature_df)):
        if merge_col not in frame.columns and merge_col not in frame.index.names:
            raise KeyError(f"{frame_name} has no column named {merge_col!r}")

    # inner merge using the same column name in both DataFrames
    merged = pd.merge(result_df, feature_df, how='inner', on=merge_col)

    # Checked after the merge: if both inputs carry target_col, pandas renames
    # them with _x/_y suffixes and the plain name no longer exists.
    if target_col not in merged.columns:
        raise KeyError(
            f"target column {target_col!r} not found in the merged dataframe"
        )

    # Drop only rows where the target column is NaN; other NaNs are kept.
    merged = merged.dropna(subset=[target_col]).reset_index(drop=True)

    return merged


* Suppose your feature dataframe looks like this, and we pick "APPLICATION_ID_INFO" as the merge key

In [7]:
# Toy feature table, written row-wise: one (application id, feature value) pair per row.
feature_df = pd.DataFrame(
    [
        (119743699, 1),
        (114818219, 2),
        (123437264, 3),
        (118197979, 4),
        (122483029, 5),
    ],
    columns=["APPLICATION_ID_INFO", "feature"],
)

feature_df

Unnamed: 0,APPLICATION_ID_INFO,feature
0,119743699,1
1,114818219,2
2,123437264,3
3,118197979,4
4,122483029,5


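* Note that the merge_on_column function deletes rows where target is np.nan. In this example, the rows with APPLICATION_ID_INFO 123437264 and 122483029 have no target value, so only 3 of the 5 feature rows remain after the merge.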

In [10]:
# Align the toy features with the target table via the application id.
merged_df = merge_on_column(
    result_df=target,
    feature_df=feature_df,
    merge_col="APPLICATION_ID_INFO",
)
merged_df

Unnamed: 0,APPROVAL_ID_INFO,APPLICATION_ID_INFO,FIRST_APPROVAL_ID_INFO,DECISION_CHAIN_ID_INFO,target,feature
0,138217860,119743699,138217860,27253019,0.0,1
1,129199921,114818219,129199921,18154634,0.0,2
2,135256719,118197979,135256719,24269570,0.0,4
