# Train Conversion Model - Data Preparation
This notebook loads and preprocesses JSON data for Conversion model training.

In [18]:
# reload preprocessing_pipeline
import importlib
import preprocessing_pipeline
importlib.reload(preprocessing_pipeline)
import sys
sys.path.append('/home/azureuser/cloudfiles/code/Users/yangyi/gen4_Models')
from preprocessing_pipeline import save_csv_safe, add_target_label, stratified_split_by_date, process_high_cardinality_features, process_dates, convert_object_to_numeric, encode_categorical
import pandas as pd
import os
import json

In [11]:
# Path to folder containing unzipped JSONs
input_folder = '/home/azureuser/cloudfiles/code/Users/yangyi/gen4_training_data/Accepted_gen4Input_JSON_V1'
json_files = [f for f in os.listdir(input_folder) if f.endswith('.json')]

In [12]:
# Load JSONs, extract PortfolioID and Application_ID, and filter by application date
data_list = []
portfolio_ids = []
application_ids = []
for filename in json_files:
    parts = filename.split('_')
    if len(parts) >= 2:
        portfolio_id = parts[0]
        app_id = parts[1]
    else:
        continue  # skip files not matching pattern
    file_path = os.path.join(input_folder, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
            app_date = pd.to_datetime(data.get('i01_ApplicationDate', None), errors='coerce')
            if app_date is not pd.NaT and app_date >= pd.Timestamp('2025-03-01'):
                data_list.append(data)
                portfolio_ids.append(portfolio_id)
                application_ids.append(app_id)
        except Exception as e:
            print(f"Skipping {filename}: {e}")
df = pd.DataFrame(data_list)
df['PortfolioID'] = portfolio_ids
df['Application_ID'] = application_ids
print(f"Loaded {len(df)} rows with ApplicationDate >= 2025-03-01.")

Skipping DC_88932938_gen4_input_attributes.json: Expecting value: line 1 column 1 (char 0)
Skipping DC_88933066_gen4_input_attributes.json: Expecting value: line 1 column 1 (char 0)
Skipping DC_88933259_gen4_input_attributes.json: Expecting value: line 1 column 1 (char 0)
Skipping DC_88933372_gen4_input_attributes.json: Expecting value: line 1 column 1 (char 0)
Skipping DC_88933598_gen4_input_attributes.json: Expecting value: line 1 column 1 (char 0)
Skipping DC_88933653_gen4_input_attributes.json: Expecting value: line 1 column 1 (char 0)
Loaded 26551 rows with ApplicationDate >= 2025-03-01.
Loaded 26551 rows with ApplicationDate >= 2025-03-01.


In [13]:
path = '/home/azureuser/cloudfiles/code/Users/yangyi/gen4_training_data/raw_accepted_202503-07.csv'
save_csv_safe(df, path)

In [None]:
target_path = '/home/azureuser/cloudfiles/code/Users/yangyi/gen4_Models/gen4_Conv/gen4_Conv_target.csv'
target_col = 'Originated'
data = add_target_label(df, target_path, target_col=target_col)
save_csv_safe(data, '/home/azureuser/cloudfiles/code/Users/yangyi/gen4_training_data/raw_accepted_202503-07_with_target.csv')

Null target rows: 54 (0.20%)


In [23]:
test_dates = data['i01_ApplicationDate']

print("Min date:", test_dates.min())
print("Max date:", test_dates.max())

Min date: 2025-03-10 12:43:30
Max date: 2025-06-30 19:59:46


In [21]:
train, test = stratified_split_by_date(data, date_col='i01_ApplicationDate', split_date='2025-06-01')
# save raw_train, raw_test
print(train.shape)
print(test.shape)
display(train.head(5))
display(test.head(5))

(21560, 1337)
(4937, 1337)


Unnamed: 0,i01_ApplicationDate,i01_DOB,i01_Frequency,i01_NetMonthlyIncome,i01_State,i01_NextPayDate,i01_IsHomeOwner,i01_BankAccountNum,i01_BankABA,i01_City,...,v02_Days_Since_First_Activity,v02_Days_since_Last_Fraud_Transaction,v02_Days_Since_Last_Activity,v02_Count_of_Account_Openings,v02_Sum_of_Approved_Tranaction_Last_7D,v02_Days_Since_Last_Decline_Payment,v02_Sum_of_Approved_Check_Transaction_Last_30D,PortfolioID,Application_ID,Originated
0,2025-03-10 12:43:30,1952-10-06 00:00:00,S,6500.0,LA,NoMatch,1,2542538,65205264,Haughton,...,0,7127,0,0,25,0,25,7,100000048,0.0
1,2025-03-10 12:43:54,1946-05-23 00:00:00,M,1500.0,WI,NoMatch,1,675825,75000022,New London,...,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,7,100000080,0.0
2,2025-03-10 12:56:59,1992-01-27 00:00:00,B,3791.669921875,MO,NoMatch,1,4755442374,43002900,Kansas City,...,0,0,0,0,25,0,25,7,100000936,0.0
3,2025-03-10 12:57:59,1971-01-15 00:00:00,B,13000.0,TX,NoMatch,2,4027248,113125995,Crane,...,427,5428,5428,0,25,0,25,7,100001008,1.0
4,2025-03-10 12:58:07,1969-09-21 00:00:00,B,9750.0,FL,NoMatch,1,400603739,21000021,Miramar,...,4198,4199,4199,0,25,0,25,7,100001015,0.0


Unnamed: 0,i01_ApplicationDate,i01_DOB,i01_Frequency,i01_NetMonthlyIncome,i01_State,i01_NextPayDate,i01_IsHomeOwner,i01_BankAccountNum,i01_BankABA,i01_City,...,v02_Days_Since_First_Activity,v02_Days_since_Last_Fraud_Transaction,v02_Days_Since_Last_Activity,v02_Count_of_Account_Openings,v02_Sum_of_Approved_Tranaction_Last_7D,v02_Days_Since_Last_Decline_Payment,v02_Sum_of_Approved_Check_Transaction_Last_30D,PortfolioID,Application_ID,Originated
0,2025-06-01 02:47:09,1991-10-18 00:00:00,B,3618.330078125,MI,NoMatch,2,30410862,272078365,Macomb,...,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,7,104346013,0.0
1,2025-06-01 05:25:13,1961-10-13 00:00:00,B,3501.330078125,FL,NoMatch,2,7433755514,67091719,Bonita Springs,...,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,7,104346014,0.0
2,2025-06-01 08:27:12,1962-07-01 00:00:00,B,3466.669921875,MI,NoMatch,4,7997527283,41200050,Lambertville,...,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,7,104346016,0.0
3,2025-06-01 10:21:29,1981-09-22 00:00:00,B,3501.330078125,IL,NoMatch,2,92040829,271187875,Peoria Heights,...,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,7,104346020,0.0
4,2025-06-01 17:04:55,1976-04-13 00:00:00,W,2499.989990234375,IN,NoMatch,2,5315840941,42200910,Butlerville,...,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,NoMatch,7,104346032,0.0


In [22]:
save_csv_safe(train, '/home/azureuser/cloudfiles/code/Users/yangyi/gen4_training_data/raw_accepted_train_202503-07.csv')
save_csv_safe(test, '/home/azureuser/cloudfiles/code/Users/yangyi/gen4_training_data/raw_accepted_test_202503-07.csv')

In [None]:
# Preprocess features
df, dropped_high_cardinality = process_high_cardinality_features(df)
df, dropped_dates = process_dates(df)
df, numeric_dict = convert_object_to_numeric(df)
df, categorical_dict = encode_categorical(df)
print("Preprocessing complete.")

In [None]:
# Save processed data for model training
output_path = '/home/azureuser/cloudfiles/code/gen4_training_data/conversion_train_processed.csv'
df.to_csv(output_path, index=False)
print(f"Processed data saved to {output_path}")