<a href="https://colab.research.google.com/github/Jolayemi-momoh/miniature-bassoon/blob/main/feature_engineering_03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV paths used by the
# following cells live under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the cleaned train and test CSVs from the mounted Drive folder.
train_csv_path = '/content/drive/MyDrive/dataorg-financial-health-prediction-challenge20251204-19827-m2tn1n/cleaned_train.csv'
test_csv_path = '/content/drive/MyDrive/dataorg-financial-health-prediction-challenge20251204-19827-m2tn1n/cleaned_test.csv'
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path)
)

In [None]:
# Digital finance score: for each row, count how many of the columns below
# hold an answer containing "yes" (case-insensitive). Missing answers are
# treated as "not yes", so the score lies between 0 and len(FI_columns).
FI_columns = ['has_mobile_money', 'has_internet_banking', 'has_cellphone']


def _answered_yes(column):
    """Return a boolean Series that is True where the answer contains 'yes'."""
    return column.str.contains('yes', case=False, na=False)


# The same rule is applied to both frames so train and test stay aligned.
for frame in (train_df, test_df):
    yes_mask = frame[FI_columns].apply(_answered_yes)
    frame['digital_finance_score'] = yes_mask.sum(axis=1)

In [None]:
# Financial management score = keeps_records_flag + cash_flow_flag, where
#   keeps_records_flag is 1 when 'keeps_financial_records' contains "Yes", else 0
#   cash_flow_flag is -1 when 'current_problem_cash_flow' contains "Yes", else 0
# Matching is case-insensitive and missing answers map to 0, so the score is
# one of -1, 0 or 1. The two helper flag columns stay on the frames here;
# later cells drop them.
for frame in (train_df, test_df):
    keeps_records = frame['keeps_financial_records'].str.contains('Yes', case=False, na=False)
    has_cash_flow_problem = frame['current_problem_cash_flow'].str.contains('Yes', case=False, na=False)

    frame['keeps_records_flag'] = np.where(keeps_records, 1, 0)
    frame['cash_flow_flag'] = np.where(has_cash_flow_problem, -1, 0)
    frame['financial_management_score'] = frame['keeps_records_flag'] + frame['cash_flow_flag']

In [None]:
# Insurance score: number of insurance questions whose answer contains "yes"
# (case-insensitive). Missing answers contribute nothing to the count.
insurance_cols = [
    'has_insurance',
    'medical_insurance',
    'funeral_insurance',
    'motor_vehicle_insurance',
]

for frame in (train_df, test_df):
    insured_mask = frame[insurance_cols].apply(
        lambda answers: answers.str.contains('yes', case=False, na=False)
    )
    frame['insurance_score'] = insured_mask.sum(axis=1)

In [None]:
# Operational profitability: (turnover - expenses) / (turnover + 0.01).
# The 0.01 keeps the denominator non-zero when turnover is exactly 0.
for frame in (train_df, test_df):
    turnover = frame['business_turnover_usd']
    expenses = frame['business_expenses_usd']
    frame['operational_score'] = (turnover - expenses) / (turnover + 0.01)

In [None]:
# The two intermediate flags are already folded into
# 'financial_management_score', so remove them from the train frame.
# Re-running this cell raises KeyError because the columns are then gone.
train_df.drop(['keeps_records_flag', 'cash_flow_flag'], axis=1, inplace=True)

In [None]:
# Remove the same two intermediate flag columns from the test frame so it
# keeps the same engineered columns as the train frame.
test_df.drop(['keeps_records_flag', 'cash_flow_flag'], axis=1, inplace=True)

In [None]:
# Print the test frame's columns, non-null counts and dtypes for inspection.
test_df.info()

In [None]:
# Informal credit reliance: number of the columns below whose answer contains
# "Yes" or "True" (case-insensitive). Missing answers count as 0.
informal_cols = ['uses_informal_lender', 'uses_friends_family_savings', 'problem_sourcing_money']

# One shared pattern for both frames. The test frame previously used
# 'Yes| True' (with a leading space), which only matches "True" when it is
# preceded by a space, so a bare "True" was counted for train but not for test.
informal_yes_pattern = 'Yes|True'

for frame in (train_df, test_df):
    relies_on_informal = frame[informal_cols].apply(
        lambda answers: answers.str.contains(informal_yes_pattern, case=False, na=False, regex=True)
    )
    frame['informal_credit_reliance'] = relies_on_informal.sum(axis=1)

In [None]:
# Optimism score (train): number of attitude questions answered positively,
# i.e. the answer contains "Yes", "Agree" or "Strongly Agree"
# (case-insensitive). Missing answers count as 0.
# NOTE(review): this is a substring match, so "Agree" would also match a value
# such as "Disagree" if the columns contain one -- confirm against the actual
# answer categories.
optimism_cols = [
    'attitude_more_successful_next_year',
    'attitude_satisfied_with_achievement',
    'attitude_stable_business_environment',
]
positive_answers = train_df[optimism_cols].apply(
    lambda answers: answers.str.contains('Yes|Agree|Strongly Agree', case=False, na=False, regex=True)
)
train_df['optimism_score'] = positive_answers.sum(axis=1)

In [None]:
# Optimism score (test): must use exactly the pattern applied to the train
# frame, 'Yes|Agree|Strongly Agree'. The previous 'Strobgly Agree' alternative
# was a typo; results were only unaffected because the 'Agree' alternative
# already matches "Strongly Agree".
test_positive_answers = test_df[optimism_cols].apply(
    lambda answers: answers.str.contains('Yes|Agree|Strongly Agree', case=False, na=False, regex=True)
)
test_df['optimism_score'] = test_positive_answers.sum(axis=1)

In [None]:
# Penalize pessimism (train): fear_flag is 1 when 'attitude_worried_shutdown'
# contains "Yes" or "Agree" (case-insensitive, missing -> 0). The confidence
# index is the optimism score minus that flag.
train_worried = train_df['attitude_worried_shutdown'].str.contains(
    'Yes|Agree', case=False, na=False, regex=True
)
train_df['fear_flag'] = np.where(train_worried, 1, 0)
train_df['founder_confidence_index'] = train_df['optimism_score'] - train_df['fear_flag']

In [None]:
# Penalize pessimism (test): same rule as for the train frame. fear_flag is 1
# when 'attitude_worried_shutdown' contains "Yes" or "Agree", else 0, and it
# is subtracted from the optimism score.
test_worried = test_df['attitude_worried_shutdown'].str.contains(
    'Yes|Agree', case=False, na=False, regex=True
)
test_df['fear_flag'] = np.where(test_worried, 1, 0)
test_df['founder_confidence_index'] = test_df['optimism_score'] - test_df['fear_flag']

In [None]:
# fear_flag is already folded into 'founder_confidence_index'; remove the
# intermediate column from both frames.
for frame in (train_df, test_df):
    frame.drop(columns=['fear_flag'], inplace=True)

In [None]:
# Impute missing values in the test frame:
#   - numeric columns are filled with the median computed on the TRAIN frame,
#     so no statistic is derived from the test data;
#   - object (string) columns are filled with the literal "Unknown".
num_cols = ['owner_age', 'business_age', 'personal_income_usd', 'business_expenses_usd', 'business_turnover_usd']
train_medians = train_df[num_cols].median()
cat_cols = test_df.select_dtypes(include='object').columns
for col in num_cols:
    test_df[col] = test_df[col].fillna(train_medians[col])
for col in cat_cols:
    test_df[col] = test_df[col].fillna("Unknown")

# 'operational_score' was derived from turnover/expenses in an earlier cell,
# i.e. before the imputation above, so any row with a missing input still
# carries NaN in the derived feature. Recompute it from the imputed columns.
# The formula must mirror the one in the operational-profitability cell; rows
# without missing inputs get exactly the same value as before.
if 'operational_score' in test_df.columns:
    imputed_turnover = test_df['business_turnover_usd']
    imputed_expenses = test_df['business_expenses_usd']
    test_df['operational_score'] = (imputed_turnover - imputed_expenses) / (imputed_turnover + 0.01)


In [None]:
# Separate the label from the features. X_test is a copy, so later edits to
# it do not write back into test_df.
target_col = 'Target'
X_train = train_df.drop(target_col, axis=1)
y_train = train_df[target_col]
X_test = test_df.copy()

In [None]:
# One-hot encode the categorical columns.
# Train: drop the first level of every categorical column to avoid a
# redundant (perfectly collinear) dummy.
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# Test: keep ALL levels here. drop_first picks the first level present in the
# frame being encoded; if the test frame lacks train's first level (or has an
# extra one that sorts earlier), a different level would be dropped and, once
# the columns are re-aligned to the train layout, be zero-filled -- silently
# turning those rows into the baseline category. The later reindex against
# X_train_encoded.columns removes whatever train dropped or never saw.
X_test_encoded = pd.get_dummies(X_test, drop_first=False)

In [None]:
# Give the test matrix exactly the train matrix's columns, in the same order:
# columns missing from test are added filled with 0, and test-only columns
# are discarded.
train_feature_columns = X_train_encoded.columns
X_test_encoded = X_test_encoded.reindex(train_feature_columns, axis='columns', fill_value=0)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for validation. The split is stratified
# on the target and uses a fixed seed so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y_train)
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_encoded, y_train, **split_options
)

In [None]:
from sklearn.preprocessing import StandardScaler
# Unfitted scaler; a later cell fits it on the training split only and then
# reuses it to transform the validation and test features.
scaler = StandardScaler()

In [None]:
# Standardize the raw numeric columns. The scaler is fitted on the training
# split only, so validation and test data do not influence its statistics.
num_cols_to_scale = [
    'owner_age', 'business_age',
    'personal_income_usd', 'business_expenses_usd', 'business_turnover_usd',
]
train_numeric = X_train_final[num_cols_to_scale]
X_train_final[num_cols_to_scale] = scaler.fit(train_numeric).transform(train_numeric)

In [None]:
# Apply the already-fitted scaler (no refit) to the validation and test
# features so they share the training split's scaling.
for features in (X_val, X_test_encoded):
    features[num_cols_to_scale] = scaler.transform(features[num_cols_to_scale])

In [None]:
# Write the scaled train/validation feature matrices to the Drive folder,
# without the index column.
folder_path = '/content/drive/MyDrive/dataorg-financial-health-prediction-challenge20251204-19827-m2tn1n/'
feature_outputs = (
    ('X_train_scaled.csv', X_train_final),
    ('X_val_scaled.csv', X_val),
)
for file_name, features in feature_outputs:
    features.to_csv(folder_path + file_name, index=False)

In [None]:
# Write the matching label vectors as single-column CSVs, without the index.
label_outputs = (
    ('y_train.csv', y_train_final),
    ('y_val.csv', y_val),
)
for file_name, labels in label_outputs:
    labels.to_frame().to_csv(folder_path + file_name, index=False)

In [None]:
# Write the encoded and scaled test features next to the train/validation
# files, without the index column.
test_output_path = folder_path + 'X_test_scaled.csv'
X_test_encoded.to_csv(test_output_path, index=False)