# 3. Feature Engineering<a id='3_Feature Engineering'></a>

## Data Imports

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
# Load dataset
df = pd.read_csv("Data//data.csv")

#_________ Data manipulation in data wrangling part
# Function to clean column names
def clean_column_names(col):
    col = col.replace(' (times)', '') # Remove ' (times)'
    col = col.replace(' (Yuan ¥)', '') # Remove ' (Yuan ??)'
    col = col.replace('?', '')  # Remove question marks
    col = col.strip()  # Remove leading and trailing spaces
    col = col.replace(' ', '_')  # Replace spaces with underscores
    col = col.replace('%', 'pct')  # Replace % with 'percent'
    col = col.replace('/', '_to_')  # Replace / with '_to_'
    col = col.replace('(', '')  # Remove opening parentheses
    col = col.replace(')', '')  # Remove closing parentheses
    col = re.sub(r'_([a-z])', lambda x: '_' + x.group(1).upper(), col) # Convert first letter after underscore to uppercase
    return col

# Apply the function to all column names
df.columns = [clean_column_names(col) for col in df.columns]

# Convert binary variables to categorical
binary_columns = ['Bankrupt', 'Net_Income_Flag', 'Liability-Assets_Flag']
df[binary_columns] = df[binary_columns].astype('category')

# Remove features having same values with different names
df.drop(columns = ['Net_Income_Flag', 'Current_Liabilities_To_Equity', 'Current_Liabilities_To_Liability'], inplace=True)

## 3.1 Engineering on Numerical Features <a id='3.1_Engineering_on_Numerical_Features'></a>

### 3.1.1 Train-Test Split

To prevent data leakage, first perform a train-test split. In the case of severe target variable imbalance, use the stratify argument to ensure that each split maintains the same proportion of target variable classes as the original dataset.

In [3]:
X = df.drop(columns=['Bankrupt'])  # Features
y = df['Bankrupt']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(y_train.value_counts()/len(y_train))
print(y_test.value_counts()/len(y_test))

Bankrupt
0    0.967736
1    0.032264
Name: count, dtype: float64
Bankrupt
0    0.967742
1    0.032258
Name: count, dtype: float64


### 3.1.2 Normalizing Numeric Features: Min-Max Normalization

In [4]:
# Normalizing Numeric Features: Min-Max Normalization
X_train_num = X_train.select_dtypes(include=['number'])
X_train_cat = X_train[['Liability-Assets_Flag']]
X_test_num = X_test.select_dtypes(include=['number'])
X_test_cat = X_test[['Liability-Assets_Flag']]

# Initialize the scaler and Fit scaler with training numeric features
scaler = MinMaxScaler()
scaler.fit(X_train_num)

# Transform the numerical features in both training and test sets
X_train_num_scaled = scaler.transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

# Convert scaled numerical features back to DataFrame
X_train_num_scaled = pd.DataFrame(X_train_num_scaled, columns=X_train_num.columns, index=X_train.index)
X_test_num_scaled = pd.DataFrame(X_test_num_scaled, columns=X_test_num.columns, index=X_test.index)

# Combine scaled numerical features with the categorical features
X_train_scaled = pd.concat([X_train_num_scaled, X_train_cat], axis=1)
X_test_scaled = pd.concat([X_test_num_scaled, X_test_cat], axis=1)


In [5]:
X_train_scaled.shape

(5455, 92)

### 3.1.3 New Features Generation

#### 3.1.3.1 Ratio between Features

Although there are already 92 features, new features can be created and then selected through feature selection to identify the most useful ones. As observed in the EDA stage, profitability and activity indicators, such as Net Income to Total Assets and Asset Turnover Ratio, and stability indicators like Debt Ratio are expected to be useful for bankruptcy prediction. 

Additionally, if financial stability deteriorates while profitability and activity remains low, the probability of bankruptcy is expected to be higher. Considering this, features created by dividing stability indicators by profitability and activity indicators are likely to improve model performance. 

However, given the large number of possible combinations of those indicators, it is impractical to consider every combination. Moreover, high correlation was observed among features within the same category. Therefore, a few key stability indicators will be used to create new features by dividing them by a representative profitability and activity features, Net_Income_To_Total_Assets and Total_Asset_Turnover.

In [6]:
# New Feature Generation
stability_features = [
    'Debt_Ratio_Pct', 
    'Borrowing_Dependency', 
    'Cash_To_Current_Liability', 
    'Fixed_Assets_To_Assets',
    'Current_Ratio',
    'Quick_Ratio',
    'Total_Debt_To_Total_Net_Worth',
    'Net_Worth_To_Assets',
    'Long-term_Fund_Suitability_Ratio_A',
    'Contingent_Liabilities_To_Net_Worth',
    'Operating_Funds_To_Liability',
    'Current_Liability_To_Assets',
    'Current_Liability_To_Liability',
    'Current_Liability_To_Equity',
    'Long-term_Liability_To_Current_Assets',
    'Retained_Earnings_To_Total_Assets',
    'Liability_To_Equity',
    'Degree_Of_Financial_Leverage_DFL'
]

profitability_feature = 'Net_Income_To_Total_Assets'
activity_feature = 'Total_Asset_Turnover'

# Generate New Features
for feature in stability_features:
    new_feature_name = f'{feature}_To_NIA'
    X_train_scaled[new_feature_name] = (X_train_scaled[feature]+0.1) / (X_train_scaled[profitability_feature]+0.1)
    X_test_scaled[new_feature_name] = (X_test_scaled[feature]+0.1) / (X_test_scaled[profitability_feature]+0.1)
    
for feature in stability_features:
    new_feature_name = f'{feature}_To_ATO'
    X_train_scaled[new_feature_name] = (X_train_scaled[feature]+0.1) / (X_train_scaled[activity_feature]+0.1)
    X_test_scaled[new_feature_name] = (X_test_scaled[feature]+0.1) / (X_test_scaled[activity_feature]+0.1)

In [7]:
X_train_scaled.shape

(5455, 128)

In [8]:
# Rescale Features including newly generated features
X_train_num = X_train_scaled.select_dtypes(include=['number'])
X_train_cat = X_train_scaled[['Liability-Assets_Flag']] 

X_test_num = X_test_scaled.select_dtypes(include=['number'])
X_test_cat = X_test_scaled[['Liability-Assets_Flag']]

# Initialize the scaler and Fit scaler with training numeric features
scaler = StandardScaler()
scaler.fit(X_train_num)

# Transform the numerical features in both training and test sets
X_train_num_scaled = scaler.transform(X_train_num)
X_test_num_scaled = scaler.transform(X_test_num)

# Convert scaled numerical features back to DataFrame
X_train_num_scaled = pd.DataFrame(scaler.transform(X_train_num), columns=X_train_num.columns)
X_test_num_scaled = pd.DataFrame(scaler.transform(X_test_num), columns=X_test_num.columns)

# Combine scaled numerical features with the categorical features
X_train_scaled = pd.concat([X_train_num_scaled, X_train_cat.reset_index(drop=True)], axis=1)
X_test_scaled = pd.concat([X_test_num_scaled, X_test_cat.reset_index(drop=True)], axis=1)

In [9]:
X_train_scaled.head()

Unnamed: 0,ROAC_Before_Interest_And_Depreciation_Before_Interest,ROAA_Before_Interest_And_Pct_After_Tax,ROAB_Before_Interest_And_Depreciation_After_Tax,Operating_Gross_Margin,Realized_Sales_Gross_Margin,Operating_Profit_Rate,Pre-tax_Net_Interest_Rate,After-tax_Net_Interest_Rate,Non-industry_Income_And_Expenditure_To_Revenue,Continuous_Interest_Rate_After_Tax,...,Contingent_Liabilities_To_Net_Worth_To_ATO,Operating_Funds_To_Liability_To_ATO,Current_Liability_To_Assets_To_ATO,Current_Liability_To_Liability_To_ATO,Current_Liability_To_Equity_To_ATO,Long-term_Liability_To_Current_Assets_To_ATO,Retained_Earnings_To_Total_Assets_To_ATO,Liability_To_Equity_To_ATO,Degree_Of_Financial_Leverage_DFL_To_ATO,Liability-Assets_Flag
0,-0.176677,-0.117303,-0.135577,-0.542117,-0.541389,0.014704,0.016414,0.016847,-0.007591,0.014885,...,-0.362729,-0.362977,-0.778076,-0.110488,-0.399941,-0.261335,-0.370677,-0.408036,-0.342673,0
1,0.49622,0.504173,0.333551,-0.496455,-0.503294,0.017875,0.017367,0.017245,-0.013713,0.015315,...,-1.657855,-1.690611,-1.20241,-1.457142,-1.701251,-0.927291,-1.720584,-1.691694,-1.529255,0
2,0.363741,0.444945,0.221065,-0.060126,-0.058848,0.023791,0.030626,0.027844,-0.005259,0.024135,...,-0.145872,-0.081274,-0.577103,-0.135072,-0.16764,-0.146058,-0.109227,-0.17531,-0.137671,0
3,-0.575729,-0.320846,-0.519251,-0.72688,-0.726363,0.013707,0.011745,0.012834,-0.013214,0.010887,...,-0.790737,-0.811649,0.291133,-0.089712,-0.742929,-0.473459,-0.79958,-0.756579,-0.707389,0
4,-1.870631,-1.753995,-1.962388,3.328187,3.333334,-0.215758,-0.275531,-0.282722,0.051511,-0.318496,...,2.728161,1.591047,0.626918,4.139132,2.724848,1.313105,2.757654,2.682001,2.462514,0


#### 3.1.3.2 Outlier Dummy Features

During the exploratory data analysis (EDA), it was observed that outliers in some features were significant for predicting the target variable. Therefore, dummy features were created to indicate whether these variables were outliers, particularly those with a high bankruptcy ratio.

The process involved creating dummy variables based on outlier detection for numerical features in the dataset. First, the z-scores for each numerical feature were calculated to standardize the data, which allows for the identification of outliers. An outlier was defined as a data point with a z-score above or below a certain threshold, typically 3 or -3. For each feature, a dummy variable was created where a value of 1 was assigned if the data point was an outlier, and 0 otherwise.

In [10]:
def create_outlier_dummies(X_train_scaled, y_train, X_test_scaled, threshold, prop):
    """
    Create outlier dummy variables for numerical features and apply to training and test data.

    Returns:
    pd.DataFrame: Training data with outlier dummy variables.
    pd.DataFrame: Test data with outlier dummy variables.
    """
    # Ensure both X_train_scaled and X_test_scaled are DataFrames
    if not isinstance(X_train_scaled, pd.DataFrame) or not isinstance(X_test_scaled, pd.DataFrame):
        raise ValueError("X_train_scaled and X_test_scaled must be pandas DataFrames.")
    
    # Ensure y_train is a Series
    if not isinstance(y_train, pd.Series):
        raise ValueError("y_train must be a pandas Series.")
    
    # Convert y_train to numeric if it's categorical
    if y_train.dtype.name == 'category':
        y_train = y_train.cat.codes

    # Ensure the indices align
    y_train = y_train.reindex(X_train_scaled.index)

    # Identify numerical features
    numerical_features = X_train_scaled.select_dtypes(include=[np.number]).columns.tolist()
    
    # Initialize DataFrames for dummy variables
    X_train_dummies = X_train_scaled.copy()
    X_test_dummies = X_test_scaled.copy()

    for feature in numerical_features:
        # Get the feature values
        feature_train_values = X_train_scaled[feature]
        feature_test_values = X_test_scaled[feature]

        # Determine outliers
        pos_outliers_train = feature_train_values > threshold
        neg_outliers_train = feature_train_values < -threshold
        pos_outliers_test = feature_test_values > threshold
        neg_outliers_test = feature_test_values < -threshold
        
        # Calculate proportions
        pos_outlier_proportion = y_train[pos_outliers_train].mean() if pos_outliers_train.any() else 0
        neg_outlier_proportion = y_train[neg_outliers_train].mean() if neg_outliers_train.any() else 0

        # Determine if outliers are significant
        pos_outlier_significant = pos_outlier_proportion > prop * 3
        neg_outlier_significant = neg_outlier_proportion > prop * 3

        # Create dummy variables
        if pos_outlier_significant:
            X_train_dummies[f'{feature}_pos_outlier_D'] = np.where(pos_outliers_train, 1, 0)
            X_test_dummies[f'{feature}_pos_outlier_D'] = np.where(pos_outliers_test, 1, 0)

        if neg_outlier_significant:
            X_train_dummies[f'{feature}_neg_outlier_D'] = np.where(neg_outliers_train, 1, 0)
            X_test_dummies[f'{feature}_neg_outlier_D'] = np.where(neg_outliers_test, 1, 0)

    return X_train_dummies, X_test_dummies

In [11]:
X_train_with_dummies, X_test_with_dummies = create_outlier_dummies(
    X_train_scaled, y_train, X_test_scaled, threshold=2, prop=0.032
)

In [12]:
Dummies = [col for col in X_train_with_dummies.columns if col.endswith('_D')]
X_train_with_dummies['prop_dummy'] = X_train_with_dummies[Dummies].mean(axis=1)
X_test_with_dummies['prop_dummy'] = X_test_with_dummies[Dummies].mean(axis=1)

In [13]:
print(X_train_with_dummies.shape)
print(X_test_with_dummies.shape)

(5455, 158)
(1364, 158)
