In [1]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import sys 
import os

"""
Oversample the minority class (target == 1) to 10x its original size with SMOTE,
then downsample the majority class (target == 0) to the same number of rows as
the oversampled minority class.
"""
def downsample_majority_class(df, target_column, random_state=None, oversample_factor=10):
    """Balance a binary dataset: SMOTE-oversample class 1, then downsample class 0.

    The minority class is taken to be the rows where ``target_column == 1`` and
    the majority class the rows where ``target_column == 0``.

    Args:
        df: Input frame holding the feature columns and ``target_column``. A
            ``case_id`` column, if present, is dropped before resampling.
        target_column: Name of the binary label column.
        random_state: Seed forwarded to both SMOTE and the majority-class
            sampling so that a given seed yields the same balanced frame.
        oversample_factor: Multiplier applied to the original minority-class
            count to get the number of minority rows requested from SMOTE.

    Returns:
        A DataFrame containing the sampled majority rows followed by all
        (original + synthetic) minority rows. Columns are the feature columns
        followed by ``target_column``. If the majority class has fewer rows
        than the oversampled minority class, every majority row is kept and
        the result is not perfectly balanced.

    Raises:
        ValueError: If ``df`` contains no rows with ``target_column == 1``.
    """
    # case_id is dropped so it is not fed to SMOTE as a feature.
    df = df.drop(columns=['case_id'], errors='ignore')

    features = df.drop(columns=target_column)
    labels = df[target_column]

    minority_count = int((labels == 1).sum())
    print("number of minority class: ", minority_count)
    if minority_count == 0:
        raise ValueError(
            f"no rows with {target_column} == 1; cannot oversample the minority class"
        )

    # Use SMOTE to grow the minority class to `oversample_factor` times its size.
    requested_minority = int(minority_count * oversample_factor)
    smote = SMOTE(sampling_strategy={1: requested_minority}, random_state=random_state)
    features_res, labels_res = smote.fit_resample(features, labels)
    print("number of minority class after SMOTE: ", int((labels_res == 1).sum()))
    print("number of majority class: ", int((labels_res == 0).sum()))

    resampled = pd.concat(
        [
            pd.DataFrame(features_res, columns=features.columns),
            pd.Series(labels_res, name=target_column),
        ],
        axis=1,
    )
    majority_rows = resampled[resampled[target_column] == 0]
    minority_rows = resampled[resampled[target_column] == 1]

    # Downsample the majority class to the minority size; cap at the number of
    # majority rows available so sampling without replacement cannot fail.
    n_keep = min(len(majority_rows), len(minority_rows))
    majority_sampled = majority_rows.sample(n=n_keep, random_state=random_state)

    return pd.concat([majority_sampled, minority_rows], axis=0)

In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np

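""" Read the categorical and numerical feature names from features.csv, filter df
    down to those features plus the keep_col columns (case_id and target), then
    aggregate to one row per case_id, one-hot encode the categorical features
    and standardize the numerical ones.
"""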
def processMain(df, keep_col=None, features_path='features.csv', debug_dir='test'):
    """Aggregate ``df`` to one row per ``case_id`` and encode/scale its features.

    Steps:
      1. Read the feature list from ``features_path``: rows whose ``type`` is
         ``'categorical'`` or ``'numerical'`` give the feature names (``name``).
      2. Keep only ``keep_col`` plus those features.
      3. Per ``case_id``, replace each categorical feature with its most
         frequent non-null value (``0`` when the case has none) and each
         numerical feature with its mean.
      4. Round numerical features to 2 decimals and fill remaining missing
         values with the column mean.
      5. Keep one row per ``case_id``, one-hot encode the categorical features
         (``int8`` columns) and standardize the numerical features.

    Args:
        df: Input frame; must contain ``case_id``, the ``keep_col`` columns and
            every feature listed in ``features_path``. It is not modified.
        keep_col: Extra columns to carry through unchanged. Defaults to
            ``['case_id', 'target']``; must include ``'case_id'``.
        features_path: CSV file with ``name`` and ``type`` columns.
        debug_dir: Directory receiving the intermediate dumps ``TEST0.csv``,
            ``TEST1.csv`` and ``TEST2.csv`` (created if missing). Pass ``None``
            to skip writing them.

    Returns:
        A new DataFrame with one row per ``case_id``.
    """
    if keep_col is None:
        keep_col = ['case_id', 'target']

    def _dump(frame, filename):
        # Write an intermediate snapshot for inspection, if dumps are enabled.
        if debug_dir is not None:
            os.makedirs(debug_dir, exist_ok=True)
            frame.to_csv(os.path.join(debug_dir, filename), index=False)

    def _most_common(values):
        # Most frequent non-null value in the group; 0 when there is none.
        counts = values.value_counts()
        return counts.idxmax() if len(counts) else 0

    # Read in variables from features.csv, which contains the categorical and numerical features
    feature_spec = pd.read_csv(features_path)
    cat_f = feature_spec.loc[feature_spec['type'] == 'categorical', 'name'].tolist()
    num_f = feature_spec.loc[feature_spec['type'] == 'numerical', 'name'].tolist()
    print(f"categorical features: {cat_f}")
    print(f"numerical features: {num_f}")

    col_to_keep = list(keep_col) + cat_f + num_f
    selected = df[col_to_keep]
    print(f"df columns: {selected.columns}")
    _dump(selected, 'TEST0.csv')

    # Aggregate per case_id in one grouped pass per column and broadcast the
    # result back onto every row of the case. `selected` is only read here;
    # results are written into a separate copy.
    aggregated = selected.copy()
    by_case = selected.groupby('case_id', sort=False)

    # For categorical variables, take the most frequent value within the case
    for col in cat_f:
        aggregated[col] = by_case[col].transform(_most_common)

    # For numerical variables, take the mean of the values within the case
    for col in num_f:
        aggregated[col] = by_case[col].transform('mean')

    if num_f:
        # Round to 2 decimal places, then replace missing values with the mean.
        # NOTE(review): the fill mean is computed before duplicates are dropped,
        # so cases with more rows weigh more; kept as in the original pipeline.
        aggregated[num_f] = aggregated[num_f].round(2)
        aggregated[num_f] = aggregated[num_f].fillna(aggregated[num_f].mean())

    _dump(aggregated, 'TEST1.csv')

    # Every row of a case now carries the same aggregated values, so keeping
    # the first row per case_id loses nothing.
    deduped = aggregated.drop_duplicates(subset='case_id')
    _dump(deduped, 'TEST2.csv')

    # one-hot encode the categorical variables
    encoded = pd.get_dummies(deduped, columns=cat_f, dtype=np.int8)

    # standardize the continuous variables
    if num_f:
        scaler = StandardScaler()
        encoded[num_f] = scaler.fit_transform(encoded[num_f])

    return encoded


In [3]:


# Run the full pipeline: load the merged training data, aggregate/encode it,
# balance the classes, report the resulting class counts and save the result.
SEED = 42  # forwarded to downsample_majority_class as random_state

raw_df = pd.read_csv(os.path.join("../", "data", "mergedDatasets", "person&ApplprevTrain.csv"))
df = processMain(raw_df, keep_col=['case_id', 'target'])
balanced_df = downsample_majority_class(df, 'target', random_state=SEED)

print("\nPreprocessed dataframe info:\n")
balanced_df.info()
print(balanced_df['target'].value_counts())

# Make sure the destination directory exists before writing the final dataset.
output_dir = os.path.join('../', 'data', 'processed')
os.makedirs(output_dir, exist_ok=True)
balanced_df.to_csv(os.path.join(output_dir, 'final.csv'), index=False)
print(f'len of columns: {len(balanced_df.columns)}')


"categorical features: ['district_544M', 'mainoccupationinc_437A', 'cancelreason_3545846M', 'education_1138M', 'housetype_905L', 'language1_981M', 'familystate_447L', 'childnum_21L', 'status_219L']"

"numerical features: ['annuity_853A', 'credamount_590A', 'byoccupationinc_3656910L', 'tenor_203L']"

df columns: Index(['case_id', 'target', 'district_544M', 'mainoccupationinc_437A',
       'cancelreason_3545846M', 'education_1138M', 'housetype_905L',
       'language1_981M', 'familystate_447L', 'childnum_21L', 'status_219L',
       'annuity_853A', 'credamount_590A', 'byoccupationinc_3656910L',
       'tenor_203L'],
      dtype='object')
number of minority class:  403
number of minority class after SMOTE:  4030
number of majority class:  11187

Preprocessed dataframe info:

<class 'pandas.core.frame.DataFrame'>
Index: 8060 entries, 4274 to 15216
Columns: 1079 entries, annuity_853A to target
dtypes: float64(4), int64(1), int8(1074)
memory usage: 8.6 MB
target
0    4030
1    4030
Name: count, dtype: int64
len of columns: 1079
