In [56]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import sys 
import os

def downsample_majority_class(df, target_column, random_state=None):
    """
    Balance a binary-labelled DataFrame by downsampling its larger class.

    Rows are split into two groups, ``target_column == 0`` and
    ``target_column == 1``; rows with any other label are not included in
    the result. The larger group is randomly sampled (without replacement)
    down to the size of the smaller group. When both groups have the same
    size, the label-0 group is treated as the majority.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing ``target_column``.
    target_column : str
        Name of the column holding the 0/1 class label.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` for reproducible sampling.

    Returns
    -------
    pandas.DataFrame
        The sampled majority rows followed by all minority rows, with a
        fresh 0..n-1 index.
    """
    label_zero = df[df[target_column] == 0]
    label_one = df[df[target_column] == 1]

    # Do not assume label 0 is the larger group: asking ``sample`` for more
    # rows than a group contains (without replacement) raises ValueError.
    if len(label_zero) >= len(label_one):
        majority_class, minority_class = label_zero, label_one
    else:
        majority_class, minority_class = label_one, label_zero

    print("number of minority class: ", len(minority_class))
    downsampled_majority = majority_class.sample(n=len(minority_class), random_state=random_state)
    downsampled_df = pd.concat([downsampled_majority, minority_class], ignore_index=True)
    return downsampled_df

# smote = SMOTE(sampling_strategy={1: 50000})
# X = df.drop(columns='RainTomorrow')
# y = df['RainTomorrow']
# X, y = smote.fit_resample(X, y)

# # Apply random undersampling to the majority class (class 0)
# undersampler = RandomUnderSampler(sampling_strategy='majority')
# X_resampled, y_resampled = undersampler.fit_resample(X, y)

# # Concatenate the downsampled data into a new DataFrame
# df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='RainTomorrow')], axis=1)




In [57]:
from sklearn.preprocessing import StandardScaler
import numpy as np

""" Declare categorial and numerical features. Filter df down to these features and 
    case_id and target 
"""
def processMain(df):
    """
    Collapse ``df`` to one row per ``case_id``, encode and scale its features,
    and write the result to ``cleansed_df.csv``.

    Steps:
      1. Read ``features.csv`` (columns ``name`` and ``type``) to obtain the
         categorical and numerical feature names.
      2. Keep only ``case_id``, ``target`` and those features.
      3. Per ``case_id``: replace each categorical feature with its most
         frequent value and each numerical feature with its mean.
      4. Round numerical features to 2 decimals and fill remaining missing
         values with the column mean.
      5. Keep the first row of every ``case_id``.
      6. One-hot encode categorical features (``int8``) and standardize the
         numerical ones with ``StandardScaler``.

    Side effects: writes ``TEST0.csv``, ``TEST1.csv``, ``TEST2.csv``
    (intermediate snapshots) and ``cleansed_df.csv`` (final result) to the
    current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with ``case_id``, ``target`` and every feature listed in
        ``features.csv``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed frame, identical to what is written to
        ``cleansed_df.csv``.
    """
    # Read in variables from features.csv, which contains the categorical and numerical features
    f_df = pd.read_csv('features.csv')
    cat_f = f_df[f_df['type'] == 'categorical']['name'].tolist()
    num_f = f_df[f_df['type'] == 'numerical']['name'].tolist()
    display(f"categorical features: {cat_f}")
    display(f"numerical features: {num_f}")
    col_to_keep = ['case_id', 'target'] + cat_f + num_f
    # Explicit copy so the assignments below operate on an independent frame
    # and do not trigger SettingWithCopyWarning.
    df = df[col_to_keep].copy()
    print(f"df columns: {df.columns}")
    df.to_csv('TEST0.csv', index=False)

    # For categorical variables, take the most frequent value within each case_id.
    # groupby/transform broadcasts the per-case result back onto every row in a
    # single pass instead of rescanning the whole frame once per case_id.
    for col in cat_f:
        df[col] = df.groupby('case_id')[col].transform(_most_common_value)

    # For numerical variables, take the per-case mean of the values
    for col in num_f:
        df[col] = df.groupby('case_id')[col].transform('mean')

    # Round to 2 decimal places, then replace missing numerical values with the
    # column mean.
    # NOTE(review): the fill mean is computed before duplicates are dropped, so
    # cases with more rows weigh more heavily in it - confirm this is intended.
    df[num_f] = df[num_f].round(2)
    df[num_f] = df[num_f].fillna(df[num_f].mean())

    df.to_csv('TEST1.csv', index=False)

    # The features are now constant within each case_id, so keep one row per case
    df = df.drop_duplicates(subset='case_id')
    df.to_csv('TEST2.csv', index=False)

    # one-hot encode the categorical variables
    df = pd.get_dummies(df, columns=cat_f, dtype=np.int8)

    # standardize the continuous variables
    scaler = StandardScaler()
    df[num_f] = scaler.fit_transform(df[num_f])

    df.to_csv('cleansed_df.csv', index=False)
    return df


def _most_common_value(values):
    """Return the most frequent non-missing entry of ``values``, or NaN if there is none."""
    counts = values.value_counts()
    # value_counts() skips missing values, so an all-missing group yields an
    # empty result on which idxmax() would raise.
    if counts.empty:
        return np.nan
    return counts.idxmax()

In [71]:

# Load the merged person / previous-application dataset.
merged_csv = os.path.join("../", "data", "mergedDatasets", "person&Applprev.csv")
df = pd.read_csv(merged_csv)

# Disabled aggregation experiments, kept for reference:
#   df = df.groupby('case_id').agg({'target': 'max', 'case_id': 'count'}).reset_index()
#   df.to_csv('TEST.csv', index=False)
#   sys.exit()
#   df = df.drop_duplicates(subset=['case_id'])
#
# Disabled export of the cases with target == 1:
#   df = df[df['target'] == 1]
#   df.to_csv(os.path.join(rootPath, 'data', 'processed', 'target_1.csv'), index=False)

# Run the preprocessing, then load the file it writes for inspection.
processMain(df)
p_df = pd.read_csv('cleansed_df.csv')

# Disabled: df = downsample_majority_class(df, 'target')

print("\nPreprocessed dataframe info:\n")
p_df.info()

# Display the number of cases with target 1 and 0.
target_counts = p_df['target'].value_counts()
print(target_counts)

# Disabled: df.to_csv(os.path.join(rootPath, 'data', 'processed', 'downsampled.csv'), index=False)


"categorical features: ['district_544M', 'mainoccupationinc_437A', 'cancelreason_3545846M']"

"numerical features: ['annuity_853A', 'credamount_590A']"

df columns: Index(['case_id', 'target', 'district_544M', 'mainoccupationinc_437A',
       'cancelreason_3545846M', 'annuity_853A', 'credamount_590A'],
      dtype='object')

Preprocessed dataframe info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Columns: 101 entries, case_id to cancelreason_3545846M_a55475b1
dtypes: float64(2), int64(99)
memory usage: 46.7 KB
target
0    57
1     2
Name: count, dtype: int64
