### Transformasi dan Rekayasa Fitur untuk Kedua Dataset 
- Mengubah variabel kategorikal menjadi format numerik (misalnya, one-hot 
encoding, ordinal encoding, atau frequency encoding) untuk masing-masing 
dataset. 
- Melakukan normalisasi atau standarisasi fitur numerik jika diperlukan. 
- Menciptakan minimal satu fitur baru yang dapat meningkatkan kualitas dataset. 
- Menyusun laporan tentang dampak transformasi fitur terhadap dataset. 


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
# Use the ggplot theme for plots and lift pandas' row/column display limits
# so frames are printed in full.
plt.style.use('ggplot')
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [None]:
# Load the cleaned application test/train CSV files into df_tes / df_tra.
# NOTE(review): the accepted/rejected loads below are disabled, yet later cells
# still reference df_acc and df_rej and will raise NameError unless those
# frames are defined some other way — confirm whether these reads should be
# re-enabled or the dependent cells removed.
# df_acc = pd.read_csv('accepted_cleaned.csv')
# df_rej = pd.read_csv('rejected_cleaned.csv')
df_tes = pd.read_csv('application_test_cleaned.csv')
df_tra = pd.read_csv('application_train_cleaned.csv')

In [None]:
# Preview the first rows, transposed so each column is shown as a row.
df_acc.head().transpose()

In [None]:
# List the columns of df_acc that are stored with the object dtype.
object_columns = [name for name, dtype in df_acc.dtypes.items() if dtype == object]
print(object_columns)

In [None]:
# Inspect the distinct raw values of the 'term' column.
df_acc['term'].unique()

In [None]:
# Map the textual loan terms onto their length in months.
term_values = {
    ' 36 months': 36,
    ' 60 months': 60,
}
df_acc['term'] = df_acc['term'].map(term_values)

In [None]:
# Check the distinct values of 'term' again after the mapping.
df_acc['term'].unique()

In [None]:
# Remove the listed columns from df_acc before encoding.
df_acc = df_acc.drop(columns=['id', 'member_id', 'grade', 'desc', 'url'])

In [None]:
# One-hot encode every remaining object-dtype column, keeping all levels.
dummies = [name for name, dtype in df_acc.dtypes.items() if dtype == object]
df_acc = pd.get_dummies(df_acc, columns=dummies, drop_first=False)

The cells below handle the rejected dataset (`df_rej`).

In [None]:
# List the columns of df_rej that are stored with the object dtype.
rejected_object_columns = [
    name for name, dtype in df_rej.dtypes.items() if dtype == object
]
print(rejected_object_columns)


In [None]:
# NOTE(review): the column key is an empty string, which looks like an
# unfinished placeholder; this lookup raises KeyError unless df_rej really has
# a column named '' — fill in the intended column name.
df_rej[''].unique()

break

In [None]:
# Preview the first training rows, transposed so each column is shown as a row.
df_tra.head().transpose()

In [None]:
# Label-encode the object columns of the training frame that have at most two
# distinct values, applying the same fitted mapping to the testing frame.
le = LabelEncoder()
le_count = 0

# Only object-dtype columns are candidates for label encoding.
object_cols = [name for name in df_tra.columns if df_tra[name].dtype == 'object']

for col in object_cols:
    # Columns with more than two categories are left for one-hot encoding.
    if len(df_tra[col].unique()) > 2:
        continue

    # Learn the mapping from the training data only ...
    le.fit(df_tra[col])
    # ... then apply it to both frames.
    df_tra[col] = le.transform(df_tra[col])
    df_tes[col] = le.transform(df_tes[col])

    # Count how many columns were label encoded.
    le_count += 1

print('%d columns were label encoded.' % le_count)

In [None]:
# One-hot encode the remaining categorical variables in both frames.
df_tra, df_tes = (pd.get_dummies(frame) for frame in (df_tra, df_tes))

# Report the resulting shapes.
for caption, frame in (
    ('Training Features shape: ', df_tra),
    ('Testing Features shape: ', df_tes),
):
    print(caption, frame.shape)

In [None]:
# Set the labels aside from the TRAINING frame (the original read them from
# df_tes, which would attach the wrong/missing labels to the training rows).
# The inner alignment below drops every column the testing frame lacks, so the
# labels have to be saved first and restored afterwards.
tra_labels = df_tra['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
df_tra, df_tes = df_tra.align(df_tes, join='inner', axis=1)

# Add the target back in
df_tra['TARGET'] = tra_labels

print('Training Features shape: ', df_tra.shape)
print('Testing Features shape: ', df_tes.shape)

In [None]:
# Iterating a DataFrame yields its column labels, so dataset_name is a column
# label of df_tra here.
# NOTE(review): each column is converted to a one-column frame and written
# straight back into the same column, so this looks like a no-op on the data —
# confirm whether this cell is still needed.
for dataset_name in df_tra:
    if isinstance(df_tra[dataset_name], pd.Series):  
        df_tra[dataset_name] = df_tra[dataset_name].to_frame()  # Convert the Series to a DataFrame

In [None]:
# Scale the numeric features of both frames.
#
# For every numeric training column one scaler is chosen from the skewness of
# the TRAINING data, fitted on the training data only, and then re-used to
# transform the testing column. Fitting separately on the testing frame (as the
# previous version did) leaks test statistics and can put the same feature on
# different scales — or even under different scalers — in the two frames.
minmax_scaler = MinMaxScaler()
standard_scaler = StandardScaler()

# The label must stay untouched: scaling it would turn the class values into
# arbitrary floats.
label_col = 'TARGET'

numeric_cols = [
    name
    for name in df_tra.select_dtypes(include=['int64', 'float64']).columns
    if name != label_col
]

for col in numeric_cols:
    # Skewed distribution -> MinMaxScaler, otherwise -> StandardScaler.
    # Series.skew() already returns a scalar, so no further reduction is needed.
    scaler = minmax_scaler if df_tra[col].skew() > 1 else standard_scaler

    # fit_transform() re-fits the scaler, so sharing the two instances across
    # columns is safe as long as the test column is transformed right away.
    df_tra[col] = scaler.fit_transform(df_tra[[col]]).ravel()
    if col in df_tes.columns:
        df_tes[col] = scaler.transform(df_tes[[col]]).ravel()

In [None]:
# Preview the first testing rows, transposed so each column is shown as a row.
df_tes.head().transpose()

In [None]:
# Preview the first training rows, transposed so each column is shown as a row.
df_tra.head().transpose()

In [63]:
# Write both encoded frames to CSV without the index column.
for frame, csv_path in (
    (df_tes, "app_test_cleaned_encoded.csv"),
    (df_tra, "app_train_cleaned_encoded.csv"),
):
    frame.to_csv(csv_path, index=False)