In [1]:
import pandas as pd

# Load the cleaned LendingClub extract that the rest of the notebook builds on.
df = pd.read_csv("lendingclub_clean.csv")

# Quick look: the first five rows, then the full list of column names.
print(df.head(5))
print(list(df.columns))

   loan_amnt  funded_amnt  funded_amnt_inv        term  int_rate  installment  \
0       3500         3500           3500.0   36 months     13.56       113.95   
1      30000        30000          30000.0   60 months     18.94       777.23   
2       5000         5000           5000.0   36 months     17.97       180.69   
3       4000         4000           4000.0   36 months     18.94       146.51   
4      30000        30000          30000.0   60 months     16.14       731.78   

  grade home_ownership  annual_inc verification_status    dti emp_length  \
0     C           RENT     55000.0        Not Verified  18.24  10+ years   
1     D       MORTGAGE     90000.0     Source Verified  26.52  10+ years   
2     D       MORTGAGE     59280.0     Source Verified  10.51    6 years   
3     D       MORTGAGE     92000.0     Source Verified  16.74  10+ years   
4     C       MORTGAGE     57250.0        Not Verified  26.35  10+ years   

  loan_status addr_state             purpose   issue_d  

Menambahkan kolom baru: 
addr_state_category yang merupakan hasil pengelompokan addr_state berdasarkan 5 region, yaitu west, south_west, 
south_east, mid_west, dan north_east. 
loan_condition berdasarkan loan_status. Pinjaman termasuk "bad loan" jika loan_status adalah Charged Off, Default, 
Does not meet the credit policy. Status:Charged Off, In Grace Period, Late (16-30 days), atau Late (31-120 days). Status lainnya termasuk "good loan". Good loan bernilai 0 dan bad loan bernilai 1.
inc_category berdasarkan annual_inc dengan ketentuan sebagai berikut: high > 200.000, medium antara 100.000 - 200.000 (inklusif), low < 100.000.
emp_length_int berdasarkan emp_length yang diubah menjadi integer.
int_payments yang merupakan hasil pengkategorian int_rate dengan ketentuan high > 13,23 dan low ≤ 13,23.
issue_d (format bulan-tahun, misalnya Dec-2018) diubah menjadi tipe datetime, lalu tahunnya disimpan di kolom issue_year.

addr_state_category

In [2]:
# Columns carried forward into the feature-engineering frame, in output order.
selected_columns = [
    'loan_amnt',
    'funded_amnt',
    'funded_amnt_inv',
    'term',
    'int_rate',
    'installment',
    'grade',
    'home_ownership',
    'annual_inc',
    'verification_status',
    'dti',
    'emp_length',
    'loan_status',
    'addr_state',
    'purpose',
    'issue_d',
]

# Work on an independent copy so the columns added later never touch `df`.
df_new = df.loc[:, selected_columns].copy()

# State abbreviation -> region label. Written region-first for readability and
# flattened into the per-state lookup that Series.map expects.
region_map = {
    state: region
    for region, states in {
        'west': ['CA', 'OR', 'WA', 'NV', 'ID', 'MT', 'WY', 'CO', 'UT', 'AK', 'HI'],
        'south_west': ['TX', 'OK', 'NM', 'AZ'],
        'south_east': ['FL', 'GA', 'SC', 'NC', 'VA', 'WV', 'AL', 'MS', 'TN', 'KY', 'AR', 'LA'],
        'mid_west': ['ND', 'SD', 'NE', 'KS', 'MN', 'IA', 'MO', 'WI', 'IL', 'IN', 'MI', 'OH'],
        'north_east': ['NY', 'NJ', 'PA', 'RI', 'MA', 'VT', 'NH', 'ME', 'CT', 'DE', 'MD', 'DC'],
    }.items()
    for state in states
}

# Any addr_state value missing from the lookup comes out as NaN.
df_new['addr_state_category'] = df_new['addr_state'].map(region_map)


loan_condition

In [3]:
# Loan statuses that count as a "bad loan"; every other status is a good loan.
bad_status = [
    "Charged Off",
    "Default",
    "Does not meet the credit policy. Status:Charged Off",
    "In Grace Period",
    "Late (16-30 days)",
    "Late (31-120 days)",
]

# 1 = bad loan, 0 = good loan. `isin` does the membership test in one
# vectorised pass instead of calling a Python lambda once per row; a missing
# loan_status is not in the list and therefore maps to 0, as before.
df_new['loan_condition'] = df_new['loan_status'].isin(bad_status).astype('int64')


inc_category

In [4]:
def categorize_income(x):
    """Bucket an annual income into "low", "medium" or "high".

    Parameters
    ----------
    x : float
        Annual income value.

    Returns
    -------
    str or float
        "high" when x > 200000, "medium" when 100000 <= x <= 200000,
        "low" when x < 100000, and NaN when x is missing.
    """
    # NaN fails every comparison, so without this guard a missing income would
    # fall through to the last branch and be mislabelled as "low".
    if pd.isna(x):
        return float("nan")
    if x > 200000:
        return "high"
    if x >= 100000:
        return "medium"
    return "low"

# Label each loan's annual income with its bucket, one value at a time.
df_new['inc_category'] = df_new['annual_inc'].map(categorize_income)

emp_length_int

In [5]:
import numpy as np

def emp_length_to_int(x):
    """Convert an employment-length label into a whole number of years.

    Parameters
    ----------
    x : str or missing value
        Label such as "10+ years", "< 1 year", "6 years" or "n/a".

    Returns
    -------
    int or float
        10 for "10+ years", 0 for "< 1 year", the leading number for labels
        like "6 years", and NaN for missing, empty or "n/a" input.

    Raises
    ------
    ValueError
        If the label does not start with a number, so unexpected formats
        surface instead of being silently dropped.
    """
    if pd.isna(x):
        return np.nan
    # Lower-case and collapse all whitespace so padded or double-spaced labels
    # (" 10+ years", "<  1 year") match the same way as the clean ones.
    text = " ".join(str(x).lower().split())
    if text in ("", "n/a"):
        return np.nan
    if text == "< 1 year":
        return 0
    # The leading token carries the number; "10+" is capped at 10.
    return int(text.split()[0].rstrip("+"))

# Turn each employment-length label into a number of years.
df_new['emp_length_int'] = df_new['emp_length'].map(emp_length_to_int)


int_payments

In [6]:
# Make sure int_rate is numeric. If the column arrives as text (e.g. "13.56%"),
# strip whitespace and a trailing percent sign first: pd.to_numeric cannot
# parse "%", and with errors='coerce' it would silently turn every such value
# into NaN.
int_rate_values = df_new['int_rate']
if not pd.api.types.is_numeric_dtype(int_rate_values):
    int_rate_values = int_rate_values.astype(str).str.strip().str.rstrip('%')
df_new['int_rate'] = pd.to_numeric(int_rate_values, errors='coerce')

# 'high' strictly above 13.23, otherwise 'low'. Rows without a usable rate
# stay missing instead of being labelled 'low' by a failed NaN comparison.
is_high_rate = df_new['int_rate'] > 13.23
df_new['int_payments'] = (
    is_high_rate
    .map({True: 'high', False: 'low'})
    .where(df_new['int_rate'].notna())
)


issue_d → Tahun saja

In [7]:
# Parse the abbreviated-month/year text ('%b-%Y') into real timestamps once,
# then reuse the parsed values for both columns.
issue_dates = pd.to_datetime(df_new['issue_d'], format='%b-%Y')
df_new['issue_d'] = issue_dates

# Keep the calendar year as its own feature.
df_new['issue_year'] = issue_dates.dt.year

print(df_new.head())

   loan_amnt  funded_amnt  funded_amnt_inv        term  int_rate  installment  \
0       3500         3500           3500.0   36 months     13.56       113.95   
1      30000        30000          30000.0   60 months     18.94       777.23   
2       5000         5000           5000.0   36 months     17.97       180.69   
3       4000         4000           4000.0   36 months     18.94       146.51   
4      30000        30000          30000.0   60 months     16.14       731.78   

  grade home_ownership  annual_inc verification_status  ...  loan_status  \
0     C           RENT     55000.0        Not Verified  ...      Current   
1     D       MORTGAGE     90000.0     Source Verified  ...      Current   
2     D       MORTGAGE     59280.0     Source Verified  ...      Current   
3     D       MORTGAGE     92000.0     Source Verified  ...      Current   
4     C       MORTGAGE     57250.0        Not Verified  ...      Current   

  addr_state             purpose    issue_d addr_state_c

In [8]:
# Sanity check: does any loan report an annual income above 200000?
cek = df_new['annual_inc'].gt(200000).any()
print("Apakah ada nilai di atas 200000? :", cek)

Apakah ada nilai di atas 200000? : False


In [9]:
# Save the engineered frame to CSV; index=False leaves the row index out of the file.
df_new.to_csv("lendingclub_feature engineer.csv", index=False)

Cek data baru

In [10]:
# Print a summary of the engineered frame (row range, columns and dtypes).
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2113644 entries, 0 to 2113643
Data columns (total 22 columns):
 #   Column               Dtype         
---  ------               -----         
 0   loan_amnt            int64         
 1   funded_amnt          int64         
 2   funded_amnt_inv      float64       
 3   term                 object        
 4   int_rate             float64       
 5   installment          float64       
 6   grade                object        
 7   home_ownership       object        
 8   annual_inc           float64       
 9   verification_status  object        
 10  dti                  float64       
 11  emp_length           object        
 12  loan_status          object        
 13  addr_state           object        
 14  purpose              object        
 15  issue_d              datetime64[ns]
 16  addr_state_category  object        
 17  loan_condition       int64         
 18  inc_category         object        
 19  emp_length_int       