## Libraries

In [2]:
import os
import pandas as pd

# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from xgboost import XGBClassifier
# from imblearn.over_sampling import SMOTE
# from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc

## Load Dataset

In [3]:
import kagglehub

# Download latest version of the Lending Club dataset through kagglehub.
# `path` holds whatever dataset_download returns; it is printed below and
# later joined with the CSV file name to locate the data file.
path = kagglehub.dataset_download("wordsforthewise/lending-club")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /Users/madihamalik/.cache/kagglehub/datasets/wordsforthewise/lending-club/versions/3


In [4]:
# Use correct file path and filename: the accepted-loans archive inside the
# downloaded dataset directory.
filepath = os.path.join(path, "accepted_2007_to_2018Q4.csv.gz")

# low_memory=False asks pandas not to parse the file in internal chunks, which
# avoids mixed-type inference within a column (at the cost of more memory).
df = pd.read_csv(filepath, low_memory=False)
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


## Filter out relevant columns for our research question

In [5]:
# Columns kept for the analysis, grouped by role. Everything else in the raw
# frame is discarded by the selection below.
relevant_columns = [
    # Loan Characteristics
    "loan_amnt",
    "term",
    "int_rate",
    "grade",
    "purpose",
    # Borrower's Financials & History
    "annual_inc",
    "dti",
    "delinq_2yrs",
    "inq_last_6mths",
    "home_ownership",
    "emp_length",
    "fico_range_low",
    "fico_range_high",
    "issue_d",
    "earliest_cr_line",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "revol_util",
    "total_acc",
    "verification_status",
    "application_type",
    "addr_state",
    # Target Variable
    "loan_status",
]
# NOTE: `df` is overwritten here, so re-running this cell after later cells
# have dropped any of these columns raises a KeyError; reload the data first.
df = df[relevant_columns]
df.head()

Unnamed: 0,loan_amnt,term,int_rate,grade,purpose,annual_inc,dti,delinq_2yrs,inq_last_6mths,home_ownership,...,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,verification_status,application_type,addr_state,loan_status
0,3600.0,36 months,13.99,C,debt_consolidation,55000.0,5.91,0.0,1.0,MORTGAGE,...,Aug-2003,7.0,0.0,2765.0,29.7,13.0,Not Verified,Individual,PA,Fully Paid
1,24700.0,36 months,11.99,C,small_business,65000.0,16.06,1.0,4.0,MORTGAGE,...,Dec-1999,22.0,0.0,21470.0,19.2,38.0,Not Verified,Individual,SD,Fully Paid
2,20000.0,60 months,10.78,B,home_improvement,63000.0,10.78,0.0,0.0,MORTGAGE,...,Aug-2000,6.0,0.0,7869.0,56.2,18.0,Not Verified,Joint App,IL,Fully Paid
3,35000.0,60 months,14.85,C,debt_consolidation,110000.0,17.06,0.0,0.0,MORTGAGE,...,Sep-2008,13.0,0.0,7802.0,11.6,17.0,Source Verified,Individual,NJ,Current
4,10400.0,60 months,22.45,F,major_purchase,104433.0,25.37,1.0,3.0,MORTGAGE,...,Jun-1998,12.0,0.0,21929.0,64.5,35.0,Source Verified,Individual,PA,Fully Paid


### remove leading and trailing whitespace

In [6]:
def _strip_text_column(col):
    """Strip leading/trailing whitespace from an object-dtype column.

    Columns of any other dtype are returned unchanged.
    """
    if col.dtypes != "object":
        return col
    return col.str.strip()


df = df.apply(_strip_text_column)


## Target Variables

In [7]:
# Count how many loans fall under each distinct loan_status value
df["loan_status"].value_counts()

loan_status
Fully Paid                                             1076751
Current                                                 878317
Charged Off                                             268559
Late (31-120 days)                                       21467
In Grace Period                                           8436
Late (16-30 days)                                         4349
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     40
Name: count, dtype: int64

## Create binary target from loan_status

In [8]:
# Loan statuses that count as a bad outcome (the borrower defaulted)
bad_statuses = ["Charged Off", "Default"]
# 1 when the loan's status is one of the bad outcomes, 0 for every other status
df["is_default"] = df["loan_status"].isin(bad_statuses).astype("int64")

df.sample(5)


Unnamed: 0,loan_amnt,term,int_rate,grade,purpose,annual_inc,dti,delinq_2yrs,inq_last_6mths,home_ownership,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,verification_status,application_type,addr_state,loan_status,is_default
131951,16000.0,36 months,12.69,C,debt_consolidation,51000.0,11.69,0.0,1.0,MORTGAGE,...,7.0,0.0,18544.0,52.2,12.0,Verified,Individual,CO,Fully Paid,0
498563,16000.0,36 months,21.45,D,credit_card,60500.0,36.6,0.0,1.0,RENT,...,12.0,1.0,23865.0,89.0,38.0,Not Verified,Individual,MO,Late (16-30 days),0
123673,5000.0,36 months,6.24,A,debt_consolidation,60000.0,5.48,0.0,0.0,RENT,...,9.0,0.0,8225.0,43.3,22.0,Not Verified,Individual,MO,Fully Paid,0
976824,16200.0,36 months,6.99,A,debt_consolidation,75000.0,22.1,0.0,0.0,MORTGAGE,...,14.0,0.0,16164.0,30.7,38.0,Not Verified,Individual,GA,Current,0
1102026,12000.0,36 months,11.48,B,debt_consolidation,78000.0,24.98,0.0,0.0,MORTGAGE,...,19.0,1.0,12666.0,27.0,26.0,Verified,Individual,CA,Fully Paid,0


## Create binary feature `is_verified` from verification_status

In [9]:
# Verification statuses that count as "verified"
verified_statuses = ["Verified", "Source Verified"]

# New column 'is_verified': 1 when the status is in the list above, otherwise 0
df["is_verified"] = df["verification_status"].isin(verified_statuses).astype("int64")
# The original text column is no longer needed once the flag exists
df = df.drop("verification_status", axis=1)

df.sample(10)

Unnamed: 0,loan_amnt,term,int_rate,grade,purpose,annual_inc,dti,delinq_2yrs,inq_last_6mths,home_ownership,...,open_acc,pub_rec,revol_bal,revol_util,total_acc,application_type,addr_state,loan_status,is_default,is_verified
1501250,6000.0,36 months,7.46,A,credit_card,55000.0,18.48,0.0,2.0,RENT,...,8.0,0.0,5463.0,10.3,24.0,Individual,OR,Fully Paid,0,0
504471,10000.0,36 months,6.72,A,credit_card,240000.0,7.49,0.0,2.0,MORTGAGE,...,11.0,1.0,27540.0,58.0,20.0,Individual,NJ,Current,0,1
2138929,16800.0,60 months,21.45,D,debt_consolidation,73000.0,16.9,0.0,0.0,MORTGAGE,...,12.0,1.0,10233.0,92.2,18.0,Individual,NY,Current,0,0
65158,10000.0,36 months,13.18,C,credit_card,30000.0,34.2,0.0,0.0,RENT,...,10.0,0.0,6677.0,92.7,24.0,Individual,GA,Fully Paid,0,1
2059514,15000.0,60 months,19.03,D,credit_card,44152.2,34.93,0.0,0.0,RENT,...,9.0,0.0,21965.0,75.7,15.0,Individual,NJ,Current,0,1
1839469,20000.0,36 months,7.62,A,home_improvement,75000.0,12.25,1.0,1.0,MORTGAGE,...,22.0,0.0,9960.0,15.3,43.0,Individual,CA,Fully Paid,0,1
1426404,3025.0,36 months,10.47,B,other,35000.0,27.64,0.0,1.0,ANY,...,9.0,0.0,2650.0,10.4,14.0,Individual,GA,Current,0,0
1799931,12000.0,36 months,16.78,C,debt_consolidation,75600.0,12.63,0.0,0.0,RENT,...,5.0,0.0,8773.0,98.6,9.0,Individual,NM,Fully Paid,0,1
1201524,30000.0,60 months,18.99,E,debt_consolidation,110000.0,6.57,0.0,0.0,MORTGAGE,...,9.0,0.0,13579.0,64.7,24.0,Individual,CA,Fully Paid,0,1
1125039,12000.0,36 months,11.44,B,major_purchase,55000.0,13.86,5.0,0.0,OWN,...,9.0,0.0,7853.0,73.4,14.0,Individual,MI,Fully Paid,0,1


##  Filter for loans with known outcomes only

In [10]:
# Keep only loans whose final outcome is known
resolved_statuses = ["Fully Paid", "Charged Off", "Default"]
has_known_outcome = df["loan_status"].isin(resolved_statuses)
# .copy() gives an independent frame so later column assignments do not
# write into a slice of the unfiltered data
df = df.loc[has_known_outcome].copy()


## Drop columns that are no longer needed

In [11]:
# Remove the raw status columns in a single call; errors="ignore" makes this a
# no-op for any column that has already been dropped.
df = df.drop(columns=["loan_status", "verification_status"], errors="ignore")

df.sample(6)

Unnamed: 0,loan_amnt,term,int_rate,grade,purpose,annual_inc,dti,delinq_2yrs,inq_last_6mths,home_ownership,...,earliest_cr_line,open_acc,pub_rec,revol_bal,revol_util,total_acc,application_type,addr_state,is_default,is_verified
392114,5000.0,36 months,7.49,A,debt_consolidation,160000.0,11.91,0.0,1.0,MORTGAGE,...,Jan-1998,13.0,0.0,2803.0,29.5,34.0,Individual,PA,1,0
1988702,35000.0,36 months,25.29,E,debt_consolidation,144000.0,12.61,0.0,3.0,RENT,...,Jul-2005,25.0,1.0,13965.0,34.6,42.0,Individual,MA,1,1
1804497,8000.0,36 months,10.64,B,home_improvement,37200.0,5.0,0.0,0.0,OWN,...,Nov-2004,6.0,0.0,8257.0,37.9,11.0,Individual,FL,0,1
7269,16000.0,36 months,6.24,A,debt_consolidation,87000.0,11.42,0.0,0.0,RENT,...,Jun-2006,8.0,0.0,5302.0,18.9,10.0,Individual,CA,0,0
39569,11200.0,36 months,14.48,C,debt_consolidation,28000.0,30.78,1.0,1.0,MORTGAGE,...,Feb-2004,10.0,1.0,2581.0,17.3,22.0,Individual,NY,0,0
1080126,13800.0,60 months,15.31,C,credit_card,62000.0,19.34,0.0,0.0,RENT,...,Apr-2006,16.0,0.0,18644.0,88.4,29.0,Individual,NV,0,0


In [12]:
# One-hot encode home_ownership (skipped when the column was already encoded)
if "home_ownership" in df.columns:
    # Step 1: restrict the rows to the three valid categories
    valid_home = df["home_ownership"].isin(["MORTGAGE", "RENT", "OWN"])
    df = df[valid_home]

    # Step 2: build 0/1 integer indicator columns named home_<CATEGORY>
    home_dummies = pd.get_dummies(
        df["home_ownership"], prefix="home", drop_first=False
    ).astype(int)

    # Step 3: swap the original column for the indicator columns
    df = pd.concat([df.drop("home_ownership", axis=1), home_dummies], axis=1)

# Step 4: make sure every existing home_* indicator is an integer, not a boolean
dummy_cols = [name for name in df.columns if name.startswith("home_")]
df[dummy_cols] = df[dummy_cols].astype(int)

# Step 5: remove indicator columns for unwanted categories if any are present
unwanted_home_cols = (
    "home_NONE",
    "home_OTHER",
    "home_ownership_NONE",
    "home_ownership_OTHER",
)
df = df.drop(columns=[name for name in unwanted_home_cols if name in df.columns])
df.sample(6)

Unnamed: 0,loan_amnt,term,int_rate,grade,purpose,annual_inc,dti,delinq_2yrs,inq_last_6mths,emp_length,...,revol_bal,revol_util,total_acc,application_type,addr_state,is_default,is_verified,home_MORTGAGE,home_OWN,home_RENT
300402,12650.0,36 months,5.93,A,major_purchase,134000.0,7.88,0.0,2.0,10+ years,...,9792.0,9.1,25.0,Individual,WA,0,1,1,0,0
2168654,14000.0,36 months,11.44,B,debt_consolidation,76000.0,20.67,5.0,0.0,5 years,...,389.0,77.8,22.0,Individual,VT,0,1,1,0,0
779462,20000.0,36 months,7.21,A,credit_card,65000.0,17.06,0.0,0.0,3 years,...,8889.0,27.7,15.0,Individual,VA,0,0,0,0,1
1105759,30000.0,60 months,13.99,C,credit_card,100000.0,11.58,1.0,0.0,6 years,...,7849.0,14.0,41.0,Individual,VA,0,1,0,0,1
2256053,5325.0,36 months,18.99,D,small_business,30000.0,4.44,0.0,2.0,6 years,...,1166.0,35.3,7.0,Individual,AZ,1,1,0,0,1
1237868,15000.0,36 months,11.67,B,debt_consolidation,65000.0,24.93,0.0,2.0,7 years,...,20563.0,43.6,28.0,Individual,AL,0,1,1,0,0


## application_type
Values: Mostly "Individual" or "Joint App".
Since this column has only two categories, we can use binary encoding (0 and 1):

In [13]:
# Map "Individual" to 0 and "Joint App" to 1.
# Series.map turns every value that is not a key of the dict into NaN, so
# mapping an already-encoded (numeric) column a second time would wipe it out.
# The dtype guard makes the cell safe to re-run.
application_type_codes = {"Individual": 0, "Joint App": 1}
if not pd.api.types.is_numeric_dtype(df["application_type"]):
    df["application_type"] = df["application_type"].map(application_type_codes)
df.sample(6)

Unnamed: 0,loan_amnt,term,int_rate,grade,purpose,annual_inc,dti,delinq_2yrs,inq_last_6mths,emp_length,...,revol_bal,revol_util,total_acc,application_type,addr_state,is_default,is_verified,home_MORTGAGE,home_OWN,home_RENT
893455,1000.0,36 months,16.02,C,other,30000.0,9.0,0.0,0.0,< 1 year,...,1482.0,21.9,4.0,0,CA,0,0,0,0,1
1174266,9175.0,36 months,9.17,B,credit_card,24336.0,32.64,0.0,0.0,< 1 year,...,7161.0,55.9,9.0,0,AL,1,1,0,0,1
2023670,20000.0,36 months,10.49,B,credit_card,65000.0,26.53,0.0,1.0,,...,58841.0,84.5,25.0,0,RI,0,0,0,1,0
1827012,6000.0,36 months,9.71,B,credit_card,46000.0,10.28,1.0,0.0,8 years,...,10385.0,43.3,26.0,0,SC,0,1,1,0,0
1855820,12000.0,36 months,10.16,B,debt_consolidation,80000.0,12.43,0.0,0.0,< 1 year,...,9812.0,53.6,15.0,0,NC,0,0,1,0,0
286811,12000.0,60 months,12.69,C,other,62000.0,16.03,0.0,0.0,9 years,...,23696.0,94.4,29.0,0,VA,0,1,1,0,0


### One-Hot Encode Top Categories in purpose

Keep the top 5 most frequent categories and group the rest as "other".

In [14]:
# Guarded like the home_ownership / grade cells so that re-running the cell
# after `purpose` has been encoded is a no-op instead of a KeyError.
if "purpose" in df.columns:
    # Step 1: Get top 5 most frequent purposes
    top_purposes = df["purpose"].value_counts().nlargest(5).index.tolist()

    # Step 2: Replace every purpose outside the top 5 (including missing
    # values) with "other", using vectorised isin/where
    is_top_purpose = df["purpose"].isin(top_purposes)
    df["purpose"] = df["purpose"].where(is_top_purpose, "other")

    # Step 3: One-hot encode with clear column names, e.g.
    # purpose_debt_consolidation, purpose_home_improvement, purpose_other
    purpose_dummies = pd.get_dummies(df["purpose"], prefix="purpose").astype(int)

    # Step 4: Drop original and concat new columns
    df = pd.concat([df.drop("purpose", axis=1), purpose_dummies], axis=1)

df.sample(6)


Unnamed: 0,loan_amnt,term,int_rate,grade,annual_inc,dti,delinq_2yrs,inq_last_6mths,emp_length,fico_range_low,...,is_default,is_verified,home_MORTGAGE,home_OWN,home_RENT,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_major_purchase,purpose_other
1678263,8000.0,36 months,11.39,B,62000.0,9.95,0.0,2.0,10+ years,660.0,...,0,1,0,0,1,0,1,0,0,0
795189,20000.0,36 months,13.56,C,72000.0,20.4,0.0,1.0,10+ years,685.0,...,0,1,1,0,0,0,1,0,0,0
150221,5000.0,36 months,9.17,B,75000.0,11.55,1.0,0.0,10+ years,680.0,...,0,1,0,0,1,0,1,0,0,0
1813889,9750.0,36 months,10.64,B,30000.0,10.21,1.0,0.0,10+ years,670.0,...,0,0,1,0,0,1,0,0,0,0
1778234,35000.0,36 months,10.99,B,80000.0,5.3,0.0,4.0,5 years,800.0,...,0,1,1,0,0,0,0,1,0,0
201075,15000.0,36 months,16.99,D,55000.0,8.58,0.0,0.0,10+ years,680.0,...,0,1,1,0,0,0,1,0,0,0


### Drop the `addr_state` column

In [15]:
# errors="ignore" keeps this cell safe to re-run once addr_state is already gone
df = df.drop("addr_state", axis=1, errors="ignore")
df.sample(5)


Unnamed: 0,loan_amnt,term,int_rate,grade,annual_inc,dti,delinq_2yrs,inq_last_6mths,emp_length,fico_range_low,...,is_default,is_verified,home_MORTGAGE,home_OWN,home_RENT,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_major_purchase,purpose_other
409361,20000.0,36 months,6.49,A,210000.0,10.61,1.0,0.0,2 years,715.0,...,0,1,1,0,0,1,0,0,0,0
90804,10000.0,36 months,11.53,B,37000.0,8.86,0.0,2.0,,660.0,...,0,1,0,0,1,0,1,0,0,0
1077797,14000.0,60 months,14.46,C,40000.0,20.25,1.0,3.0,6 years,670.0,...,1,1,0,0,1,0,1,0,0,0
1182606,19000.0,60 months,8.39,A,72000.0,20.48,1.0,0.0,6 years,715.0,...,0,1,1,0,0,1,0,0,0,0
747172,11875.0,60 months,21.97,E,45000.0,16.4,0.0,0.0,10+ years,670.0,...,0,0,1,0,0,0,1,0,0,0


### 🎯 Encode `grade` Feature
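The `grade` column represents loan quality (A–G). It could be mapped to ordered integers if you treat grade as ordinal (A best, G worst); in the cell below it is one-hot encoded into `grade_A` … `grade_G` indicator columns instead.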


In [16]:
# One-hot encode "grade" into 0/1 integer columns; skipped when the column
# no longer exists (e.g. the cell has already been run)
if "grade" in df.columns:
    # Indicator columns grade_<LETTER>, cast from bool to int (0/1)
    grade_dummies = pd.get_dummies(
        df["grade"], prefix="grade", drop_first=False
    ).astype(int)

    # Replace the original "grade" column with the indicator columns
    df = pd.concat([df.drop("grade", axis=1), grade_dummies], axis=1)


df.sample(6)


Unnamed: 0,loan_amnt,term,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,emp_length,fico_range_low,fico_range_high,...,purpose_home_improvement,purpose_major_purchase,purpose_other,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G
624437,34250.0,36 months,30.75,185640.0,9.62,0.0,0.0,9 years,700.0,704.0,...,0,0,0,0,0,0,0,0,1,0
1214706,6000.0,36 months,7.69,36000.0,29.19,0.0,0.0,1 year,720.0,724.0,...,0,0,0,1,0,0,0,0,0,0
1485466,15300.0,36 months,22.9,129500.0,13.44,0.0,0.0,1 year,720.0,724.0,...,0,0,0,0,0,0,0,1,0,0
1727933,15000.0,36 months,7.99,65000.0,9.53,6.0,0.0,10+ years,700.0,704.0,...,0,1,0,1,0,0,0,0,0,0
1041033,20000.0,36 months,13.67,50000.0,9.99,0.0,0.0,3 years,660.0,664.0,...,0,0,1,0,0,1,0,0,0,0
106189,23000.0,60 months,12.69,72000.0,9.85,0.0,0.0,7 years,690.0,694.0,...,0,0,0,0,0,1,0,0,0,0


### Clean term column ("36 months", "60 months")

In [17]:
# Clean the 'term' column: pull the first run of digits out of values such as
# "36 months" and store it as an integer (36)
term_digits = df["term"].astype(str).str.extract(r"(\d+)", expand=False)
df["term"] = term_digits.astype(int)

df.sample(5)


Unnamed: 0,loan_amnt,term,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,emp_length,fico_range_low,fico_range_high,...,purpose_home_improvement,purpose_major_purchase,purpose_other,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G
407725,20000.0,60,18.54,60000.0,6.02,0.0,0.0,< 1 year,675.0,679.0,...,0,0,0,0,0,0,0,1,0,0
1122784,5000.0,36,13.66,120000.0,5.21,0.0,0.0,6 years,705.0,709.0,...,0,0,1,0,0,1,0,0,0,0
1129880,24000.0,60,12.99,120000.0,12.22,0.0,2.0,7 years,785.0,789.0,...,0,0,0,0,0,1,0,0,0,0
1239140,9000.0,36,6.03,145000.0,15.42,0.0,0.0,10+ years,755.0,759.0,...,0,0,0,1,0,0,0,0,0,0
1229284,16750.0,60,16.99,42000.0,23.54,0.0,2.0,10+ years,660.0,664.0,...,0,0,0,0,0,0,1,0,0,0


### Clean emp_length column ("10+ years", "< 1 year", "n/a")

In [18]:
def clean_emp_length(val):
    """Convert a raw ``emp_length`` value into a whole number of years.

    Rules:
      * text containing "< 1"  -> 0
      * text containing "10+"  -> 10
      * text containing "n/a", or text with no digits (e.g. "nan", "None") -> -1
      * any other text -> the digits it contains read as one integer
        ("3 years" -> 3)
      * a value that is already numeric -> returned as ``int`` unchanged
        (NaN / infinity -> -1)

    The numeric pass-through keeps the function idempotent. Without it,
    re-applying the function to cleaned data turns the missing marker -1 into 1
    (the sign is discarded by the digit filter) and a float such as 3.0 into 30.

    Args:
        val: Raw cell value (string, number, NaN or None).

    Returns:
        int: Years of employment in 0..10 for recognised text, or -1 when unknown.
    """
    import math
    import numbers

    # Already-numeric input: keep the value instead of re-parsing its text form.
    # bool is excluded so True/False still take the text path (-> -1) as before.
    if isinstance(val, numbers.Real) and not isinstance(val, bool):
        if math.isnan(val) or math.isinf(val):
            return -1
        return int(val)

    text = str(val).lower()  # convert everything to lowercase string
    if "< 1" in text:
        return 0
    if "10+" in text:
        return 10
    if "n/a" in text:
        return -1
    digits = "".join(ch for ch in text if ch.isdigit())
    return int(digits) if digits else -1


# Run clean_emp_length over every value, then store the result as integers
emp_years = df["emp_length"].map(clean_emp_length)
df["emp_length"] = emp_years.astype(int)

df.head(6)


Unnamed: 0,loan_amnt,term,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,emp_length,fico_range_low,fico_range_high,...,purpose_home_improvement,purpose_major_purchase,purpose_other,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G
0,3600.0,36,13.99,55000.0,5.91,0.0,1.0,10,675.0,679.0,...,0,0,0,0,0,1,0,0,0,0
1,24700.0,36,11.99,65000.0,16.06,1.0,4.0,10,715.0,719.0,...,0,0,1,0,0,1,0,0,0,0
2,20000.0,60,10.78,63000.0,10.78,0.0,0.0,10,695.0,699.0,...,1,0,0,0,1,0,0,0,0,0
4,10400.0,60,22.45,104433.0,25.37,1.0,3.0,3,695.0,699.0,...,0,1,0,0,0,0,0,0,1,0
5,11950.0,36,13.44,34000.0,10.2,0.0,0.0,4,690.0,694.0,...,0,0,0,0,0,1,0,0,0,0
6,20000.0,36,9.17,180000.0,14.67,0.0,0.0,10,680.0,684.0,...,0,0,0,0,1,0,0,0,0,0


In [19]:
# Number of missing values per column; only columns with at least one are printed
missing_counts = df.isna().sum()
has_missing = missing_counts > 0
print(missing_counts[has_missing])


dti               374
inq_last_6mths      1
revol_util        854
dtype: int64


In [20]:
# Share of each is_default class (normalize=True returns proportions, not counts)
print(df["is_default"].value_counts(normalize=True))


is_default
0    0.800346
1    0.199654
Name: proportion, dtype: float64


In [21]:
# Preview the columns that are still object dtype, i.e. not yet converted
df.select_dtypes(include=["object"]).head()

Unnamed: 0,issue_d,earliest_cr_line
0,Dec-2015,Aug-2003
1,Dec-2015,Dec-1999
2,Dec-2015,Aug-2000
4,Dec-2015,Jun-1998
5,Dec-2015,Oct-1987


**Convert dates to date-time type**

In [22]:
# 4. Engineer credit history length
# Runs only while both raw date columns are still present, so the cell can be
# re-executed safely after they have been dropped.
if "issue_d" in df.columns and "earliest_cr_line" in df.columns:
    print("Engineering 'credit_history_length'...")
    # Both columns hold month-year text such as "Dec-2015". Passing the format
    # explicitly avoids pandas' "could not infer format" warning and the slow
    # element-by-element fallback parser.
    date_format = "%b-%Y"
    df["issue_d"] = pd.to_datetime(df["issue_d"], format=date_format)
    df["earliest_cr_line"] = pd.to_datetime(
        df["earliest_cr_line"], format=date_format
    )
    # Days between the earliest credit line and the loan issue date, divided by
    # 30 and rounded: an approximate length in months.
    df["credit_history_length"] = (
        (df["issue_d"] - df["earliest_cr_line"]).dt.days / 30
    ).round(0)
    # Drop original date columns now that we're done with them
    df = df.drop(columns=["issue_d", "earliest_cr_line"], errors="ignore")

df.head()


Engineering 'credit_history_length'...


  df["issue_d"] = pd.to_datetime(df["issue_d"])
  df["earliest_cr_line"] = pd.to_datetime(df["earliest_cr_line"])


Unnamed: 0,loan_amnt,term,int_rate,annual_inc,dti,delinq_2yrs,inq_last_6mths,emp_length,fico_range_low,fico_range_high,...,purpose_major_purchase,purpose_other,grade_A,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,credit_history_length
0,3600.0,36,13.99,55000.0,5.91,0.0,1.0,10,675.0,679.0,...,0,0,0,0,1,0,0,0,0,150.0
1,24700.0,36,11.99,65000.0,16.06,1.0,4.0,10,715.0,719.0,...,0,1,0,0,1,0,0,0,0,195.0
2,20000.0,60,10.78,63000.0,10.78,0.0,0.0,10,695.0,699.0,...,0,0,0,1,0,0,0,0,0,187.0
4,10400.0,60,22.45,104433.0,25.37,1.0,3.0,3,695.0,699.0,...,1,0,0,0,0,0,0,1,0,213.0
5,11950.0,36,13.44,34000.0,10.2,0.0,0.0,4,690.0,694.0,...,0,0,0,0,1,0,0,0,0,343.0


**Strip leading/trailing white spaces**

In [23]:
# Distribution of the cleaned emp_length values (-1 is the code clean_emp_length
# returns for missing / unparseable entries)
df["emp_length"].value_counts()

emp_length
 10    442075
 2     121712
 0     108008
 3     107558
 1      88468
 5      84112
 4      80525
-1      78492
 6      62706
 8      60689
 7      59606
 9      50921
Name: count, dtype: int64

### 📁 Cleaned dataset available here: [Download lending_club_cleaned.csv](https://drive.google.com/drive/folders/1qNO8Zt4Hla22DKdx6A3FxfRT9koHN-0A)
We can't upload it directly due to its size

## Missing Values