In [6]:
import pandas as pd
import os

DATA_PATH = r"C:\Users\Muskan\OneDrive\Documents\credit.csv" 

if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"File not found: {DATA_PATH}")

df = pd.read_csv(DATA_PATH)
print("Dataset loaded. Shape:", df.shape)
df.head()


Dataset loaded. Shape: (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [7]:
# Drop columns with >60% missing values
threshold = 0.6
cols_to_drop = df.columns[df.isnull().mean() > threshold]
df = df.drop(columns=cols_to_drop)

# Fill remaining missing values
for col in df.columns:
    if df[col].isnull().sum() > 0:
        if df[col].dtype in ['int64', 'float64']:
            df[col] = df[col].median()
        else:
            df[col] = df[col].fillna("Unknown")

# Encode target column (loan_status)
df['loan_status'] = df['loan_status'].map({
    'Fully Paid': 0,
    'Charged Off': 1,
    0:0,
    1:1
})

# Feature engineering
if 'person_age' in df.columns:
    bins = [16,25,35,45,55,65,100]
    labels = ['16-24','25-34','35-44','45-54','55-64','65+']
    df['age_group'] = pd.cut(df['person_age'], bins=bins, labels=labels)

if 'person_income' in df.columns:
    df['income_k'] = (df['person_income']/1000).round(1)

# Encode categorical features including age_group
from sklearn.preprocessing import LabelEncoder

cat_cols = df.select_dtypes(include=['object']).columns.tolist()
cat_cols = [c for c in cat_cols if c != 'loan_status']  # exclude target

le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le  # save encoder if needed later


In [8]:
from sklearn.model_selection import train_test_split

y = df['loan_status']
X = df.drop(columns=['loan_status'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (26064, 13) Test shape: (6517, 13)


In [9]:
import sqlite3

conn = sqlite3.connect("credit_risk.db")
df.to_sql('credit_data', conn, if_exists='replace', index=False)
conn.commit()
conn.close()


In [5]:
import os

# Set save path in your Documents folder
save_path = os.path.expanduser("~/Documents/clean_credit_data.csv")

# Save the cleaned dataframe
df.to_csv(save_path, index=False)

print("Cleaned dataset saved at:", save_path)


Cleaned dataset saved at: C:\Users\Muskan/Documents/clean_credit_data.csv


In [10]:
credit_risk = df

In [11]:
df

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group,income_k
0,22,59000,3,4.0,4,3,35000,10.99,1,0.59,1,3,16-24,59.0
1,21,9600,2,4.0,1,1,1000,10.99,0,0.10,0,2,16-24,9.6
2,25,9600,0,4.0,3,2,5500,10.99,1,0.57,0,3,16-24,9.6
3,23,65500,3,4.0,3,2,35000,10.99,1,0.53,0,2,16-24,65.5
4,24,54400,3,4.0,3,2,35000,10.99,1,0.55,1,4,16-24,54.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,0,4.0,4,2,5800,10.99,0,0.11,0,30,55-64,53.0
32577,54,120000,0,4.0,4,0,17625,10.99,0,0.15,0,19,45-54,120.0
32578,65,76000,3,4.0,2,1,35000,10.99,1,0.46,0,28,55-64,76.0
32579,56,150000,0,4.0,4,1,15000,10.99,0,0.10,0,26,55-64,150.0


In [13]:
credit_risk.to_csv("credit_risk.csv", index=False)
print("credit_risk.csv saved successfully!")

credit_risk.csv saved successfully!
