In [1]:
# Install required packages
!pip install --upgrade pip -q
!pip install pandas -q
!pip install numpy -q
!pip install scikit-learn -q

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: To modify pip, please run the following command:
C:\Python313\python.exe -m pip install --upgrade pip -q

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Read the raw loan CSV from its location on disk into the working DataFrame
raw_csv_path = r"C:\Users\karis\OneDrive\Technocolab\Prosper_Dataset\Prosper_Dataset\prosperLoanData.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# =========================
# 1. Data Cleaning
# =========================

# Keep every loan whose status is anything other than 'Cancelled'
is_cancelled = df['LoanStatus'] == 'Cancelled'
df = df[~is_cancelled]

# Retain only the columns whose share of missing values is below 40%
missing_threshold = 0.4
missing_share = df.isnull().mean()
keep_column = missing_share < missing_threshold
df = df.loc[:, keep_column]

# Fill missing values: median for numeric columns, mode for everything else
for col in df.columns:
    if not df[col].isnull().any():
        continue  # no gaps in this column, nothing to impute

    # Any numeric dtype qualifies (not just float64/int64). Booleans are
    # excluded so they keep being imputed with the mode, like categories.
    is_numeric = (
        pd.api.types.is_numeric_dtype(df[col])
        and not pd.api.types.is_bool_dtype(df[col])
    )
    if is_numeric:
        df[col] = df[col].fillna(df[col].median())
    else:
        modes = df[col].mode()
        if modes.empty:
            # The column has no observed value at all, so there is nothing to
            # fill with; leave it untouched instead of raising on modes[0].
            continue
        df[col] = df[col].fillna(modes.iloc[0])

# Remove duplicate rows
df = df.drop_duplicates()

# Normalise column labels: trim surrounding whitespace, lowercase, turn spaces
# into underscores, then strip every character that is not a letter, digit or
# underscore
normalised_names = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_')
    .str.replace('[^a-zA-Z0-9_]', '', regex=True)
)
df.columns = normalised_names

# Convert date columns to datetime.
# Candidates are picked by name ('date', 'month' or 'year' in the label), but
# numeric columns are excluded: pd.to_datetime accepts plain numbers and reads
# them as offsets from the Unix epoch, which would silently turn amounts or
# counts whose name merely contains 'month'/'year' into 1970 timestamps.
date_cols = [
    col for col in df.columns
    if ('date' in col or 'month' in col or 'year' in col)
    and not pd.api.types.is_numeric_dtype(df[col])
]
for col in date_cols:
    try:
        df[col] = pd.to_datetime(df[col])
    except (ValueError, TypeError):
        # Best effort: a column that cannot be parsed as dates keeps its
        # original values. Only parsing failures are swallowed, so unrelated
        # errors and interrupts still surface.
        continue

In [4]:
# =========================
# 2. Target Variable Creation
# =========================

# Statuses treated as risky: charged-off, defaulted, and the past-due buckets
# listed below (31 days and longer).
# NOTE(review): no past-due bucket shorter than 31 days is listed, so rows with
# such a status (if the data contains any) are labelled 0 - confirm this is
# intended.
default_statuses = [
    'Chargedoff', 'Defaulted',
    'Past Due (31-60 days)', 'Past Due (61-90 days)',
    'Past Due (91-120 days)', 'Past Due (>120 days)'
]

# Create binary target variable (1: risky, 0: non-risky).
# Vectorised membership test instead of a per-row Python lambda.
df['loan_status_binary'] = df['loanstatus'].isin(default_statuses).astype('int64')

# Drop the original loanstatus column (reassignment rather than inplace)
df = df.drop(columns=['loanstatus'])

In [5]:
# =========================
# 3. Data Encoding
# =========================

# Text columns with exactly two distinct values are replaced by the integer
# codes a LabelEncoder assigns to them
label_encoder = LabelEncoder()
object_cols = df.select_dtypes(include='object').columns
two_level_cols = [name for name in object_cols if df[name].nunique() == 2]
for name in two_level_cols:
    df[name] = label_encoder.fit_transform(df[name])

# Text columns that are still present and have at most 20 distinct values are
# expanded into dummy columns; the first level of each is dropped
cat_cols = [
    name
    for name in df.select_dtypes(include='object').columns
    if df[name].nunique() <= 20
]
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [6]:
# =========================
# 4. Handling Outliers
# =========================

# Clip outliers for numeric columns at 1st and 99th percentiles
numeric_cols = df.select_dtypes(include=np.number).columns
for col in numeric_cols:
    # Columns with at most two distinct values (the 0/1 target, label-encoded
    # flags, any indicator stored as a number) are left alone: when one class
    # covers less than 1% of rows, both percentiles land on the majority value
    # and clipping would turn the column into a constant.
    if df[col].nunique() <= 2:
        continue
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = df[col].clip(lower=lower, upper=upper)

In [7]:
# =========================
# 5. Save Cleaned Dataset
# =========================

# Write the processed frame to a CSV file in the working directory, leaving the
# index out, then report completion
df.to_csv(path_or_buf="processed_data.csv", index=False)
print("✅ Preprocessing complete. Dataset saved as 'processed_data.csv'")

✅ Preprocessing complete. Dataset saved as 'processed_data.csv'
