# P2P Lending Data Analysis
This notebook performs exploratory data analysis (EDA) on a cleaned peer-to-peer (P2P) lending dataset. Summary statistics and all generated figures are saved to the `figures/` directory.

In [None]:
import os
import warnings
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Output location for every figure and summary file written by this notebook.
# Created up front so later cells can save into it without checking.
FIGURES_DIR = "figures"
os.makedirs(FIGURES_DIR, exist_ok=True)

# Plot appearance: matplotlib defaults with seaborn's "husl" colour palette.
plt.style.use("default")
sns.set_palette("husl")

# NOTE: this silences *all* warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [8]:
# 1. Load Data
# Default location of the cleaned dataset (a relative path, so it is resolved
# against the kernel's working directory).
DEFAULT_DATA_PATH = "../../../1_datasets/processed_data/p2p_df_final_cleaned_1000.csv"


def load_data(path=DEFAULT_DATA_PATH):
    """Read the P2P lending dataset from a CSV file.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to ``DEFAULT_DATA_PATH`` so existing
        ``load_data()`` calls keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when ``path`` does not exist.
    """
    return pd.read_csv(path)

def overview(df, output_dir=None):
    """Print a textual overview of ``df`` and save its descriptive statistics.

    Prints, in order: the ``DataFrame.info()`` report, ``describe(include="all")``
    and the per-column null counts. Then writes ``df.describe()`` (default
    column selection, not ``include="all"``) to ``p2p_descriptive_stats.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    output_dir : str or path-like, optional
        Directory for the CSV. Defaults to the notebook-level ``FIGURES_DIR``.
        The directory is created if it does not exist.

    Returns
    -------
    None
    """
    if output_dir is None:
        output_dir = FIGURES_DIR

    print("\n--- Data Overview ---\n")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line after the report.
    df.info()
    print("\n--- Descriptive Statistics ---\n")
    print(df.describe(include="all"))
    print("\n--- Null Values ---\n")
    print(df.isnull().sum())

    # Make sure the target exists even when called outside the notebook flow.
    os.makedirs(output_dir, exist_ok=True)
    df.describe().to_csv(os.path.join(output_dir, "p2p_descriptive_stats.csv"))

In [4]:
def plot_numeric_distributions(df):
    """Save a histogram and a boxplot for every float64/int64 column of ``df``.

    For each selected column two PNG files are written into ``FIGURES_DIR``:
    ``<column>_hist.png`` (30-bin histogram with a KDE overlay) and
    ``<column>_box.png`` (horizontal boxplot). Each figure is closed after
    saving, so nothing is displayed inline.
    """
    for name in df.select_dtypes(include=["float64", "int64"]).columns:
        values = df[name]

        # Histogram with kernel density estimate.
        hist_fig, hist_ax = plt.subplots(figsize=(8, 4))
        sns.histplot(values, kde=True, bins=30, ax=hist_ax)
        hist_ax.set_title(f"Distribution of {name}")
        hist_fig.tight_layout()
        hist_fig.savefig(os.path.join(FIGURES_DIR, f"{name}_hist.png"), dpi=150)
        plt.close(hist_fig)

        # Boxplot of the same column.
        box_fig, box_ax = plt.subplots(figsize=(6, 4))
        sns.boxplot(x=values, ax=box_ax)
        box_ax.set_title(f"Boxplot of {name}")
        box_fig.tight_layout()
        box_fig.savefig(os.path.join(FIGURES_DIR, f"{name}_box.png"), dpi=150)
        plt.close(box_fig)

In [5]:
def plot_categorical_bars(df):
    """Save a count plot for each indicator/categorical column found in ``df``.

    The candidate columns are a fixed list of names plus every column whose
    name starts with ``purpose_``. Candidates missing from ``df`` are skipped.
    One ``<column>_bar.png`` is written into ``FIGURES_DIR`` per plotted
    column, and each figure is closed after saving.
    """
    fixed_candidates = [
        "grade_B",
        "grade_C",
        "grade_D",
        "grade_E",
        "grade_F",
        "grade_G",
        "home_ownership_other",
        "home_ownership_own",
        "home_ownership_rent",
        "verification_status_Source Verified",
        "verification_status_Verified",
        "initial_list_status_w",
        "application_type_Joint App",
        "loan_status_binary",
    ]
    purpose_candidates = [name for name in df.columns if name.startswith("purpose_")]

    # Keep only the candidates that actually exist, preserving their order.
    available = [name for name in fixed_candidates + purpose_candidates if name in df.columns]

    for name in available:
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.countplot(x=df[name], ax=ax)
        ax.set_title(f"Count of {name}")
        fig.tight_layout()
        fig.savefig(os.path.join(FIGURES_DIR, f"{name}_bar.png"), dpi=150)
        plt.close(fig)

In [6]:
def plot_correlation_heatmap(df):
    """Save a correlation heatmap of the float64/int64 columns of ``df``.

    The pairwise correlation matrix from ``DataFrame.corr()`` is drawn without
    cell annotations, with the colour scale centred at 0, and written to
    ``correlation_heatmap.png`` in ``FIGURES_DIR``. The figure is closed after
    saving.
    """
    numeric_frame = df.select_dtypes(include=["float64", "int64"])
    corr_matrix = numeric_frame.corr()

    fig, ax = plt.subplots(figsize=(14, 10))
    sns.heatmap(corr_matrix, annot=False, cmap="RdYlBu_r", center=0, ax=ax)
    ax.set_title("Correlation Heatmap (Numeric Features)")
    fig.tight_layout()

    target = os.path.join(FIGURES_DIR, "correlation_heatmap.png")
    fig.savefig(target, dpi=200)
    plt.close(fig)

In [9]:
# Run the full EDA: load the dataset once, then apply each reporting step to it
# in order (text overview first, then the three groups of saved figures).
df = load_data()

for _step in (
    overview,
    plot_numeric_distributions,
    plot_categorical_bars,
    plot_correlation_heatmap,
):
    _step(df)

print(f"\nAll figures and summary statistics saved to: {FIGURES_DIR}")


--- Data Overview ---

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 46 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            1000 non-null   int64  
 1   term                                 1000 non-null   int64  
 2   int_rate                             1000 non-null   float64
 3   installment                          1000 non-null   float64
 4   emp_length                           1000 non-null   int64  
 5   annual_inc                           1000 non-null   float64
 6   dti                                  1000 non-null   float64
 7   delinq_2yrs                          1000 non-null   int64  
 8   fico_range_low                       1000 non-null   int64  
 9   fico_range_high                      1000 non-null   int64  
 10  inq_last_6mths                       1000 non-null   int64  
 11  open_ac