In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    roc_curve, 
    roc_auc_score
)

In [3]:
def clean_and_preprocess_data(df):
    """
    Build the model-ready feature matrix and binary target from raw loan data.

    Only the target column and the fixed list of features below are read
    from ``df``; the input frame itself is never modified and is never
    copied as a whole.

    Steps:
    1. Discard any selected feature whose missing fraction in ``df``
       exceeds 50%.
    2. Map ``loan_status`` to 1 ('Fully Paid') / 0 ('Charged Off') and keep
       only rows with one of those two statuses.
    3. Coerce non-numeric feature columns (e.g. ``'13.5%'``) to numbers;
       values that cannot be parsed become missing.
    4. Fill the remaining gaps with the per-column median.

    Parameters:
    -----------
    df : pandas DataFrame
        Raw Lending Club dataset

    Returns:
    --------
    tuple: (processed features, target variable)
        Both objects carry the index labels of the retained rows of ``df``,
        so they stay aligned with each other and with the raw data.

    Raises:
    -------
    KeyError
        If ``loan_status`` or one of the selected feature columns is absent.
    ValueError
        If no usable rows or no usable feature columns remain.
    """
    target_col = 'loan_status'
    missing_threshold = 0.5

    # NOTE(review): 'total_pymnt' is only known after a loan has been
    # (partly) repaid, so it probably leaks the outcome into the features.
    # It is kept to preserve the existing feature set - confirm intent.
    features_to_use = [
        'loan_amnt', 'funded_amnt', 'int_rate', 'installment',
        'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
        'open_acc', 'pub_rec', 'revol_bal', 'revol_util',
        'total_acc', 'total_pymnt'
    ]

    # Fail early, with the full list, instead of on the first missing column
    absent = [c for c in [target_col] + features_to_use if c not in df.columns]
    if absent:
        raise KeyError(f"Required columns missing from input data: {absent}")

    # Apply the sparsity rule to the selected features only. Measured on all
    # rows of the raw frame, before any row filtering.
    too_sparse = df[features_to_use].isnull().mean() > missing_threshold
    kept_features = [c for c in features_to_use if not too_sparse[c]]

    # Convert loan status to a binary target; any other status maps to NaN
    y = df[target_col].map({
        'Fully Paid': 1,  # Successful loan
        'Charged Off': 0  # Defaulted loan
    })
    labelled = y.notna().to_numpy()
    y = y[labelled]

    # New frame holding just the rows/columns that are needed
    X = df.loc[labelled, kept_features]

    if X.empty:
        raise ValueError(
            "No usable data: need at least one row with loan_status "
            "'Fully Paid' or 'Charged Off' and at least one feature column "
            "with no more than 50% missing values."
        )

    # Median imputation needs numeric input. Text columns such as '13.5%'
    # are parsed here; this replaces the former post-imputation label
    # encoding, which could never run because the imputer output is all float.
    text_cols = X.select_dtypes(exclude=['number']).columns
    if len(text_cols) > 0:
        X = X.assign(**{
            col: pd.to_numeric(
                X[col].astype(str).str.strip().str.rstrip('%'),
                errors='coerce'
            )
            for col in text_cols
        })

    # Impute missing values.
    # NOTE(review): the medians are learned from every retained row, i.e.
    # before any train/test split performed by the caller.
    imputer = SimpleImputer(strategy='median')
    X_imputed = pd.DataFrame(
        imputer.fit_transform(X),
        columns=X.columns,
        index=X.index  # keep X and y aligned by label, not only by position
    )

    return X_imputed, y


In [4]:

def train_random_forest(X, y, test_size=0.2, random_state=42,
                        n_estimators=100, stratify=False):
    """
    Split the data, standardise the training part and fit a Random Forest.

    Parameters:
    -----------
    X : DataFrame
        Feature matrix
    y : Series
        Target variable
    test_size : float, default 0.2
        Fraction of rows held out for testing
    random_state : int, default 42
        Seed used for both the split and the forest
    n_estimators : int, default 100
        Number of trees in the forest
    stratify : bool, default False
        When True, the split preserves the class proportions of ``y``.
        Off by default so that existing results are unchanged.

    Returns:
    --------
    tuple: (rf_classifier, X_test, y_test, scaler)
        rf_classifier is fitted on the SCALED training features.
        X_test is returned UNSCALED: apply ``scaler.transform(X_test)``
        before passing it to the model.
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None
    )

    # Scale features. The scaler is fitted on the training rows only and is
    # returned so the caller can apply the same transform to X_test.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Train Random Forest
    rf_classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=-1  # use all available cores
    )
    rf_classifier.fit(X_train_scaled, y_train)

    return rf_classifier, X_test, y_test, scaler

def evaluate_model(model, X_test, y_test, feature_names=None):
    """
    Evaluate a fitted Random Forest model: print a classification report,
    draw a three-panel figure (confusion matrix, feature importance, ROC
    curve) and return summary metrics.

    Parameters:
    -----------
    model : RandomForestClassifier
        Trained Random Forest model
    X_test : array-like
        Test feature matrix, already transformed the same way as the
        training data
    y_test : array-like
        Test target variable
    feature_names : sequence of str, optional
        Labels for the feature-importance chart. When omitted they are taken
        from ``X_test.columns`` if present, otherwise from
        ``model.feature_names_in_`` if present, otherwise generic
        ``feature_<i>`` labels are used.

    Returns:
    --------
    dict with keys 'accuracy' and 'auc'

    Raises:
    -------
    ValueError
        If the number of feature names differs from the number of
        importances reported by the model.
    """
    importances = model.feature_importances_

    # Resolve labels locally; this function must not rely on a variable
    # named X existing in some outer scope.
    if feature_names is None:
        feature_names = getattr(X_test, 'columns', None)
    if feature_names is None:
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = [f'feature_{i}' for i in range(len(importances))]
    feature_names = list(feature_names)
    if len(feature_names) != len(importances):
        raise ValueError(
            f"Got {len(feature_names)} feature names for "
            f"{len(importances)} model features."
        )

    # A model fitted on a plain array warns when given a labelled frame,
    # so hand it the bare values in that case.
    X_eval = X_test
    if hasattr(X_test, 'columns') and not hasattr(model, 'feature_names_in_'):
        X_eval = np.asarray(X_test)

    # Predictions
    y_pred = model.predict(X_eval)
    # Column 1 is used as the score of the positive class
    y_pred_proba = model.predict_proba(X_eval)[:, 1]

    # Metrics
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)

    # ROC Curve and AUC
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc = roc_auc_score(y_test, y_pred_proba)

    # Visualizations
    fig, (ax_cm, ax_imp, ax_roc) = plt.subplots(1, 3, figsize=(15, 5))

    # Confusion Matrix (rows: actual class, columns: predicted class)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
    ax_cm.set_title('Confusion Matrix')
    ax_cm.set_xlabel('Predicted label')
    ax_cm.set_ylabel('True label')

    # Feature Importance, sorted ascending so the largest bar is on top
    order = np.argsort(importances)
    positions = np.arange(len(order))
    ax_imp.barh(positions, importances[order])
    ax_imp.set_yticks(positions)
    ax_imp.set_yticklabels([feature_names[i] for i in order])
    ax_imp.set_title('Feature Importance')

    # ROC Curve
    ax_roc.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.2f})')
    ax_roc.plot([0, 1], [0, 1], linestyle='--', label='Random Classifier')
    ax_roc.set_title('Receiver Operating Characteristic (ROC) Curve')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.legend()

    fig.tight_layout()
    plt.show()

    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'auc': auc
    }


In [6]:

# Main Execution
def main(data_path='C:/Users/NANAYAW/OneDrive/Documents/GitHub/FinalProject/LendingClub_2007_to_2018.csv'):
    """
    Load the raw Lending Club CSV and return it as a DataFrame.

    NOTE(review): a later cell defines another function that is also called
    ``main``; once that cell runs, it replaces this one. Rename or delete
    one of the two so the notebook has a single entry point.

    Parameters:
    -----------
    data_path : str or path-like
        Location of the CSV file. Defaults to the path originally
        hard-coded here; pass a different one when running elsewhere.

    Returns:
    --------
    pandas DataFrame with the raw file contents
    """
    # low_memory=False makes pandas read each column in one pass, which
    # avoids mixed-dtype chunk inference at the cost of more memory
    df = pd.read_csv(data_path, low_memory=False)
    return df
    

In [8]:

# Main Execution
def main(data_path='C:/Users/NANAYAW/OneDrive/Documents/GitHub/FinalProject/LendingClub_2007_to_2018.csv'):
    """
    Run the full workflow: load the CSV, preprocess, train, evaluate and
    print a short performance summary.

    Parameters:
    -----------
    data_path : str or path-like
        Location of the Lending Club CSV. Defaults to the path originally
        hard-coded here; pass a different one when running elsewhere.

    Returns:
    --------
    dict with keys 'accuracy' and 'auc', as produced by evaluate_model

    Raises:
    -------
    PermissionError
        If the operating system refuses read access to ``data_path``.
    """
    # Load the data
    try:
        df = pd.read_csv(data_path, low_memory=False)
    except PermissionError as exc:
        # Same exception type and errno, but with hints on the usual causes
        raise PermissionError(
            exc.errno,
            "Permission denied while reading the dataset. Close any program "
            "that has the file open, make sure the path points to a file "
            "rather than a folder, and check that the file is readable",
            str(data_path)
        ) from exc

    # Clean and preprocess data
    X, y = clean_and_preprocess_data(df)

    # Train the model
    model, X_test, y_test, scaler = train_random_forest(X, y)

    # The held-out features come back unscaled. Apply the fitted scaler and
    # keep the column labels and row index on the result.
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )

    # Evaluate the model
    results = evaluate_model(model, X_test_scaled, y_test)

    print("\nModel Performance Summary:")
    print(f"Accuracy: {results['accuracy']:.2%}")
    print(f"AUC Score: {results['auc']:.4f}")

    return results

# Entry point: execute the workflow when this code runs as the top-level
# module (which is also the case inside a notebook kernel).
if __name__ == "__main__":
    main()

PermissionError: [Errno 13] Permission denied