In [2]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
import warnings
warnings.filterwarnings('ignore')

def _encode_labels(values, mapping):
    """Map a label column through ``mapping``, refusing labels it does not know.

    ``Series.map`` with a dict turns every value that is absent from the dict
    into NaN without any warning. This helper keeps that behaviour for values
    that were already missing, but raises when a non-missing label has no
    entry in ``mapping`` so typos or new categories are noticed immediately.

    Args:
        values: pandas Series holding the raw labels.
        mapping: dict from raw label to its numeric code.

    Returns:
        A Series of the mapped codes, aligned with ``values``.

    Raises:
        ValueError: if ``values`` contains a non-missing label not in ``mapping``.
    """
    encoded = values.map(mapping)
    # Rows that had a label before mapping but have none afterwards.
    unknown = values[values.notna() & encoded.isna()]
    if not unknown.empty:
        raise ValueError(
            f"Column {values.name!r} has labels missing from the mapping: "
            f"{sorted(unknown.astype(str).unique())}"
        )
    return encoded


# Load the data
df = pd.read_csv('Loan_default.csv')

# Process the data
# The loan identifier is not a predictor; ignore the drop if it is absent.
df = df.drop(columns=["LoanID"], errors='ignore')

# Yes/No flags become 1/0.
yes_no_map = {'Yes': 1, 'No': 0}
for flag_column in ("HasMortgage", "HasDependents", "HasCoSigner"):
    df[flag_column] = _encode_labels(df[flag_column], yes_no_map)

# Education is encoded as an ordered integer; surrounding whitespace is
# stripped first so padded labels still match the mapping keys.
edu_map = {'High School': 0, "Bachelor's": 1, "Master's": 2, 'PhD': 3}
df["Education"] = _encode_labels(df["Education"].str.strip(), edu_map)

# Remaining categorical columns are one-hot encoded as integer 0/1 columns.
df = pd.get_dummies(df, columns=['EmploymentType', 'MaritalStatus', 'LoanPurpose'], dtype=int)

# Split the data
# Separate the label from the predictors and remember the predictor column
# order so raw rows can later be labelled with the same names.
target = df["Default"]
inputs = df.drop(columns="Default")
feature_names = inputs.columns.tolist()

# Hold out 40% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    inputs,
    target,
    test_size=0.4,
    random_state=22,
)

# Apply RandomOverSampler
# Only the training split is resampled; the test split is left untouched.
ros = RandomOverSampler(random_state=42)
X_train_resampled, Y_train_resampled = ros.fit_resample(X_train, Y_train)

# Scaling
# The scaler is fitted on the resampled training data and then applied,
# unchanged, to both the training and the test features.
scaler = StandardScaler().fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# 1. RandomForestClassifier (for high accuracy)
rf_params = {
    'n_estimators': 50,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,  # use every available core
}
rf = RandomForestClassifier(**rf_params).fit(X_train_scaled, Y_train_resampled)

# Prediction and evaluation for RandomForest
y_pred_rf = rf.predict(X_test_scaled)
# Column 1 of predict_proba is used as the score for the positive class.
y_probs_rf = rf.predict_proba(X_test_scaled)[:, 1]

# Compute every metric first, then print them together.
rf_report = classification_report(Y_test, y_pred_rf)
rf_confusion = confusion_matrix(Y_test, y_pred_rf)
rf_auc = roc_auc_score(Y_test, y_probs_rf)

print("RandomForestClassifier (High Accuracy Model)")
print("Classification Report:\n", rf_report)
print("Confusion Matrix:\n", rf_confusion)
print("AUC-ROC:", rf_auc)
# Two hand-written applicant rows used to demonstrate predict_loan.
# Each row holds 25 values and is matched to feature_names purely by
# position when the DataFrame is built, so the order must follow the
# training columns exactly.
# NOTE(review): what each position means depends on the CSV column order and
# on the get_dummies output, neither of which is visible here — confirm
# against feature_names before editing these values.
new_features = [
    [56, 85994, 50587, 520, 80, 4, 15.23, 36, 0.44, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1],
    [46, 84208, 129188, 451, 26, 3, 21.17, 24, 0.31, 2, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0]
]
def predict_loan(new_data, model, scaler, feature_names, threshold=0.5):
    """Score raw applicant rows and label each as default / no default.

    Args:
        new_data: Row-oriented records (for example a list of lists) with one
            value per entry of ``feature_names``, in the same order. May be
            empty.
        model: Fitted classifier exposing ``predict_proba``; column 1 of its
            output is used as the probability of default.
        scaler: Fitted transformer exposing ``transform``; it receives the
            rows as a DataFrame labelled with ``feature_names``.
        feature_names: Column labels for the rows in ``new_data``.
        threshold: Probabilities greater than or equal to this value are
            labelled as a default.

    Returns:
        A list with one dict per input row, each holding ``'prediction'``
        (a human-readable label) and ``'probability'`` (a built-in float).
        An empty list is returned when ``new_data`` contains no rows.
    """
    new_data_df = pd.DataFrame(new_data, columns=feature_names)
    if len(new_data_df) == 0:
        # Nothing to score; transformers typically reject zero-row input,
        # so return early instead of letting the call fail.
        return []

    new_data_scaled = scaler.transform(new_data_df)
    probabilities = model.predict_proba(new_data_scaled)[:, 1]
    predictions = (probabilities >= threshold).astype(int)
    result_map = {0: 'No (Will not default)', 1: 'Yes (Will default)'}
    # Convert NumPy scalars to built-in types so the result can be
    # serialised (e.g. to JSON) regardless of the model's output dtype.
    return [
        {'prediction': result_map[int(pred)], 'probability': float(prob)}
        for pred, prob in zip(predictions, probabilities)
    ]

# Score the sample applicants with the trained random forest and print one
# line per applicant, numbered from 1.
print("\nRandomForestClassifier Predictions for New Data:")
rf_predictions = predict_loan(
    new_data=new_features,
    model=rf,
    scaler=scaler,
    feature_names=feature_names,
    threshold=0.5,
)
for i, pred in enumerate(rf_predictions):
    summary_line = f"Sample {i+1}: {pred['prediction']}, Probability: {pred['probability']:.3f}"
    print(summary_line)



RandomForestClassifier (High Accuracy Model)
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.99      0.94     90246
           1       0.51      0.08      0.13     11893

    accuracy                           0.88    102139
   macro avg       0.70      0.53      0.53    102139
weighted avg       0.85      0.88      0.84    102139

Confusion Matrix:
 [[89382   864]
 [10996   897]]
AUC-ROC: 0.7263342632224781

RandomForestClassifier Predictions for New Data:
Sample 1: No (Will not default), Probability: 0.040
Sample 2: Yes (Will default), Probability: 1.000
