In [112]:
import pandas as pd

# Read the loan dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='data5.csv')

# Preview the first five rows of the loaded frame
df.head(n=5)


Unnamed: 0,Age,AnnualIncome,CreditScore,EmploymentStatus,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,BankruptcyHistory,PaymentHistory,TotalAssets,TotalLiabilities,MonthlyIncome,NetWorth,MonthlyLoanPayment,RiskScorePercentage
0,45,39948,617,Employed,13152,48,Married,2,Own,0,29,146111,19183,3329.0,126928,419.805992,53.333333
1,38,39709,628,Employed,26045,48,Single,1,Mortgage,0,21,53204,9595,3309.083333,43609,794.054238,53.333333
2,47,40724,570,Employed,17627,36,Married,2,Rent,0,20,25176,128874,3393.666667,5205,666.406688,66.666667
3,58,69084,545,Employed,37898,96,Single,1,Mortgage,0,27,104822,5370,5757.0,99452,1047.50698,50.0
4,37,103264,594,Employed,9184,36,Married,1,Mortgage,0,26,244305,17286,8605.333333,227019,330.179141,40.0


In [113]:
# Impute missing numeric values with each column's median.
# 'number' selects every numeric dtype (int32, float32, ...), not only
# int64/float64, so no numeric column is silently left with gaps.
for col in df.select_dtypes(include='number').columns:
    df[col] = df[col].fillna(df[col].median())

# Impute missing categorical values with each column's most frequent value.
for col in df.select_dtypes(include=['object']).columns:
    col_modes = df[col].mode()
    # mode() is empty when the whole column is missing; there is nothing to
    # fill with in that case, so the column is left as-is instead of raising.
    if not col_modes.empty:
        df[col] = df[col].fillna(col_modes.iloc[0])

# Count the missing values remaining in each column
df.isnull().sum()


Age                    0
AnnualIncome           0
CreditScore            0
EmploymentStatus       0
LoanAmount             0
LoanDuration           0
MaritalStatus          0
NumberOfDependents     0
HomeOwnershipStatus    0
BankruptcyHistory      0
PaymentHistory         0
TotalAssets            0
TotalLiabilities       0
MonthlyIncome          0
NetWorth               0
MonthlyLoanPayment     0
RiskScorePercentage    0
dtype: int64

In [114]:
# Create new features.
# A zero denominator would produce inf (or NaN for 0/0). scikit-learn estimators
# reject inf, so rows with a zero denominator are marked missing (NaN) instead;
# the imputation that runs again before one-hot encoding then fills them.
df['DebtToIncomeRatio'] = (
    df['MonthlyLoanPayment'] / df['MonthlyIncome']
).where(df['MonthlyIncome'] != 0)
df['LoanToNetWorthRatio'] = (
    df['LoanAmount'] / df['NetWorth']
).where(df['NetWorth'] != 0)

# Check new features
df.head()


Unnamed: 0,Age,AnnualIncome,CreditScore,EmploymentStatus,LoanAmount,LoanDuration,MaritalStatus,NumberOfDependents,HomeOwnershipStatus,BankruptcyHistory,PaymentHistory,TotalAssets,TotalLiabilities,MonthlyIncome,NetWorth,MonthlyLoanPayment,RiskScorePercentage,DebtToIncomeRatio,LoanToNetWorthRatio
0,45,39948,617,Employed,13152,48,Married,2,Own,0,29,146111,19183,3329.0,126928,419.805992,53.333333,0.126106,0.103618
1,38,39709,628,Employed,26045,48,Single,1,Mortgage,0,21,53204,9595,3309.083333,43609,794.054238,53.333333,0.239962,0.597239
2,47,40724,570,Employed,17627,36,Married,2,Rent,0,20,25176,128874,3393.666667,5205,666.406688,66.666667,0.196368,3.386551
3,58,69084,545,Employed,37898,96,Single,1,Mortgage,0,27,104822,5370,5757.0,99452,1047.50698,50.0,0.181954,0.381068
4,37,103264,594,Employed,9184,36,Married,1,Mortgage,0,26,244305,17286,8605.333333,227019,330.179141,40.0,0.038369,0.040455


In [115]:
import pandas as pd

# Requires the DataFrame 'df' built by the earlier cells.
# Imputation is repeated here in case the engineered ratio columns introduced
# missing values after the first pass.
# 'number' selects every numeric dtype, not only int64/float64.
for col in df.select_dtypes(include='number').columns:
    df[col] = df[col].fillna(df[col].median())

for col in df.select_dtypes(include=['object']).columns:
    col_modes = df[col].mode()
    # mode() is empty when the whole column is missing; skip instead of raising.
    if not col_modes.empty:
        df[col] = df[col].fillna(col_modes.iloc[0])

# Encode categorical variables using get_dummies (one-hot encoding);
# drop_first=True drops the first level of each categorical column.
df_encoded = pd.get_dummies(df, drop_first=True)


In [116]:
from sklearn.model_selection import train_test_split

# 'RiskScorePercentage' is the regression target; every other encoded column is a predictor
y = df_encoded['RiskScorePercentage']
X = df_encoded.drop('RiskScorePercentage', axis=1)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



In [117]:
# Import necessary libraries
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate regressors, keyed by the label used in the report below
models = {}
models['Linear Regression'] = LinearRegression()
models['Ridge Regression'] = Ridge()
models['Lasso Regression'] = Lasso()
models['Random Forest'] = RandomForestRegressor(n_estimators=100, random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=100, random_state=42)

# Test-set metrics per model label
results = {}

# Fit every candidate on the training split and score it on the held-out split
for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    results[name] = {'Mean Squared Error': mse, 'R-squared': r2}

# Report: label, the two metrics rounded to two decimals, then a blank line
for model_name, metrics in results.items():
    print(
        f"{model_name}:",
        f"  Mean Squared Error: {metrics['Mean Squared Error']:.2f}",
        f"  R-squared: {metrics['R-squared']:.2f}",
        "",
        sep="\n",
    )




Linear Regression:
  Mean Squared Error: 19.55
  R-squared: 0.84

Ridge Regression:
  Mean Squared Error: 19.55
  R-squared: 0.84

Lasso Regression:
  Mean Squared Error: 60.64
  R-squared: 0.49

Random Forest:
  Mean Squared Error: 0.79
  R-squared: 0.99

Gradient Boosting:
  Mean Squared Error: 0.32
  R-squared: 1.00



In [118]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Build the Gradient Boosting regressor used for the predictions further down,
# then fit it on the training split
gb_settings = {'n_estimators': 100, 'random_state': 42}
gb_model = GradientBoostingRegressor(**gb_settings)
gb_model.fit(X=X_train, y=y_train)

In [133]:
def predict_risk():
    """Print the risk score that ``gb_model`` predicts for one hard-coded applicant.

    Builds a single-row DataFrame from the values below, passes it to the
    module-level ``gb_model.predict`` and prints the first prediction with two
    decimals. Takes no arguments and returns None.
    """
    # Predefined inputs for risk assessment.
    # NOTE(review): AnnualIncome (13500) and MonthlyIncome (87000) do not agree
    # with each other, and NetWorth differs from TotalAssets - TotalLiabilities;
    # confirm these example values are intended.
    Age = 70
    AnnualIncome = 13500
    CreditScore = 536
    LoanAmount = 670000
    LoanDuration = 3  # in months
    NumberOfDependents = 0
    TotalAssets = 1000000
    TotalLiabilities = 1000
    MonthlyIncome = 87000
    NetWorth = 150000
    MonthlyLoanPayment = 0
    BankruptcyHistory = 0  # No bankruptcy

    # Engineered ratios, using the same formulas as the feature-engineering cell
    DebtToIncomeRatio = MonthlyLoanPayment / MonthlyIncome
    LoanToNetWorthRatio = LoanAmount / NetWorth

    # One-hot encoded categorical variables (1 = category applies, 0 = it does not)
    EmploymentStatus_Self_Employed = 0  # Not self-employed
    EmploymentStatus_Unemployed = 0     # Not unemployed
    MaritalStatus_Married = 0           # Not married
    MaritalStatus_Single = 1            # Single
    MaritalStatus_Widowed = 0           # Not widowed
    HomeOwnershipStatus_Other = 1       # 'Other' home-ownership category
    HomeOwnershipStatus_Own = 0         # Does not own
    HomeOwnershipStatus_Rent = 0        # Not renting

    # Construct the input DataFrame; the keys are listed in the same order as
    # the fitted model's feature names. Each key appears exactly once (the
    # original listed 'BankruptcyHistory' twice, the later entry overriding
    # the earlier one).
    user_input = pd.DataFrame({
        'Age': [Age],
        'AnnualIncome': [AnnualIncome],
        'CreditScore': [CreditScore],
        'LoanAmount': [LoanAmount],
        'LoanDuration': [LoanDuration],
        'NumberOfDependents': [NumberOfDependents],
        'BankruptcyHistory': [BankruptcyHistory],
        # NOTE(review): hard-coded to 1; the training rows shown in this
        # notebook have PaymentHistory values in the 20s - confirm 1 is meaningful.
        'PaymentHistory': [1],
        'TotalAssets': [TotalAssets],
        'TotalLiabilities': [TotalLiabilities],
        'MonthlyIncome': [MonthlyIncome],
        'NetWorth': [NetWorth],
        'MonthlyLoanPayment': [MonthlyLoanPayment],
        'DebtToIncomeRatio': [DebtToIncomeRatio],
        'LoanToNetWorthRatio': [LoanToNetWorthRatio],
        'EmploymentStatus_Self-Employed': [EmploymentStatus_Self_Employed],
        'EmploymentStatus_Unemployed': [EmploymentStatus_Unemployed],
        'MaritalStatus_Married': [MaritalStatus_Married],
        'MaritalStatus_Single': [MaritalStatus_Single],
        'MaritalStatus_Widowed': [MaritalStatus_Widowed],
        'HomeOwnershipStatus_Other': [HomeOwnershipStatus_Other],
        'HomeOwnershipStatus_Own': [HomeOwnershipStatus_Own],
        'HomeOwnershipStatus_Rent': [HomeOwnershipStatus_Rent],
    })

    # Predict the risk score using the trained Gradient Boosting model
    predicted_risk = gb_model.predict(user_input)

    # Output the prediction
    print(f"Predicted Risk Score: {predicted_risk[0]:.2f}")

# Example to run the function
predict_risk()


Predicted Risk Score: 87.02


In [49]:
# Show the feature names (and their order) the fitted model expects.
# `rf_model` is not defined anywhere in this notebook, so the model that is
# actually trained and used for prediction, `gb_model`, is inspected instead.
print(gb_model.feature_names_in_)


['Age' 'AnnualIncome' 'CreditScore' 'LoanAmount' 'LoanDuration'
 'NumberOfDependents' 'BankruptcyHistory' 'PaymentHistory' 'TotalAssets'
 'TotalLiabilities' 'MonthlyIncome' 'NetWorth' 'MonthlyLoanPayment'
 'DebtToIncomeRatio' 'LoanToNetWorthRatio'
 'EmploymentStatus_Self-Employed' 'EmploymentStatus_Unemployed'
 'MaritalStatus_Married' 'MaritalStatus_Single' 'MaritalStatus_Widowed'
 'HomeOwnershipStatus_Other' 'HomeOwnershipStatus_Own'
 'HomeOwnershipStatus_Rent']


In [111]:
# Define an updated risk score calculation function
def calculate_risk_score(row):
    """Return a rule-based risk score for one applicant record.

    ``row`` is any mapping-like object (for example a DataFrame row) providing
    the keys 'Age', 'AnnualIncome', 'LoanAmount', 'CreditScore',
    'HomeOwnershipStatus', 'BankruptcyHistory', 'NetWorth', 'LoanDuration' and
    'EmploymentStatus'. The result is the plain sum of the per-factor points
    below; it is not clamped and can be negative.
    """
    age = row['Age']
    annual_income = row['AnnualIncome']
    loan_amount = row['LoanAmount']
    credit_score = row['CreditScore']
    net_worth = row['NetWorth']

    # Age: under-30 and 50+ applicants score higher when the loan exceeds 50,000
    if age < 30:
        if loan_amount > 50000:
            age_points = 2
        elif loan_amount == 0:
            age_points = 0.5  # no loan at all
        else:
            age_points = 1
    elif age >= 50:
        age_points = 3 if loan_amount > 50000 else 1.5
    else:
        age_points = 1  # ages 30-49

    # Annual income: the higher the income band, the fewer points
    if annual_income > 100000:
        income_points = 1
    elif 50000 <= annual_income <= 100000:
        income_points = 2
    else:
        income_points = 3

    # Loan amount: the larger the loan band, the more points
    if loan_amount > 200000:
        loan_points = 3
    elif 100000 <= loan_amount <= 200000:
        loan_points = 2
    else:
        loan_points = 1

    # Loan-to-income ratio; treated as 0 when income is not positive
    if annual_income > 0:
        loan_to_income_ratio = loan_amount / annual_income
    else:
        loan_to_income_ratio = 0
    if loan_to_income_ratio > 5:
        ratio_points = 3
    elif 3 <= loan_to_income_ratio <= 5:
        ratio_points = 2
    else:
        ratio_points = 1

    # Credit score: 750+ scores lowest, below 700 scores highest
    if credit_score >= 750:
        credit_points = 1
    elif 700 <= credit_score < 750:
        credit_points = 2
    else:
        credit_points = 3

    # Owning a home (outright or with a mortgage) takes 2 points off
    home_points = -2 if row['HomeOwnershipStatus'] in ('Own', 'Mortgage') else 0

    # A recorded bankruptcy adds 2 points
    bankruptcy_points = 2 if row['BankruptcyHistory'] == 1 else 0

    # Net worth: large net worth subtracts points, anything under 250,000 adds 1
    if net_worth >= 500000:
        net_worth_points = -4
    elif 250000 <= net_worth < 500000:
        net_worth_points = -2
    else:
        net_worth_points = 1

    # Short loan duration (< 10) combined with a loan above 100,000 adds 2 points
    short_large_loan = row['LoanDuration'] < 10 and loan_amount > 100000
    duration_points = 2 if short_large_loan else 0

    # Being unemployed adds 2 points
    employment_points = 2 if row['EmploymentStatus'] == 'Unemployed' else 0

    return (
        age_points
        + income_points
        + loan_points
        + ratio_points
        + credit_points
        + home_points
        + bankruptcy_points
        + net_worth_points
        + duration_points
        + employment_points
    )

# Raw score that maps to 100%.
# NOTE(review): the rules in calculate_risk_score can sum to as much as 22 and
# as little as -1.5, so the percentage is clipped to the 0-100 range below
# instead of being allowed to overshoot. Confirm whether 15 should be raised.
MAX_RISK_SCORE = 15

# File the scored dataset is written to; reused in the confirmation message so
# the message cannot drift from the path actually written
OUTPUT_PATH = 'data5.csv'

# Apply the risk score calculation to the filtered dataset and express it as a
# percentage of MAX_RISK_SCORE, bounded to [0, 100]
raw_scores = filtered_df.apply(calculate_risk_score, axis=1)
filtered_df['RiskScorePercentage'] = (
    (raw_scores / MAX_RISK_SCORE) * 100
).clip(lower=0, upper=100)

# Save the filtered dataset to a CSV file (without the index column)
filtered_df.to_csv(OUTPUT_PATH, index=False)

# Print summary statistics
print("Risk Score Percentage Statistics:")
print(f"Average Risk Score Percentage: {filtered_df['RiskScorePercentage'].mean():.2f}%")
print(f"Max Risk Score Percentage: {filtered_df['RiskScorePercentage'].max():.2f}%")
print(f"Min Risk Score Percentage: {filtered_df['RiskScorePercentage'].min():.2f}%")

print(f"\nFiltered synthetic data with risk score percentage saved to '{OUTPUT_PATH}'")


Risk Score Percentage Statistics:
Average Risk Score Percentage: 55.93%
Max Risk Score Percentage: 100.00%
Min Risk Score Percentage: 6.67%

Filtered synthetic data with risk score percentage saved to 'data2.csv'
