In [57]:
# ================================================
# CPU5006: Rule-Based AI Algorithm for Credit Risk Assessment
# Research Question:
# "How fairly and effectively can a rule-based AI algorithm predict
#  the specific credit risk factor of an individual?"
# ================================================

import pandas as pd


In [58]:
# --- Load and inspect dataset ---
df = pd.read_csv('training_data.csv')

print("âœ… Dataset successfully loaded.")
print("Shape:", df.shape)
df.head()


âœ… Dataset successfully loaded.
Shape: (1120, 11)


Unnamed: 0,ID,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,1122,33,male,1,rent,little,moderate,2384,36,repairs,bad
1,49,48,male,2,own,,,5190,27,repairs,good
2,156,39,female,1,own,,moderate,932,6,education,good
3,307,30,male,2,own,,moderate,2028,12,car,good
4,1288,48,male,2,own,little,little,1082,12,car,bad


In [59]:
# --- Clean and normalize data where appropriate ---

# Replace missing or 'nan' values with neutral defaults
df = df.fillna({
    'Saving accounts': 'unknown',
    'Checking account': 'unknown',
    'Housing': 'unknown',
    'Purpose': 'unknown'
})

# Normalize numeric values (Age, Credit amount, Duration)
def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

for col in ['Age', 'Credit amount', 'Duration']:
    df[col] = normalize(df[col])

print("âœ… Data cleaned and normalized.")
df.head()


âœ… Data cleaned and normalized.


Unnamed: 0,ID,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,1122,0.25,male,1,rent,little,moderate,0.11742,0.470588,repairs,bad
1,49,0.517857,male,2,own,unknown,unknown,0.271817,0.338235,repairs,good
2,156,0.357143,female,1,own,unknown,moderate,0.037526,0.029412,education,good
3,307,0.196429,male,2,own,unknown,moderate,0.097832,0.117647,car,good
4,1288,0.517857,male,2,own,little,little,0.04578,0.117647,car,bad


In [60]:
# --- Rule-based system to assign credit risk score ---

def calculate_risk_score(row):
    score = 0

    # --- Age Rules ---
    # Fairness consideration: avoid penalising young applicants too harshly
    if row['Age'] < 0.2:
        score += 2
    elif row['Age'] < 0.4:
        score += 1
    elif row['Age'] < 0.8:
        score += 0
    else:
        score += 0.5  # older ages get a minor fairness adjustment

    # --- Job Type (0 = unemployed, 3 = skilled/professional) ---
    if row['Job'] == 0:
        score += 3
    elif row['Job'] == 1:
        score += 2
    elif row['Job'] == 2:
        score += 1
    else:
        score += 0

    # --- Housing Type ---
    if row['Housing'] == 'own':
        score += 0
    elif row['Housing'] == 'free':
        score += 1
    else:
        score += 2  # renting adds slight risk

    # --- Savings Account ---
    savings = row['Saving accounts']
    if savings in ['rich', 'quite rich']:
        score += 0
    elif savings == 'moderate':
        score += 1
    elif savings == 'little':
        score += 2
    else:
        score += 1.5  # unknown treated neutrally

    # --- Checking Account ---
    checking = row['Checking account']
    if checking == 'rich':
        score += 0
    elif checking == 'moderate':
        score += 1
    elif checking == 'little':
        score += 2
    else:
        score += 1.5  # unknown treated neutrally

    # --- Credit Amount ---
    if row['Credit amount'] < 0.2:
        score += 0
    elif row['Credit amount'] < 0.4:
        score += 1
    elif row['Credit amount'] < 0.6:
        score += 2
    elif row['Credit amount'] < 0.8:
        score += 3
    else:
        score += 4

    # --- Duration of Credit ---
    if row['Duration'] < 0.2:
        score += 0
    elif row['Duration'] < 0.4:
        score += 1
    elif row['Duration'] < 0.6:
        score += 2
    elif row['Duration'] < 0.8:
        score += 3
    else:
        score += 4

    # --- Purpose-based Risk Adjustment ---
    if row['Purpose'] in ['education', 'business', 'furniture']:
        score += 1
    elif row['Purpose'] in ['car', 'appliances']:
        score += 2
    elif row['Purpose'] in ['radio/TV', 'repairs']:
        score += 3
    else:
        score += 2  # unknown = neutral risk

    return score


In [61]:
# --- Apply the rule-based function to all rows ---
df['risk_score'] = df.apply(calculate_risk_score, axis=1)

print("âœ… Risk scores calculated.")
df[['Age', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Credit amount', 'Duration', 'risk_score']].head()


âœ… Risk scores calculated.


Unnamed: 0,Age,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,risk_score
0,0.25,1,rent,little,moderate,0.11742,0.470588,13.0
1,0.517857,2,own,unknown,unknown,0.271817,0.338235,9.0
2,0.357143,1,own,unknown,moderate,0.037526,0.029412,6.5
3,0.196429,2,own,unknown,moderate,0.097832,0.117647,7.5
4,0.517857,2,own,little,little,0.04578,0.117647,7.0


In [62]:
# --- Categorise individuals based on risk score ---

def risk_category(score):
    if score <= 6:
        return 'Minimal Risk'
    elif score <= 10:
        return 'Small Risk'
    elif score <= 14:
        return 'Mild Risk'
    elif score <= 18:
        return 'High Risk'
    else:
        return 'Extreme Risk'

df['risk_category'] = df['risk_score'].apply(risk_category)

print("âœ… Risk categories assigned.")
df[['risk_score', 'risk_category']].head()


âœ… Risk categories assigned.


Unnamed: 0,risk_score,risk_category
0,13.0,Mild Risk
1,9.0,Small Risk
2,6.5,Small Risk
3,7.5,Small Risk
4,7.0,Small Risk


In [63]:
# --- Display a sample of the final output ---
df[['ID', 'Age', 'Job', 'Housing', 'Saving accounts', 'Checking account',
    'Credit amount', 'Duration', 'Purpose', 'risk_score', 'risk_category']].head(15)


Unnamed: 0,ID,Age,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,risk_score,risk_category
0,1122,0.25,1,rent,little,moderate,0.11742,0.470588,repairs,13.0,Mild Risk
1,49,0.517857,2,own,unknown,unknown,0.271817,0.338235,repairs,9.0,Small Risk
2,156,0.357143,1,own,unknown,moderate,0.037526,0.029412,education,6.5,Small Risk
3,307,0.196429,2,own,unknown,moderate,0.097832,0.117647,car,7.5,Small Risk
4,1288,0.517857,2,own,little,little,0.04578,0.117647,car,7.0,Small Risk
5,463,0.267857,2,own,rich,moderate,0.178607,0.382353,furniture/equipment,6.0,Minimal Risk
6,1099,0.392857,2,own,little,little,0.135468,0.470588,furniture/equipment,10.0,Small Risk
7,713,0.25,2,own,little,moderate,0.054749,0.205882,radio/TV,9.0,Small Risk
8,1258,0.107143,2,rent,unknown,little,0.061682,0.294118,car,11.5,Mild Risk
9,853,0.107143,2,own,little,moderate,0.257676,0.514706,radio/TV,12.0,Mild Risk


In [64]:
# --- Basic fairness evaluation ---
# Check correlations to ensure no single variable dominates the score.
correlations = df[['Age', 'Credit amount', 'Duration', 'risk_score']].corr()

print("ðŸ“Š Correlation Matrix (Fairness Diagnostic):")
print(correlations)

# Optional: Average risk per housing type
avg_risk_by_housing = df.groupby('Housing')['risk_score'].mean()
print("\nAverage Risk Score by Housing Type:")
print(avg_risk_by_housing)


ðŸ“Š Correlation Matrix (Fairness Diagnostic):
                    Age  Credit amount  Duration  risk_score
Age            1.000000       0.061492 -0.052229   -0.436347
Credit amount  0.061492       1.000000  0.602313    0.473256
Duration      -0.052229       0.602313  1.000000    0.571362
risk_score    -0.436347       0.473256  0.571362    1.000000

Average Risk Score by Housing Type:
Housing
free     9.689394
own      8.685039
rent    11.152655
Name: risk_score, dtype: float64
