In [None]:
# Rule based credit risk algorithm
# by james laliberte
#"How fairly and effectively can a rule-based AI algorithm predict credit risk?"
# Searchbar outputs in bottom cell

# code refinement and error handling was supported by ChatGPT GPT 5 model, 
# OpenAI, https://chat.openai.com/, accessed from 13/10/25 to 2/11/25.

import pandas as pd  #pandas chosen for structured data handling



In [None]:
# Load the dataset from CSV (path is relative to the notebook's working directory)
df = pd.read_csv('training_data.csv')  # reads the credit dataset into a DataFrame
# Force ID to a numeric dtype; with errors='coerce' any value that cannot be parsed
# becomes NaN instead of raising, so the ID search later on can compare against integers.
df['ID'] = pd.to_numeric(df['ID'], errors='coerce')

print("Dataset successfully loaded.")  # confirmation message for debugging and transparency
print("Shape:", df.shape)  # prints the number of rows and columns for inspection
df.head()  # shows the first few records to check that the data imported correctly


Dataset successfully loaded.
Shape: (1120, 11)


Unnamed: 0,ID,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,1122,33,male,1,rent,little,moderate,2384,36,repairs,bad
1,49,48,male,2,own,,,5190,27,repairs,good
2,156,39,female,1,own,,moderate,932,6,education,good
3,307,30,male,2,own,,moderate,2028,12,car,good
4,1288,48,male,2,own,little,little,1082,12,car,bad


In [None]:
# Fill missing values in key categorical columns with 'unknown' so that no record breaks the rule logic.
# Only the four columns listed here are filled; missing values in any other column are left as they are.
# NOTE: in calculate_risk_score the label 'unknown' matches none of the named levels and therefore
# takes the final `else` penalty (2.5) - for savings/checking that is higher than 'little' (2).
df = df.fillna({
    'Saving accounts': 'unknown', # prevents missing saving account data from breaking comparisons
    'Checking account': 'unknown',  # same logic for checking accounts
    'Housing': 'unknown',   # ensures housing is never null
    'Purpose': 'unknown'  # important since loan purpose impacts scoring rules
})

# Define a function for min-max normalisation of numeric columns to ensure fair scaling between 0 and 1
def minmax_norm(series):
    """Linearly rescale a numeric Series so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    series : pd.Series
        Numeric values to rescale.

    Returns
    -------
    pd.Series
        Rescaled values on the same index as `series`. When every value is
        identical the range is zero, so a Series of 0.0 is returned instead
        of dividing by zero.
    """
    lowest = series.min()
    highest = series.max()

    # Degenerate case: a constant column has no spread to scale by.
    if highest == lowest:
        return pd.Series(0.0, index=series.index)

    spread = highest - lowest
    return (series - lowest) / spread

# Apply normalisation to continuous numeric features used in scoring.
# The min and max come from this dataset, so every *_norm value is relative to the loaded sample.
df['Age_norm'] = minmax_norm(df['Age'])     # youngest applicant -> 0.0, oldest -> 1.0 (lower value = younger)
df['CreditAmount_norm'] = minmax_norm(df['Credit amount']) # smallest loan -> 0.0, largest -> 1.0
df['Duration_norm'] = minmax_norm(df['Duration'])     # shortest duration -> 0.0, longest -> 1.0

print("Cleaning complete. Created normalised features: Age_norm, CreditAmount_norm, Duration_norm")
df[['Age','Age_norm','Credit amount','CreditAmount_norm','Duration','Duration_norm']].head()  # quick preview


Cleaning complete. Created normalised features: Age_norm, CreditAmount_norm, Duration_norm


Unnamed: 0,Age,Age_norm,Credit amount,CreditAmount_norm,Duration,Duration_norm
0,33,0.25,2384,0.11742,36,0.470588
1,48,0.517857,5190,0.271817,27,0.338235
2,39,0.357143,932,0.037526,6,0.029412
3,30,0.196429,2028,0.097832,12,0.117647
4,48,0.517857,1082,0.04578,12,0.117647


In [4]:
# This adjusted version slightly increases the sensitivity of the scoring system.
# The goal is to identify more individuals in the higher risk band while maintaining fairness.

# Numeric bands: (exclusive upper bound, points); the first band whose bound exceeds
# the value wins, and values beyond every bound take the fallback given at the call site.
_AGE_BANDS = ((0.2, 2.5), (0.4, 1.5), (0.8, 0))
_AMOUNT_BANDS = ((0.2, 0), (0.4, 1), (0.6, 2), (0.8, 3.5))
_DURATION_BANDS = ((0.2, 0), (0.4, 1), (0.6, 2), (0.8, 3.5))

# Categorical penalties; any label not listed takes the fallback given at the call site.
_JOB_POINTS = {0: 4, 1: 2, 2: 1}
_HOUSING_POINTS = {'own': 0, 'free': 1.5}
_ACCOUNT_POINTS = {'rich': 0, 'quite rich': 0, 'moderate': 1, 'little': 2}
# NOTE(review): keys must equal the dataset's Purpose labels exactly (after trimming and
# lower-casing); a label such as 'furniture/equipment' would take the fallback - confirm.
_PURPOSE_POINTS = {
    'education': 1, 'business': 1, 'furniture': 1,
    'car': 2.5, 'appliances': 2.5,
    'radio/tv': 4, 'repairs': 4,
}


def _band_points(value, bands, otherwise):
    """Return the points of the first band with value < upper bound, else `otherwise`."""
    for upper, points in bands:
        if value < upper:
            return points
    return otherwise


def _clean_label(value):
    """Turn a categorical cell into a trimmed, lower-case string for table lookup."""
    return str(value).strip().lower()


def calculate_risk_score(row):
    """Score one applicant with additive hand-written rules; a higher score means higher risk.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide 'Age_norm', 'Job', 'Housing', 'Saving accounts',
        'Checking account', 'CreditAmount_norm', 'Duration_norm' and 'Purpose'.

    Returns
    -------
    float
        Sum of the eight penalties, from 1.0 (lowest) to 28.0 (highest).

    Notes
    -----
    * Categorical values are trimmed and lower-cased before matching, so stray
      whitespace (e.g. 'radio/tv ' or ' own') no longer drops a record into the
      fallback penalty.
    * A NaN numeric value fails every `<` comparison and therefore receives the
      fallback of its rule (0.5 for age, 5 for amount and duration).
    """
    score = 0.0

    # Age: the youngest bands are penalised most; the oldest band (>= 0.8) adds a small 0.5.
    score += _band_points(row['Age_norm'], _AGE_BANDS, 0.5)

    # Job code: 0 is treated as unemployed by this notebook (confirm against the
    # dataset's codebook); codes other than 0, 1, 2 add nothing.
    score += _JOB_POINTS.get(row['Job'], 0)

    # Housing: owning adds nothing, 'free' adds 1.5, everything else (rent, unknown) 2.5.
    score += _HOUSING_POINTS.get(_clean_label(row['Housing']), 2.5)

    # Savings and checking share one table; unknown/unlisted levels take the 2.5 fallback.
    score += _ACCOUNT_POINTS.get(_clean_label(row['Saving accounts']), 2.5)
    score += _ACCOUNT_POINTS.get(_clean_label(row['Checking account']), 2.5)

    # Larger requested amounts and longer durations add progressively more, up to 5 each.
    score += _band_points(row['CreditAmount_norm'], _AMOUNT_BANDS, 5)
    score += _band_points(row['Duration_norm'], _DURATION_BANDS, 5)

    # Purpose: radio/tv and repairs are penalised most (4); unlisted purposes get 2.5.
    score += _PURPOSE_POINTS.get(_clean_label(row['Purpose']), 2.5)

    return score


In [None]:
# Apply the rule-based scoring to every individual record.
# axis=1 makes apply() call calculate_risk_score once per row (a Python-level loop, not a vectorised operation).
df['risk_score'] = df.apply(calculate_risk_score, axis=1)  # one score per applicant

# Convert numeric score into a human-readable category for interpretability
def risk_category(score):
    """Translate a numeric risk score into one of five descriptive bands.

    Each bound is inclusive: a score equal to a bound belongs to that band.
    Anything above the last bound (or a value that fails every comparison,
    such as NaN) is labelled 'Extreme Risk'.
    """
    # (inclusive upper bound, label), checked from the safest band upwards
    ceilings = (
        (6, 'Minimal Risk'),   # excellent profile
        (8, 'Small Risk'),     # safe but with minor factors
        (10, 'Mild Risk'),     # noticeable risk
        (14, 'High Risk'),     # concerning profile
    )
    for ceiling, label in ceilings:
        if score <= ceiling:
            return label
    return 'Extreme Risk'      # very high probability of default

# Label every row by passing its risk_score through risk_category (requires the risk_score column to exist already)
df['risk_category'] = df['risk_score'].apply(risk_category)  # attach classification to dataset


In [6]:
# Displays every possible risk category, even if count = 0
def show_risk_summary(as_series: bool = False, data=None):
    """Print how many individuals fall into each risk category.

    All five categories are always listed in severity order, including those
    with a count of zero.

    Parameters
    ----------
    as_series : bool, default False
        When True, also return the counts as a Series named "count".
    data : pd.DataFrame, optional
        Frame holding a 'risk_category' column. Defaults to the notebook-level
        `df`, so existing calls without this argument behave as before.

    Returns
    -------
    pd.Series or None
        Counts indexed by category when `as_series` is True, otherwise None.
    """
    # Fall back to the notebook's global frame only when no frame is supplied.
    frame = df if data is None else data

    # Count the number of people in each category
    counts = frame['risk_category'].value_counts()

    # Always include all expected categories in logical order
    order = ['Minimal Risk', 'Small Risk', 'Mild Risk', 'High Risk', 'Extreme Risk']
    ordered = [(cat, int(counts.get(cat, 0))) for cat in order]

    print("Overall distribution of individuals by risk category:\n")
    for cat, n in ordered:
        print(f"{cat}: {n} people")

    # Optional return for further analysis (keeps notebook clean by default)
    if as_series:
        return pd.Series(dict(ordered), name="count")
    return None

# Run the function to display the full distribution
# (prints only; pass as_series=True to also get the counts back as a Series)
show_risk_summary()


Overall distribution of individuals by risk category:

Minimal Risk: 42 people
Small Risk: 109 people
Mild Risk: 294 people
High Risk: 551 people
Extreme Risk: 124 people


In [None]:
from IPython.display import display

def _main_risk_factors(row):
    """Return plain-language labels for the risk-raising traits present in `row`."""
    factors = []
    if row['Age_norm'] < 0.3: factors.append("young age")
    if row['Job'] == 0: factors.append("unemployed")
    if str(row['Housing']).lower() != 'own': factors.append("non-owned housing")
    if str(row['Saving accounts']).lower() in ['little','unknown']: factors.append("low savings")
    if str(row['Checking account']).lower() in ['little','unknown']: factors.append("low checking balance")
    if row['CreditAmount_norm'] > 0.7: factors.append("high credit amount")
    if row['Duration_norm'] > 0.6: factors.append("long loan duration")
    if str(row['Purpose']).lower() in ['radio/tv','repairs','appliances']: factors.append("high-risk loan purpose")
    return factors


def show_person_info():
    """Interactively look up applicants in the notebook-level `df` and explain their risk.

    Prompts repeatedly until the user types 'exit'. Each entry is first tried
    as a numeric ID; if that finds nothing and `df` has a 'Name' column, it is
    tried as a case-insensitive literal substring of the name. When several
    rows match, only the first is shown. For the selected row the function
    displays the main columns, the risk score and category, and the factors
    that raise the risk.

    Returns
    -------
    None
    """
    has_name = 'Name' in df.columns
    prompt = f"Enter an ID{' or Name' if has_name else ''} (or 'exit'): "

    while True:
        user_input = input(prompt).strip()
        if user_input.lower() == 'exit':
            print("Exiting.")
            break

        # A blank entry can never be an ID, and as a name needle it would match every row.
        if not user_input:
            print("No match. Try again.\n")
            continue

        # Search for matching record either by numeric ID or by name substring
        person = pd.DataFrame()
        # isdecimal() rather than isdigit(): isdigit() also accepts characters such as
        # superscript digits, which int() rejects with ValueError.
        if user_input.isdecimal() and 'ID' in df.columns:
            id_val = int(user_input)
            person = df.loc[df['ID'] == id_val]

        # fallback to case-insensitive name contains
        if person.empty and has_name:
            needle = user_input.lower()
            names = df['Name'].astype(str).str.lower()
            # regex=False: treat the typed text literally, so input like "(" or "*"
            # cannot raise a regex compilation error.
            person = df.loc[names.str.contains(needle, regex=False, na=False)]

        if person.empty:
            print("No match. Try again.\n")
            continue

        row = person.iloc[0]

        # Evaluate which features likely contributed most to the applicant's total risk score
        factors = _main_risk_factors(row)

        # Show the selected record in a clean table
        cols = ['ID','Age','Sex','Job','Housing','Saving accounts','Checking account',
                'Credit amount','Duration','Purpose','risk_score','risk_category']  # display columns
        display(row[cols].to_frame().T)  # pretty display of record in table form
        # Output the final calculated risk and the qualitative reasoning
        print(f"Risk score: {row['risk_score']:.2f} | Category: {row['risk_category']}")  # shows numeric and label output
        if factors:
            print("Main factors increasing risk:", ", ".join(factors) + ".")  # human-readable explanation of causes
        else:
            print("No major high risk factors detected.")  # indicates stable applicant
        print()  # adds a blank line


# Start the interactive lookup; this cell keeps prompting for input until 'exit' is entered.
show_person_info()