# Proof of concept
This Jupyter notebook is the proof of concept for our credit classification model. Once we are happy with our methods and the accuracy of the model, we will move the code into `.py` files and organize it for maintainability, following OOP best practices.

--- 

## Data Processing 
__Juan__ will be overseeing the data processing section.

In [None]:
import re

# Shared validation patterns. All are anchored at both ends so a value such
# as "12abc" is rejected outright instead of matching on its numeric prefix.
_UNSIGNED_INT = r"^\d+$"
_SIGNED_INT = r"^-?\d+$"
_UNSIGNED_FLOAT = r"^\d+(?:\.\d+)?$"
_SIGNED_FLOAT = r"^[+-]?\d+(?:\.\d+)?$"

# Per-column specification for the raw credit dataset.
#
# Every entry carries the same five keys so consumers can index them without
# guarding against missing keys:
#   type        -- target Python type name ("int", "float" or "str").
#   valid_range -- (min, max) inclusive bounds; either bound may be None for
#                  "unbounded", and the whole value is None when no range
#                  check applies.
#   regex       -- pattern the stripped string form must satisfy, or None.
#   cleaning    -- human-readable description of the cleaning step.
#   notes       -- free-form remarks for the team.
COLUMNS = {
    "Age": {
        "type": "int",
        "valid_range": (18, 122),
        "regex": _UNSIGNED_INT,
        "cleaning": "Ensure numeric, no extra characters.",
        "notes": "Reject if outside human age range.",
    },

    "Number of Credit Cards": {
        "type": "int",
        "valid_range": (0, 11),
        "regex": _UNSIGNED_INT,
        "cleaning": "Ensure integer only.",
        "notes": "Reject non-integer or real numbers.",
    },

    "Number of Bank Accounts": {
        "type": "int",
        "valid_range": (0, 11),
        "regex": _UNSIGNED_INT,
        "cleaning": "Ensure integer only.",
        "notes": "Reject if outside 0–11 range.",
    },

    "Occupation": {
        "type": "str",
        "valid_range": None,
        "regex": None,
        "cleaning": "Bag-of-words classification. Normalize capitalization and whitespace.",
        "notes": "Use ‘__________’ to mark ‘no other’. Assign numeric encoding after NLP preprocessing.",
    },

    "Annual Income": {
        "type": "float",
        "valid_range": None,
        "regex": _UNSIGNED_FLOAT,
        "cleaning": "Remove underscores and non-numeric symbols before conversion.",
        "notes": "Check if real number.",
    },

    "Monthly In hand Salary": {
        "type": "float",
        "valid_range": None,
        "regex": _UNSIGNED_FLOAT,
        "cleaning": "Remove underscores and non-numeric characters.",
        "notes": "Tentative column—verify its interpretation.",
    },

    "Interest Rate": {
        "type": "float",
        "valid_range": (0, 100),
        "regex": _UNSIGNED_FLOAT,
        "cleaning": "Ensure percentage format without % sign.",
        "notes": "Reject >100.",
    },

    "Number of Loans": {
        "type": "int",
        "valid_range": (0, None),
        "regex": _UNSIGNED_INT,
        "cleaning": "Ensure integer only.",
        "notes": "Should match length of 'Type of Loan' list.",
    },

    "Type of Loan": {
        "type": "str",
        "valid_range": None,
        "regex": None,
        "cleaning": "Tokenize categories, possibly split on commas.",
        "notes": "Correlates to 'Number of Loans'. Verify consistency.",
    },

    "Delay from Due Date": {
        "type": "int",
        # No lower bound: the pattern and the notes both allow negative
        # delays, which a (0, None) range would have rejected.
        "valid_range": None,
        "regex": _SIGNED_INT,
        "cleaning": "Raw numeric values only.",
        "notes": "May include negatives if early payments are encoded as negative delays.",
    },

    "Num of Delayed Payments": {
        "type": "int",
        "valid_range": (0, None),
        "regex": _UNSIGNED_INT,
        "cleaning": "Remove blanks and negatives.",
        "notes": "Must be non-negative integer.",
    },

    "Changed Credit Limit": {
        "type": "float",
        "valid_range": None,
        "regex": _SIGNED_FLOAT,
        "cleaning": "Ensure numeric string, use regex validation.",
        "notes": "Could be positive or negative depending on direction of change.",
    },

    "Number of Credit Inquiries": {
        "type": "int",
        "valid_range": (0, None),
        "regex": _UNSIGNED_INT,
        "cleaning": "Ensure integer only.",
        "notes": "Usually small whole number count.",
    },

    "Credit Mix": {
        "type": "str",
        "valid_range": None,
        "regex": None,
        "cleaning": "Categorical encoding via bag-of-words or label mapping.",
        "notes": "Indicates mix of secured/unsecured credit types.",
    },

    "Outstanding Debt": {
        "type": "float",
        "valid_range": (0, None),
        "regex": _UNSIGNED_FLOAT,
        "cleaning": "Ensure numeric, remove formatting symbols.",
        "notes": "Should be non-negative.",
    },

    "Credit Utilization Ratio": {
        "type": "float",
        "valid_range": (0, 1),
        "regex": _UNSIGNED_FLOAT,
        "cleaning": "Convert percentage values (e.g., '45%') to 0.45.",
        "notes": "Should be expressed as a fraction between 0 and 1.",
    },

    "Credit History Age": {
        "type": "float",
        "valid_range": (0, None),
        "regex": _UNSIGNED_FLOAT,
        "cleaning": "Ensure numeric representation (years).",
        "notes": "Derived metric; must be non-negative.",
    },

    "Payment Min Amount": {
        "type": "float",
        "valid_range": None,
        "regex": _UNSIGNED_FLOAT,
        "cleaning": "Ensure numeric.",
        "notes": "Check if zero values indicate missing data.",
    },

    "Total EMI per Month": {
        "type": "float",
        "valid_range": (0, None),
        "regex": _UNSIGNED_FLOAT,
        "cleaning": "Ensure numeric, remove underscores/commas.",
        "notes": "Non-negative; may be related to number of loans.",
    },

    "Amount Invested Monthly": {
        "type": "float",
        "valid_range": (0, None),
        "regex": _UNSIGNED_FLOAT,
        "cleaning": "Ensure numeric.",
        "notes": "Non-negative; financial variable.",
    },

    "Payment Behaviours": {
        "type": "str",
        "valid_range": None,
        "regex": None,
        "cleaning": "Categorical encoding. NLP or one-hot encoding likely required.",
        "notes": "Represents qualitative behavior descriptors.",
    },

    "Monthly Balance": {
        "type": "float",
        "valid_range": None,
        # Signed pattern: the notes allow an overdrawn (negative) balance.
        "regex": _SIGNED_FLOAT,
        "cleaning": "Ensure numeric, remove formatting symbols.",
        "notes": "May be negative if overdrawn.",
    },
}


def validate_column_value(col_name, value):
    """Return whether *value* satisfies the spec registered for *col_name*.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace, then checked in two stages:

    1. If the column defines a ``regex``, the whole stripped string must
       match it (trailing garbage such as ``"12abc"`` is rejected).
    2. If the column defines a ``valid_range``, the stripped string must
       parse as a float lying within the inclusive ``(min, max)`` bounds;
       a ``None`` bound means "unbounded" on that side.

    Args:
        col_name: Key into ``COLUMNS`` identifying the column spec.
        value: Raw cell value; any object whose ``str()`` form is meaningful.

    Returns:
        ``True`` if every applicable check passes, otherwise ``False``.

    Raises:
        KeyError: If *col_name* is not a key of ``COLUMNS``.
    """
    info = COLUMNS[col_name]
    text = str(value).strip()

    # .get(): not every column spec is guaranteed to define both keys.
    pattern = info.get("regex")
    # fullmatch rather than match, because match only anchors at the start
    # and would let a numeric prefix followed by junk slip through.
    if pattern and not re.fullmatch(pattern, text):
        return False

    bounds = info.get("valid_range")
    if bounds:
        min_val, max_val = bounds
        try:
            num = float(text)
        except ValueError:
            # A value that cannot be parsed is invalid, not a crash.
            return False
        if min_val is not None and num < min_val:
            return False
        if max_val is not None and num > max_val:
            return False

    return True

## Model Creation
__Konrad__ will be overseeing the model creation section.