In [3]:
!pip install autogluon




In [7]:
# imports
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd

# Load the Loan Approval dataset
loan_data_path = r'C:\Users\mmuib\Loan_Approval_f.csv'  # Ensure correct path
loan_df = pd.read_csv(loan_data_path)

# Check for missing values in target columns
print(loan_df[['LoanAmount', 'Credit_History']].isnull().sum())

# Ensure 'Credit_History' is categorical (1 for 'Yes' and 0 for 'No')
loan_df['Credit_History'] = loan_df['Credit_History'].map({'Yes': 1, 'No': 0})

# Check again for missing values after mapping
print(loan_df[['LoanAmount', 'Credit_History']].isnull().sum())

# Drop rows with missing target values (NaN in 'LoanAmount' or 'Credit_History')
loan_df.dropna(subset=['LoanAmount', 'Credit_History'], inplace=True)

# Convert the dataframe to TabularDataset (AutoGluon format)
loan_tabular = TabularDataset(loan_df)

# Display the first few rows of the dataset
print(loan_tabular.head())

# Manually split the dataset into training and testing sets (70% train, 30% test)
train_data = loan_tabular.sample(frac=0.7, random_state=42)
test_data = loan_tabular.drop(train_data.index)

# Train a classification model to predict Credit_History
credit_history_predictor = TabularPredictor(label='Credit_History').fit(train_data)

# Evaluate the model
credit_history_performance = credit_history_predictor.evaluate(test_data)
print("Model performance for Credit_History prediction:", credit_history_performance)

# Save the model
credit_history_predictor.save('credit_history_predictor_model')


No path specified. Models will be saved in: "AutogluonModels\ag-20250413_214352"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.4
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          12
Memory Avail:       2.63 GB / 15.68 GB (16.8%)
Disk Space Avail:   750.04 GB / 952.80 GB (78.7%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	preset

LoanAmount        0
Credit_History    0
dtype: int64
LoanAmount        0
Credit_History    0
dtype: int64
  Married  Dependents     Education Self_Employed  ApplicantIncome_Annual  \
0      No           0      Graduate            No                   58490   
1     Yes           1      Graduate            No                   45830   
2     Yes           0      Graduate           Yes                   30000   
3     Yes           0  Not Graduate            No                   25830   
4      No           0      Graduate            No                   60000   

   CoapplicantIncome_Annual  TotalFamilyIncome_Annual  LoanAmount  \
0                       0.0                   58490.0      120000   
1                   15080.0                   60910.0      128000   
2                       0.0                   30000.0       66000   
3                   23580.0                   49410.0      120000   
4                       0.0                   60000.0      141000   

   Loan_Amount_T

	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  : 2 | ['CoapplicantIncome_Annual', 'TotalFamilyIncome_Annual']
		('int', [])    : 4 | ['Dependents', 'ApplicantIncome_Annual', 'LoanAmount', 'Loan_Amount_Term_Year']
		('object', []) : 4 | ['Married', 'Education', 'Self_Employed', 'Property_Area']
	Types of features in processed data (raw dtype, special dtypes):
		('category', [])  : 1 | ['Property_Area']
		('float', [])     : 2 | ['CoapplicantIncome_Annual', 'TotalFamilyIncome_Annual']
		('int', [])       : 4 | ['Dependents', 'ApplicantIncome_Annual', 'LoanAmount', 'Loan_Amount_Term_Year']
		('int', ['bool']) : 3 | ['Married', 'Education', 'Self_Employed']
	0.2s = Fit runtime
	10 features in original data used to generate 10 features in processed data.
	Train Data (Processed) Memory Usage: 0.02 MB (0.0% of available memory)
Data p

Model performance for Credit_History prediction: {'accuracy': 0.8858695652173914, 'balanced_accuracy': 0.5, 'mcc': 0.0, 'roc_auc': 0.4405492258252995, 'f1': 0.9394812680115274, 'precision': 0.8858695652173914, 'recall': 1.0}


# AutoGluon Classification: Predicting Credit History

In this notebook, we will use AutoGluon to build a classification model that predicts whether an applicant has a good or bad credit history. The target column, `Credit_History`, contains binary values: `1` for good credit history and `0` for bad credit history.

AutoGluon simplifies the process of training a classification model by automating the selection of the best model and hyperparameters.


## Data Definitions

The dataset consists of various features, including:
- **`ApplicantIncome_Annual`**: The annual income of the applicant.
- **`CoapplicantIncome_Annual`**: The annual income of the coapplicant.
- **`Credit_History`**: The target column indicating the applicant's credit history (`1` for good, `0` for bad).

In this notebook, we will predict the `Credit_History` based on the available features in the dataset.


## Model Building

In [5]:
# imports
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd

In [6]:
# Load the Loan Approval dataset
loan_data_path = r'C:\Users\mmuib\Loan_Approval_f.csv'  # Ensure correct path
loan_df = pd.read_csv(loan_data_path)

In [7]:
# Check for missing values in target columns
print(loan_df[['LoanAmount', 'Credit_History']].isnull().sum())


LoanAmount        0
Credit_History    0
dtype: int64


In [8]:
# Ensure 'Credit_History' is categorical (1 for 'Yes' and 0 for 'No')
loan_df['Credit_History'] = loan_df['Credit_History'].map({'Yes': 1, 'No': 0})


In [10]:
# Check again for missing values after mapping
print(loan_df[['LoanAmount', 'Credit_History']].isnull().sum())


LoanAmount        0
Credit_History    0
dtype: int64


In [15]:
# Drop rows with missing target values (NaN in 'LoanAmount' or 'Credit_History')
loan_df.dropna(subset=['LoanAmount', 'Credit_History'], inplace=True)


In [17]:
# Convert the dataframe to TabularDataset (AutoGluon format)
loan_tabular = TabularDataset(loan_df)

In [19]:
# Display the first few rows of the dataset
print(loan_tabular.head())

  Married  Dependents     Education Self_Employed  ApplicantIncome_Annual  \
0      No           0      Graduate            No                   58490   
1     Yes           1      Graduate            No                   45830   
2     Yes           0      Graduate           Yes                   30000   
3     Yes           0  Not Graduate            No                   25830   
4      No           0      Graduate            No                   60000   

   CoapplicantIncome_Annual  TotalFamilyIncome_Annual  LoanAmount  \
0                       0.0                   58490.0      120000   
1                   15080.0                   60910.0      128000   
2                       0.0                   30000.0       66000   
3                   23580.0                   49410.0      120000   
4                       0.0                   60000.0      141000   

   Loan_Amount_Term_Year  Credit_History Property_Area  
0                      6               1         Urban  
1       

In [21]:
# Manually split the dataset into training and testing sets (70% train, 30% test)
train_data = loan_tabular.sample(frac=0.7, random_state=42)
test_data = loan_tabular.drop(train_data.index)

In [23]:
# Train a classification model to predict Credit_History
credit_history_predictor = TabularPredictor(label='Credit_History').fit(train_data)

No path specified. Models will be saved in: "AutogluonModels\ag-20250413_220135"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.4
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          12
Memory Avail:       2.25 GB / 15.68 GB (14.4%)
Disk Space Avail:   749.99 GB / 952.80 GB (78.7%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	preset

In [25]:
# Evaluate the model
credit_history_performance = credit_history_predictor.evaluate(test_data)
print("Model performance for Credit_History prediction:", credit_history_performance)


Model performance for Credit_History prediction: {'accuracy': 0.8858695652173914, 'balanced_accuracy': 0.5, 'mcc': 0.0, 'roc_auc': 0.4405492258252995, 'f1': 0.9394812680115274, 'precision': 0.8858695652173914, 'recall': 1.0}


In [27]:
# Save the model
credit_history_predictor.save('credit_history_predictor_model')