In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.7/98.7 MB 8.5 MB/s eta 0:00:00
Installing collected packages: catboost
Successfully installed catboost-1.2.7


In [2]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
import warnings

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# 1. Loading the Dataset
# Read the training file first, then the test file, from the /content directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/test.csv')
)

# Preview the top rows of each frame
print("Training Data Sample:\n", train_data.head())
print("\nTest Data Sample:\n", test_data.head())

# 2. Data Preprocessing and EDA

# Check for missing values in the dataset
print("\nMissing values in training data:\n", train_data.isnull().sum())

# Imputing missing values: categorical (object) columns use the mode, every other
# column uses the median. The statistics are computed on the training data only
# and reused for the test data, so both frames are filled with the same values
# and nothing is derived from the test set.
imputation_values = {}
for column in train_data.columns:
    if train_data[column].dtype == 'object':
        modes = train_data[column].mode()
        # mode() is empty when the whole column is missing; leave such a column untouched.
        if not modes.empty:
            imputation_values[column] = modes.iloc[0]
    else:
        median = train_data[column].median()
        # An all-missing column has a NaN median, which cannot fill anything.
        if pd.notna(median):
            imputation_values[column] = median

# Assign the filled frame instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form emits a pandas FutureWarning and does not
# modify the parent frame once Copy-on-Write is enabled.
train_data = train_data.fillna(value=imputation_values)

# Apply the same (training-derived) imputation to the test dataset
test_data = test_data.fillna(
    value={
        column: fill
        for column, fill in imputation_values.items()
        if column in test_data.columns
    }
)

# Encode categorical variables
# Each object-typed column gets its own LabelEncoder, fitted on the training data
# and stored in `label_encoders` so the integer codes can be mapped back later.
label_encoders = {}
categorical_features = train_data.select_dtypes(include=['object']).columns
for feature in categorical_features:
    le = LabelEncoder()
    train_data[feature] = le.fit_transform(train_data[feature])
    label_encoders[feature] = le
    if feature in test_data.columns:
        # LabelEncoder.transform raises ValueError for categories it was not fitted
        # on, so the test column is mapped through the learned classes instead:
        # known categories get the same code as in training, unseen ones become -1.
        category_codes = {category: code for code, category in enumerate(le.classes_)}
        test_data[feature] = test_data[feature].map(category_codes).fillna(-1).astype(int)

# Drop irrelevant features
# The 'id' column is removed from both frames before modelling.
train_data = train_data.drop(columns='id')
test_data = test_data.drop(columns='id')

# Separate features and target variable
y = train_data['loan_status']
X = train_data.drop(columns='loan_status')

# Train-test split (for local validation)
# 20% of the rows are held out, stratified on the target, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 3. CatBoost Model Building

# Define the CatBoost classifier
# All hyperparameters are fixed here; verbose=0 turns off per-iteration logging.
catboost_model = CatBoostClassifier(
    **{
        'iterations': 500,
        'learning_rate': 0.05,
        'depth': 6,
        'loss_function': 'Logloss',
        'verbose': 0,
        'random_state': 42,
    }
)

# Train the CatBoost model
# The validation split is supplied as the eval set, with early_stopping_rounds=50.
catboost_model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    early_stopping_rounds=50,
)

# 4. Model Evaluation

# Probability of the model's second class (column index 1 of predict_proba)
# for every row of the validation set
y_val_prob = catboost_model.predict_proba(X_val)[:, 1]

# Turn the probabilities into hard 0/1 predictions using a 0.5 cut-off
y_val_pred = np.where(y_val_prob >= 0.5, 1, 0)

# Compute all validation metrics up front, then report them in order
accuracy = accuracy_score(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)
class_report = classification_report(y_val, y_val_pred)

# Accuracy of the thresholded predictions
print("Accuracy Score on Validation Set:", accuracy)

# Confusion matrix of true vs. predicted labels
print("\nConfusion Matrix:\n", conf_matrix)

# Precision, recall, and F1-score per class
print("\nClassification Report:\n", class_report)

# 5. Cross-Validation for Stability Check
# A shuffled, stratified 5-fold splitter with a fixed seed keeps the class
# proportions of the target the same in every fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score the classifier configuration on the full feature matrix, one accuracy per fold
cv_results = cross_val_score(
    estimator=catboost_model,
    X=X,
    y=y,
    scoring='accuracy',
    cv=skf,
)

print(f'Cross-Validation Accuracy Scores: {cv_results}')
print(f'Mean Cross-Validation Accuracy: {cv_results.mean()}')

# 6. Predictions on Test Data

# Predict probabilities on the test dataset; column index 1 of predict_proba is
# the probability of the model's second class.
test_probabilities = catboost_model.predict_proba(test_data)[:, 1]

# 'id' was dropped from test_data before modelling, so read only that column
# back from the original file to label each prediction.
test_ids = pd.read_csv('/content/test.csv', usecols=['id'])['id']

# Prepare submission file with probabilities
submission = pd.DataFrame({
    'id': test_ids,
    'loan_status': test_probabilities
})

# Save the submission file
submission_path = '/content/predictions.csv'
submission.to_csv(submission_path, index=False)

# Report the path that was actually written so the message cannot drift from the file name.
print(f"Submission file created successfully as '{submission_path}'.")


Training Data Sample:
    id  person_age  person_income person_home_ownership  person_emp_length  \
0   0          37          35000                  RENT                0.0   
1   1          22          56000                   OWN                6.0   
2   2          29          28800                   OWN                8.0   
3   3          30          70000                  RENT               14.0   
4   4          22          60000                  RENT                2.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_percent_income  \
0   EDUCATION          B       6000          11.49                 0.17   
1     MEDICAL          C       4000          13.35                 0.07   
2    PERSONAL          A       6000           8.90                 0.21   
3     VENTURE          B      12000          11.11                 0.17   
4     MEDICAL          A       6000           6.92                 0.10   

  cb_person_default_on_file  cb_person_cred_hist_length  loan_s