PIPELINE

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, balanced_accuracy_score
np.set_printoptions(threshold=np.inf)

- 1. Load of data

In [191]:
data_test = pd.read_csv("orig_test.csv")
data_train = pd.read_csv("orig_train.csv")

- 2. Preprocessing
    * 2.1 Columns removal

In [179]:
class ColumnRemover(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_remove):
        self.columns_to_remove = columns_to_remove

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=self.columns_to_remove)

# Define columns to remove
columns_default_flag = ['DEFAULT_FLAG']
columns_to_remove_low_variance = ['SAVING_ACCOUNT', 'FOREIGN_ACCOUNT', 'DEPOSIT', 'PENSION_FUNDS']
columns_to_remove_correlation_concerns = ['HOUSEHOLD_MEMBERS', 'DEBIT_CARD']
columns_to_remove = columns_default_flag + columns_to_remove_low_variance + columns_to_remove_correlation_concerns

# Create pipeline
columns_remover_pipeline = Pipeline(steps=[
    ('column_remover', ColumnRemover(columns_to_remove))
])


-    * 2.2 Rows removal

In [180]:
class PreprocessingPipeline(BaseEstimator, TransformerMixin):
    def __init__(self, quantile_lower=0.01, quantile_upper=0.99):
        self.quantile_lower = quantile_lower
        self.quantile_upper = quantile_upper

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        data_copy = X.copy()
        
        # Filter out rows where the AREA column value is 'Missing'
        data_copy = data_copy[data_copy['AREA'] != 'Missing']
        
        # Remove outliers
        for col in ['AGE', 'WORK_SENIORITY', 'BUSINESS AGE']:
            lower_bound = data_copy[col].quantile(self.quantile_lower)
            upper_bound = data_copy[col].quantile(self.quantile_upper)
            data_copy[col] = data_copy[col].clip(lower=lower_bound, upper=upper_bound)

        income_upper_bound = data_copy['INCOME'].quantile(self.quantile_upper)
        income_median = data_copy['INCOME'].median()
        data_copy['INCOME'] = np.where(data_copy['INCOME'] > income_upper_bound, income_median, data_copy['INCOME'])

        data_no_outlier_relationship = data_copy[data_copy['LENGTH_RELATIONSHIP_WITH_CLIENT'] < 100]

        data_no_outlier_products = data_no_outlier_relationship[(data_no_outlier_relationship['PRODUCT'] != 'A') & 
                                                                (data_no_outlier_relationship['PRODUCT'] != 'D')]
        
        data_no_outlier_products=data_no_outlier_products.reset_index(drop=True)
        return data_no_outlier_products

preprocessing_pipeline = Pipeline(steps=[
    ('preprocessing', PreprocessingPipeline())
])

- 3. Feature Engineering

In [181]:
# Pipeline for Has_Dependents Column
class HasDependentsTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X['HAS_DEPENDENTS'] = X['NO_OF_DEPENDENTS'].apply(lambda x: 0 if x == 0 else 1)
        return X.drop(columns=["NO_OF_DEPENDENTS"])

has_dependents_pipeline = Pipeline(steps=[
    ('has_dependents', HasDependentsTransformer())  # Add the custom transformer  
])

In [194]:
# Pipeline for Age Group
class AgeGroupTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, bins=[20, 30, 40, 50, 60, float('inf')], labels=['20-30', '30-40', '40-50', '50-60', '60+']):
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        X_copy['AGE GROUP'] = pd.cut(X_copy['AGE'], bins=self.bins, labels=self.labels, right=False)
        return X_copy

# Create the pipeline
age_group_pipeline = Pipeline(steps=[
    ('age_grouping', AgeGroupTransformer())
])

In [183]:
# Pipeline for relationship category

class RelationshipLengthTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, bins=[0, 5, 10, float('inf')], labels=[0, 1, 2]):
        self.bins = bins
        self.labels = labels

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        X_copy['RELATIONSHIP_LENGTH_CAT'] = pd.cut(X_copy['LENGTH_RELATIONSHIP_WITH_CLIENT'], bins=self.bins, labels=self.labels)
        return X_copy

# Create the pipeline
relationship_pipeline = Pipeline(steps=[
    ('relationship_length', RelationshipLengthTransformer())
])

In [184]:
# Pipeline for new columns 
class NewColumnsTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        X_copy.loc[:, 'INCOME_LOG'] = np.log1p(X_copy['INCOME']) + 1/10
        X_copy.loc[:, 'INCOME_AGE_INTERACTION'] = (X_copy['INCOME'] / (X_copy['AGE'] - 21)) + 1/10
        X_copy.loc[:, 'INCOME_SENIORITY_INTERACTION'] = np.log1p(X_copy['INCOME'] / X_copy['WORK_SENIORITY']) + 1/10
        X_copy.loc[:, 'LOYALTY_TO_AGE'] = X_copy['LENGTH_RELATIONSHIP_WITH_CLIENT'] / X_copy['AGE']
        X_copy.loc[:, 'CAREER_STABILITY_RATIO'] = X_copy['WORK_SENIORITY'] / X_copy['AGE']
        X_copy.loc[:, 'SENIORITY_RELATIONSHIP_RATIO'] = np.log1p(X_copy['WORK_SENIORITY'] / X_copy['LENGTH_RELATIONSHIP_WITH_CLIENT']) + 1/10
        X_copy.loc[:, 'WORK_SENIORITY_TO_BUSINESS_AGE'] = X_copy['WORK_SENIORITY'] / X_copy['BUSINESS AGE']
        return X_copy

# Create the pipeline
new_columns_pipeline = Pipeline(steps=[
    ('new_columns', NewColumnsTransformer())
])


In [195]:
# Pipeline for categorical features
categorical_columns = ['PRODUCT', 'AREA', 'RESIDENTIAL_PLACE', 'EDUCATION', 'MARITAL_STATUS', 'ECONOMIC_SECTOR', 'EMPLOYEE_NO','AGE GROUP']

categorical_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(drop = None,sparse=False))
])

-  * 3.2 Feature Standarization

In [186]:
class NumericalScalerTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_cols):
        self.numerical_cols = numerical_cols
        self.scaler = StandardScaler()
    
    def fit(self, X, y=None):
        self.scaler.fit(X[self.numerical_cols])
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        X_copy[self.numerical_cols] = self.scaler.transform(X_copy[self.numerical_cols])
        return X_copy

# Define numerical columns and binary/categorical columns
numerical_cols = ['AGE', 'WORK_SENIORITY', 'BUSINESS AGE', 'INCOME', 'LENGTH_RELATIONSHIP_WITH_CLIENT']

# Create the pipeline
feature_scaler_pipeline = Pipeline(steps=[
    ('scaler', NumericalScalerTransformer(numerical_cols=numerical_cols))
])


Data selection and modification using pipelines

In [193]:
processed_test_data = data_test.copy()
processed_train_data= data_train.copy()
processed_test_data

Unnamed: 0,PRODUCT,AGE,AREA,RESIDENTIAL_PLACE,EDUCATION,MARITAL_STATUS,HOUSEHOLD_MEMBERS,NO_OF_DEPENDENTS,INCOME,WORK_SENIORITY,...,LENGTH_RELATIONSHIP_WITH_CLIENT,DEBIT_CARD,CURRENT_ACCOUNT,SAVING_ACCOUNT,SALARY_ACCOUNT,FOREIGN_ACCOUNT,FINALIZED_LOAN,DEPOSIT,PENSION_FUNDS,DEFAULT_FLAG
0,F,39,County capital,Other,University,single,1,0,4756.330,2,...,2,0,0,0,0,0,0,0,0,0
1,C,65,County capital,Owner without mortgage,Post secondary school,single,1,0,503.000,5,...,1,1,1,0,0,0,0,0,0,0
2,E,38,County capital,Owner without mortgage,Missing,married,2,0,1514.325,17,...,1,1,1,0,1,0,0,0,0,0
3,B,46,Rural area,Owner without mortgage,Highschool,married,2,0,1050.000,2,...,11,0,0,0,0,0,0,0,0,0
4,C,58,County capital,Owner without mortgage,College,widow,1,0,3770.000,24,...,12,1,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3231,F,30,Urban area,Living with family,University,single,1,0,1410.160,3,...,2,0,0,0,0,0,0,0,0,0
3232,B,28,County capital,Living with family,Highschool,single,1,0,1103.000,6,...,1,0,0,0,0,0,0,0,0,0
3233,B,44,Rural area,Owner without mortgage,University,married,2,0,1784.000,4,...,1,0,0,0,0,0,0,0,0,0
3234,B,46,County capital,Owner without mortgage,Post secondary school,married,2,0,3024.250,3,...,9,1,1,0,0,0,1,0,0,0


In [200]:
processed_test_data = data_test.copy()
processed_train_data= data_train.copy()

# Define a list of pipelines in the order you want to apply them
pipelines = [columns_remover_pipeline,preprocessing_pipeline,has_dependents_pipeline,age_group_pipeline,relationship_pipeline,new_columns_pipeline,feature_scaler_pipeline]


# Apply each pipeline sequentially
for pipeline in pipelines:
    pipeline.fit(processed_test_data)
    processed_test_data = pipeline.transform(processed_test_data)

for pipeline in pipelines:
    pipeline.fit(processed_train_data)
    processed_train_data = pipeline.transform(processed_train_data)

processed_test_data=processed_test_data.reset_index(drop=True)
processed_train_data=processed_train_data.reset_index(drop=True)

#Applying last pipeline to our data
encoded_test_data=categorical_pipeline.fit_transform(processed_test_data[categorical_columns])
encoded_test_df = pd.DataFrame(encoded_test_data, columns=categorical_pipeline.get_feature_names_out(categorical_columns))

encoded_train_data=categorical_pipeline.fit_transform(processed_train_data[categorical_columns])
encoded_train_df = pd.DataFrame(encoded_train_data, columns=categorical_pipeline.get_feature_names_out(categorical_columns))

processed_test_data = processed_test_data.drop(categorical_columns, axis=1).join(encoded_test_df)
processed_test_data
processed_train_data = processed_train_data.drop(categorical_columns, axis=1).join(encoded_train_df)
processed_train_data

Unnamed: 0,AGE,INCOME,WORK_SENIORITY,BUSINESS AGE,LENGTH_RELATIONSHIP_WITH_CLIENT,CURRENT_ACCOUNT,SALARY_ACCOUNT,FINALIZED_LOAN,HAS_DEPENDENTS,RELATIONSHIP_LENGTH_CAT,...,EMPLOYEE_NO_between 11-20,EMPLOYEE_NO_between 21-50,EMPLOYEE_NO_between 251-500,EMPLOYEE_NO_between 501-1.000,EMPLOYEE_NO_between 51-100,AGE GROUP_20-30,AGE GROUP_30-40,AGE GROUP_40-50,AGE GROUP_50-60,AGE GROUP_60+
0,1.590627,-0.461902,-0.293373,-0.151485,-0.826115,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.516296,-0.355827,-0.293373,-0.151485,0.624589,0,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-1.010960,-0.551476,-0.851193,-1.041745,-0.826115,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,-0.341980,-0.080818,0.450386,-0.418563,-0.618872,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.416311,-0.505903,-0.293373,-0.151485,-0.826115,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15004,-0.936629,0.041757,0.264446,-0.062459,-0.826115,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
15005,0.772985,-0.913703,3.797301,2.430270,0.831832,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15006,0.104006,2.326690,0.822265,0.293645,1.246319,0,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15007,0.995979,1.493491,0.822265,0.293645,-0.826115,1,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


- 4. Building model

In [201]:
train_data = processed_train_data
test_data = processed_test_data

In [203]:
#Version with only most important columns

selected_features = ['LENGTH_RELATIONSHIP_WITH_CLIENT',
 'SENIORITY_RELATIONSHIP_RATIO',
 'RELATIONSHIP_LENGTH_CAT',
 'INCOME_AGE_INTERACTION',
 'INCOME_SENIORITY_INTERACTION',
 'INCOME',
 'CAREER_STABILITY_RATIO',
 'BUSINESS AGE',
 'CURRENT_ACCOUNT',
 'MARITAL_STATUS_married']

X_train = train_data[selected_features]

y_train = train_data['FINALIZED_LOAN']

X_test = test_data[selected_features]

y_test = test_data['FINALIZED_LOAN']

In [204]:
# Initialize models

params = {
    'var_smoothing': 1e-9
}

nb_model = GaussianNB(**params)

# Fit models
nb_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_nb = nb_model.predict(X_test)

# Calculate accuracies
accuracy_nb = balanced_accuracy_score(y_test, y_pred_nb)

recall_nb = recall_score(y_test, y_pred_nb)

gini_nb = 2 * roc_auc_score(y_test, y_pred_nb) - 1



accuracy_nb, recall_nb, gini_nb

(0.8466818398348595, 0.8971428571428571, 0.6933636796697189)