In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB, ComplementNB
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('C:\\Users\\world\\Desktop\\MSAAI\\ML Fundamentals\\application_train.csv')

### data cleaning

In [3]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 122 columns):
 #    Column                        Dtype  
---   ------                        -----  
 0    SK_ID_CURR                    int64  
 1    TARGET                        int64  
 2    NAME_CONTRACT_TYPE            object 
 3    CODE_GENDER                   object 
 4    FLAG_OWN_CAR                  object 
 5    FLAG_OWN_REALTY               object 
 6    CNT_CHILDREN                  int64  
 7    AMT_INCOME_TOTAL              float64
 8    AMT_CREDIT                    float64
 9    AMT_ANNUITY                   float64
 10   AMT_GOODS_PRICE               float64
 11   NAME_TYPE_SUITE               object 
 12   NAME_INCOME_TYPE              object 
 13   NAME_EDUCATION_TYPE           object 
 14   NAME_FAMILY_STATUS            object 
 15   NAME_HOUSING_TYPE             object 
 16   REGION_POPULATION_RELATIVE    float64
 17   DAYS_BIRTH                    int64  
 18   DA

Making new target column from 'NAME_CONTRACT_STATUS' column in Previous_application.csv

In [4]:
X = df.drop(columns=['TARGET', 'SK_ID_CURR'])
y = df['TARGET']

Remove rows with target missing.

In [5]:
missing_target = y.isnull()

X = X[~missing_target]
y = y[~missing_target]

# Reset the indices
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

Remove blank/uniform columns by calculating the non-null unique values in each column and identifying blank/uniform-value columns.

In [6]:
num_unique = X.nunique()

blank_uniform_cols = num_unique == 1

X = X.drop(blank_uniform_cols[blank_uniform_cols].index, axis=1)

# Print the remaining columns
print(X.columns)

Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE',
       ...
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'AMT_REQ_CREDIT_BUREAU_HOUR',
       'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK',
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object', length=120)


Remove outliers. I am removing very extreme outliers to try to maintain so data integrity.

In [7]:
num_col = X.select_dtypes(include=['number']).columns
cat_col = X.select_dtypes(exclude=['number']).columns

In [8]:
print(len(num_col))

104


In [9]:
z_scores = (X[num_col] - X[num_col].mean()) / X[num_col].std()

In [10]:
threshold = 4

In [11]:
outliers_z = (z_scores > threshold).any(axis=1)

In [12]:
Q1 = X[num_col].quantile(0.03)
Q3 = X[num_col].quantile(0.97)
IQR = Q3 - Q1
outliers_iqr = ((X[num_col] < (Q1 - 1.5 * IQR)) | (X[num_col] > (Q3 + 1.5 * IQR))).any(axis=1)

In [13]:
outliers_combined = outliers_z | outliers_iqr

In [14]:
num_outliers = outliers_combined.sum()
percentage_outliers = (num_outliers / len(X)) * 100
print("Number of outliers:", num_outliers)
print("Percentage of outliers:", percentage_outliers)

Number of outliers: 76131
Percentage of outliers: 24.757163158391084


In [15]:
outlier_mask = (z_scores > threshold)

In [16]:
X_filtered = X[~outliers_combined]
y_filtered = y[~outliers_combined]

X_filtered = X_filtered.reset_index(drop=True)
y_filtered = y_filtered.reset_index(drop=True)

Dealing with missing values in categorical columns. Identifying categorical columns with missing values and calculating their missing percent.

In [17]:
categorical_cols = X.select_dtypes(exclude=['number']).columns

categorical_cols_with_missing_values = categorical_cols[X[categorical_cols].isnull().any().values]

# Print columns with missing values
print(categorical_cols_with_missing_values.tolist())

['NAME_TYPE_SUITE', 'OCCUPATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']


Calculate the percentage of missing values in each categorical column

In [18]:
missing_percentage = X_filtered[categorical_cols_with_missing_values].isnull().mean() * 100

# Print percentage of missing values in each column
print(missing_percentage)

NAME_TYPE_SUITE         0.248941
OCCUPATION_TYPE        33.141585
FONDKAPREMONT_MODE     69.973204
HOUSETYPE_MODE         52.463480
WALLSMATERIAL_MODE     53.013657
EMERGENCYSTATE_MODE    49.629181
dtype: float64


Not many missing values, I will likely drop the NAME_TYPE_SUITE column since there a lot of missing values here.

Filtering columns and rows with a high amount of missing values. I am setting the threshold at 51% to get rid of the NAME_TYPE_SUITE column.

In [19]:
missing_percentage = X.isnull().mean() * 100

missing_threshold = 0.50  # Threshold to get rid of a few columns with a lot of missing values

column_threshold = len(X) * missing_threshold
X_filtered = X.dropna(thresh=column_threshold, axis=1)

# Categorical columns with missing values
categorical_cols = X_filtered.select_dtypes(exclude=['number']).columns
categorical_cols_with_missing_values = categorical_cols[X_filtered[categorical_cols].isnull().any()].tolist()

# Print columns with missing values
print(categorical_cols_with_missing_values)

# Removing rows with missing values
row_missing_threshold = 0.3  # This is an arbitrary threshold
row_percentage_threshold = row_missing_threshold * 100
X_filtered = X_filtered.dropna(axis=0, thresh=X_filtered.shape[1] - row_percentage_threshold)
y_filtered = y[X_filtered.index]

# Print the percentage of missing values in each column
missing_percentage = X_filtered[categorical_cols_with_missing_values].isnull().mean() * 100
print(missing_percentage)

['NAME_TYPE_SUITE', 'OCCUPATION_TYPE', 'EMERGENCYSTATE_MODE']
NAME_TYPE_SUITE         0.420148
OCCUPATION_TYPE        31.345545
EMERGENCYSTATE_MODE    47.398304
dtype: float64


Using imputing so I do not remove NaN values.


In [20]:
# Impute missing values for numerical columns using mean imputation
numerical_cols = X_filtered.select_dtypes(include=['number']).columns
imputer_numeric = SimpleImputer(strategy='mean')
X_filtered[numerical_cols] = imputer_numeric.fit_transform(X_filtered[numerical_cols])

# Impute missing values for categorical columns using mode imputation
categorical_cols = X_filtered.select_dtypes(exclude=['number']).columns
imputer_categorical = SimpleImputer(strategy='most_frequent')
X_filtered[categorical_cols] = imputer_categorical.fit_transform(X_filtered[categorical_cols])


Using LabelEncoder on categorical columns.

In [21]:
label_encoder = LabelEncoder()
X_encoded = X_filtered.copy()

for col in categorical_cols:
    X_encoded[col] = label_encoder.fit_transform(X_encoded[col])

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)