In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from matplotlib import pyplot as plt

In [3]:
# Load the application_train dataset
def load_application_train():
    data = pd.read_csv("application_train.csv")
    return data

In [5]:
# Load the dataset
df = load_application_train()
print(df.shape) 

(307511, 122)


In [7]:
# Function to grab column names
def grab_col_names(dataframe, cat_th=10, car_th=20):
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and dataframe[col].dtypes != "O"]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and dataframe[col].dtypes == "O"]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O" and col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols, num_cols, cat_but_car

In [9]:
# Grab column names
cat_cols, num_cols, cat_but_car = grab_col_names(df)

# Remove the irrelevant column 'SK_ID_CURR' from numerical columns
num_cols.remove('SK_ID_CURR')

Observations: 307511
Variables: 122
cat_cols: 54
num_cols: 67
cat_but_car: 1
num_but_cat: 39


In [11]:
# Function to calculate outlier thresholds
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

In [13]:
# Function to check for outliers
def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

In [15]:
# Function to grab outliers
def grab_outliers(dataframe, col_name, outlier_index=False, f = 5):
    low, up = outlier_thresholds(dataframe, col_name)
    outliers = dataframe[((dataframe[col_name] < low) | (dataframe[col_name] > up))]
    
    if outliers.shape[0] > 10:
        print(outliers.head(f))
    else:
        print(outliers)
    
    if outlier_index:
        out_index = outliers.index
        return out_index

In [17]:
# Function to remove outliers
def remove_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    df_without_outliers = dataframe[~((dataframe[col_name] < low_limit) | (dataframe[col_name] > up_limit))]
    return df_without_outliers

In [19]:
# Function to replace outliers with thresholds
def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit

In [21]:
# Check which numerical columns have outliers
for col in num_cols:
    print(f'{col}: {check_outlier(df, col)}')

CNT_CHILDREN: True
AMT_INCOME_TOTAL: True
AMT_CREDIT: True
AMT_ANNUITY: True
AMT_GOODS_PRICE: True
REGION_POPULATION_RELATIVE: True
DAYS_BIRTH: False
DAYS_EMPLOYED: True
DAYS_REGISTRATION: True
DAYS_ID_PUBLISH: False
OWN_CAR_AGE: True
CNT_FAM_MEMBERS: True
HOUR_APPR_PROCESS_START: True
EXT_SOURCE_1: False
EXT_SOURCE_2: False
EXT_SOURCE_3: False
APARTMENTS_AVG: True
BASEMENTAREA_AVG: True
YEARS_BEGINEXPLUATATION_AVG: True
YEARS_BUILD_AVG: True
COMMONAREA_AVG: True
ELEVATORS_AVG: True
ENTRANCES_AVG: True
FLOORSMAX_AVG: True
FLOORSMIN_AVG: True
LANDAREA_AVG: True
LIVINGAPARTMENTS_AVG: True
LIVINGAREA_AVG: True
NONLIVINGAPARTMENTS_AVG: True
NONLIVINGAREA_AVG: True
APARTMENTS_MODE: True
BASEMENTAREA_MODE: True
YEARS_BEGINEXPLUATATION_MODE: True
YEARS_BUILD_MODE: True
COMMONAREA_MODE: True
ELEVATORS_MODE: True
ENTRANCES_MODE: True
FLOORSMAX_MODE: True
FLOORSMIN_MODE: True
LANDAREA_MODE: True
LIVINGAPARTMENTS_MODE: True
LIVINGAREA_MODE: True
NONLIVINGAPARTMENTS_MODE: True
NONLIVINGAREA_MODE: 

In [23]:
# Display outliers for a few columns
print("\nOutliers in 'CNT_CHILDREN':")
grab_outliers(df, "CNT_CHILDREN")


Outliers in 'CNT_CHILDREN':
     SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
91       100108       0         Cash loans           F            N   
92       100110       0         Cash loans           M            Y   
144      100166       0         Cash loans           F            N   
180      100209       1    Revolving loans           M            N   
182      100211       0         Cash loans           M            N   

    FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
91                Y             3          171000.0    545040.0      31288.5   
92                Y             3          135000.0    373140.0      25065.0   
144               Y             3           58500.0    152820.0      15241.5   
180               Y             3          180000.0    540000.0      27000.0   
182               N             3          225000.0    746280.0      49873.5   

     ...  FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FL

In [25]:
print("\nOutliers in 'AMT_INCOME_TOTAL':")
grab_outliers(df, "AMT_INCOME_TOTAL")


Outliers in 'AMT_INCOME_TOTAL':
    SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
7       100010       0         Cash loans           M            Y   
22      100026       0         Cash loans           F            N   
33      100039       0         Cash loans           M            Y   
49      100056       0         Cash loans           M            Y   
51      100059       0         Cash loans           M            Y   

   FLAG_OWN_REALTY  CNT_CHILDREN  AMT_INCOME_TOTAL  AMT_CREDIT  AMT_ANNUITY  \
7                Y             0          360000.0   1530000.0      42075.0   
22               N             1          450000.0    497520.0      32521.5   
33               N             1          360000.0    733315.5      39069.0   
49               Y             0          360000.0   1506816.0      49927.5   
51               Y             1          540000.0    675000.0      34596.0   

    ...  FLAG_DOCUMENT_18 FLAG_DOCUMENT_19 FLAG_DOCUMENT_20 FLAG_DOCUME

In [27]:
# Remove outliers from numerical columns
for col in num_cols:
    df = remove_outlier(df, col)

print("\nShape after removing outliers:", df.shape)


Shape after removing outliers: (68454, 122)


In [29]:
# Check again for outliers
for col in num_cols:
    print(f'{col}: {check_outlier(df, col)}')

CNT_CHILDREN: False
AMT_INCOME_TOTAL: True
AMT_CREDIT: True
AMT_ANNUITY: True
AMT_GOODS_PRICE: False
REGION_POPULATION_RELATIVE: False
DAYS_BIRTH: False
DAYS_EMPLOYED: True
DAYS_REGISTRATION: True
DAYS_ID_PUBLISH: False
OWN_CAR_AGE: True
CNT_FAM_MEMBERS: False
HOUR_APPR_PROCESS_START: True
EXT_SOURCE_1: False
EXT_SOURCE_2: False
EXT_SOURCE_3: False
APARTMENTS_AVG: True
BASEMENTAREA_AVG: False
YEARS_BEGINEXPLUATATION_AVG: True
YEARS_BUILD_AVG: False
COMMONAREA_AVG: True
ELEVATORS_AVG: True
ENTRANCES_AVG: False
FLOORSMAX_AVG: False
FLOORSMIN_AVG: False
LANDAREA_AVG: True
LIVINGAPARTMENTS_AVG: False
LIVINGAREA_AVG: False
NONLIVINGAPARTMENTS_AVG: True
NONLIVINGAREA_AVG: True
APARTMENTS_MODE: False
BASEMENTAREA_MODE: False
YEARS_BEGINEXPLUATATION_MODE: True
YEARS_BUILD_MODE: False
COMMONAREA_MODE: True
ELEVATORS_MODE: False
ENTRANCES_MODE: False
FLOORSMAX_MODE: False
FLOORSMIN_MODE: False
LANDAREA_MODE: True
LIVINGAPARTMENTS_MODE: False
LIVINGAREA_MODE: False
NONLIVINGAPARTMENTS_MODE: False

In [31]:
# Replace outliers with threshold values for numerical columns
for col in num_cols:
    replace_with_thresholds(df, col)

  dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
  dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
  dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit


In [33]:
# Check again for outliers after replacing
for col in num_cols:
    print(f'{col}: {check_outlier(df, col)}')

CNT_CHILDREN: False
AMT_INCOME_TOTAL: False
AMT_CREDIT: False
AMT_ANNUITY: False
AMT_GOODS_PRICE: False
REGION_POPULATION_RELATIVE: False
DAYS_BIRTH: False
DAYS_EMPLOYED: False
DAYS_REGISTRATION: False
DAYS_ID_PUBLISH: False
OWN_CAR_AGE: False
CNT_FAM_MEMBERS: False
HOUR_APPR_PROCESS_START: False
EXT_SOURCE_1: False
EXT_SOURCE_2: False
EXT_SOURCE_3: False
APARTMENTS_AVG: False
BASEMENTAREA_AVG: False
YEARS_BEGINEXPLUATATION_AVG: False
YEARS_BUILD_AVG: False
COMMONAREA_AVG: False
ELEVATORS_AVG: False
ENTRANCES_AVG: False
FLOORSMAX_AVG: False
FLOORSMIN_AVG: False
LANDAREA_AVG: False
LIVINGAPARTMENTS_AVG: False
LIVINGAREA_AVG: False
NONLIVINGAPARTMENTS_AVG: False
NONLIVINGAREA_AVG: False
APARTMENTS_MODE: False
BASEMENTAREA_MODE: False
YEARS_BEGINEXPLUATATION_MODE: False
YEARS_BUILD_MODE: False
COMMONAREA_MODE: False
ELEVATORS_MODE: False
ENTRANCES_MODE: False
FLOORSMAX_MODE: False
FLOORSMIN_MODE: False
LANDAREA_MODE: False
LIVINGAPARTMENTS_MODE: False
LIVINGAREA_MODE: False
NONLIVINGAPART

## MANAV MALHOTRA
## CT_CSI_DS_4863
## manavmalhotra173@gmail.com
## 9911420736