In [131]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures

In [132]:
train = pd.read_csv("Darrel_Dataset/train.csv")
test = pd.read_csv("Darrel_Dataset/test.csv")

In [133]:
train.shape, test.shape

((307507, 238), (48744, 237))

Welcome to the first part of feature engineering. Here, we are gonna explore some feature that can be generated by multiplying, dividing, subtracting, and adding 2 (or more) column(s). 

In [134]:
import sys

def return_size(df):
    """Return size of dataframe in gigabytes"""
    return round(sys.getsizeof(df) / 1e9, 2)

def convert_types(df, print_info = False):
    
    original_memory = df.memory_usage().sum()
    
    # Iterate through each column
    for c in df:
        
        # Convert ids and booleans to integers
        if ('SK_ID' in c):
            df[c] = df[c].fillna(0).astype(np.int32)
            
        # Convert objects to category
        elif (df[c].dtype == 'object') and (df[c].nunique() < df.shape[0]):
            df[c] = df[c].astype('category')
        
        # Booleans mapped to integers
        elif list(df[c].unique()) == [1, 0]:
            df[c] = df[c].astype(bool)
        
        # Float64 to float32
        elif df[c].dtype == float:
            df[c] = df[c].astype(np.float32)
            
        # Int64 to int32
        elif df[c].dtype == int:
            df[c] = df[c].astype(np.int32)
        
    new_memory = df.memory_usage().sum()
    
    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')
        
    return df

First of all, let's look at the correlation matrix of the train and test dataset that we produced.

In [135]:
corr = train.corr()

In [136]:
plt.figure(figsize=(200,200))
sns.heatmap(corr, annot=True)
plt.show()

Let's start by doing the same thing as Will have done, which is to make polynomial feature using `EXT_SOURCE` and `DAYS_BIRTH`

In [137]:
def polynomial_feature(df1, df2, columns, target, degree, save):
    # Make a new dataframe for polynomial features
    poly_features = df1[columns + [target]]
    poly_features_test = df2[columns]

    # imputer for handling missing values
    imputer = SimpleImputer(strategy='median')

    poly_target = poly_features[target]

    poly_features = poly_features.drop(columns=[target])

    # Need to impute missing values
    poly_features = imputer.fit_transform(poly_features)
    poly_features_test = imputer.transform(poly_features_test)
                                    
    # Create the polynomial object with specified degree
    poly_transformer = PolynomialFeatures(degree=degree)

    # Train the polynomial features
    poly_transformer.fit(poly_features)

    # Transform the features
    poly_features = poly_transformer.transform(poly_features)
    poly_features_test = poly_transformer.transform(poly_features_test)

    # Create a dataframe of the features 
    poly_features = pd.DataFrame(poly_features, 
                                columns=poly_transformer.get_feature_names_out(columns))

    poly_features_test = pd.DataFrame(poly_features_test, 
                                columns=poly_transformer.get_feature_names_out(columns))

    # Add in the target
    poly_features[target] = poly_target

    print("Poly features shape is", poly_features.shape)

    # Find the correlations with the target
    poly_corrs = poly_features.corr()[target].sort_values()
    print("Top 20 correlations with the target:")
    print(poly_corrs.head(20))
    print("\nBottom 20 correlations with the target:")
    print(poly_corrs.tail(20))


        # Merge polynomial features into training dataframe
    poly_features['SK_ID_CURR'] = df1['SK_ID_CURR']
    app_train_poly = df1.merge(poly_features, on='SK_ID_CURR', how='left')

    # Merge polynomial features into testing dataframe
    poly_features_test['SK_ID_CURR'] = df2['SK_ID_CURR']
    app_test_poly = df2.merge(poly_features_test, on='SK_ID_CURR', how='left')

    # Align the dataframes
    app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join='inner', axis=1)

    app_train_poly[target] = poly_target

    # Print out the new shapes
    print('\nTraining data with polynomial features shape: ', app_train_poly.shape)
    print('Testing data with polynomial features shape:  ', app_test_poly.shape)

    if save:
        return app_train_poly, app_test_poly
    else:
        return df1, df2


In [138]:
def apply_operations_to_columns(df1, df2, columns):
    new_df1_addition = df1[columns].copy()
    new_df2_addition = df2[columns].copy()

    new_df1_subtraction = df1[columns].copy()
    new_df2_subtraction = df2[columns].copy()

    new_df1_division = df1[columns].copy()
    new_df2_division = df2[columns].copy()


    for col1 in columns:
        for col2 in columns:
            if col1 != col2:
                new_col1 = f"{col1}_plus_{col2}"
                new_col2 = f"{col1}_plus_{col2}"
                new_df1_addition[new_col1] = df1[col1] + df1[col2]
                new_df2_addition[new_col2] = df2[col1] + df2[col2]
    
    new_df1_addition["TARGET"] = df1["TARGET"]
    new_df1_addition_corr = new_df1_addition.corr()["TARGET"].sort_values()


    for col1 in columns:
        for col2 in columns:
            if col1 != col2:
                new_col1 = f"{col1}_minus_{col2}"
                new_col2 = f"{col1}_minus_{col2}"
                new_df1_subtraction[new_col1] = df1[col1] - df1[col2]
                new_df2_subtraction[new_col2] = df2[col1] - df2[col2]
    
    new_df1_subtraction["TARGET"] = df1["TARGET"]
    new_df1_subtraction_corr = new_df1_subtraction.corr()["TARGET"].sort_values()


    for col1 in columns:
        for col2 in columns:
            if col1 != col2:
                new_col1 = f"{col1}_divided_by_{col2}"
                new_col2 = f"{col1}_divided_by_{col2}"
                new_df1_division[new_col1] = df1[col1] / df1[col2]
                new_df2_division[new_col2] = df2[col1] / df2[col2]

    new_df1_division["TARGET"] = df1["TARGET"]
    new_df1_division_corr = new_df1_division.corr()["TARGET"].sort_values()

    correlations = pd.concat([new_df1_addition_corr.to_frame(), new_df1_subtraction_corr.to_frame(), new_df1_division_corr.to_frame()])
    corr_filtered = pd.concat([correlations.head(20), correlations.tail(20)])


    return corr_filtered


Let's see polynomial feature correlation with target!

In [139]:
columns_list = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'DAYS_EMPLOYED']
target_col = 'TARGET'

train, test = polynomial_feature(train, test, columns_list, target_col, degree=3, save=False)

Poly features shape is (307507, 57)
Top 20 correlations with the target:
EXT_SOURCE_2 EXT_SOURCE_3                -0.193943
EXT_SOURCE_1 EXT_SOURCE_2 EXT_SOURCE_3   -0.189608
EXT_SOURCE_2 EXT_SOURCE_3 DAYS_BIRTH     -0.181288
EXT_SOURCE_2^2 EXT_SOURCE_3              -0.176431
EXT_SOURCE_2 EXT_SOURCE_3^2              -0.172287
EXT_SOURCE_1 EXT_SOURCE_2                -0.166625
EXT_SOURCE_1 EXT_SOURCE_3                -0.164070
EXT_SOURCE_2                             -0.160294
EXT_SOURCE_2 DAYS_BIRTH                  -0.156874
EXT_SOURCE_1 EXT_SOURCE_2^2              -0.156867
EXT_SOURCE_3                             -0.155899
EXT_SOURCE_1 EXT_SOURCE_2 DAYS_BIRTH     -0.155892
EXT_SOURCE_1 EXT_SOURCE_3 DAYS_BIRTH     -0.151820
EXT_SOURCE_1 EXT_SOURCE_3^2              -0.150827
EXT_SOURCE_3 DAYS_BIRTH                  -0.150114
EXT_SOURCE_2^2                           -0.149512
EXT_SOURCE_2^2 DAYS_BIRTH                -0.149314
EXT_SOURCE_3^2 DAYS_BIRTH                -0.141782
EXT_SOURC

Looking at the features that we have made by adding, subtracting, and dividing columns

In [140]:
apply_operations_to_columns(train, test, ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_BIRTH', 'DAYS_EMPLOYED'])

Unnamed: 0,TARGET
EXT_SOURCE_3_plus_EXT_SOURCE_2,-0.223863
EXT_SOURCE_2_plus_EXT_SOURCE_3,-0.223863
EXT_SOURCE_1_plus_EXT_SOURCE_3,-0.212388
EXT_SOURCE_3_plus_EXT_SOURCE_1,-0.212388
EXT_SOURCE_1_plus_EXT_SOURCE_2,-0.194887
EXT_SOURCE_2_plus_EXT_SOURCE_1,-0.194887
EXT_SOURCE_3,-0.178926
EXT_SOURCE_2,-0.160471
EXT_SOURCE_1,-0.155317
DAYS_BIRTH_plus_DAYS_EMPLOYED,-0.083362


Good! We've done some feature engineering with `DAYS` and `EXT_SOURCE` columns. Now we can choose which columns should we use

In [149]:
train["EXT_SOURCE_3_plus_EXT_SOURCE_2"] = train["EXT_SOURCE_3"] + train["EXT_SOURCE_2"]
train["EXT_SOURCE_1_plus_EXT_SOURCE_3"] = train["EXT_SOURCE_1"] + train["EXT_SOURCE_3"]
train["EXT_SOURCE_1_plus_EXT_SOURCE_2"] = train["EXT_SOURCE_1"] + train["EXT_SOURCE_2"]
train["DAYS_BIRTH_divided_by_EXT_SOURCE_1"] = train["DAYS_BIRTH"] / train["EXT_SOURCE_1"]

In [142]:
test["EXT_SOURCE_3_plus_EXT_SOURCE_2"] = test["EXT_SOURCE_3"] + test["EXT_SOURCE_2"]
test["EXT_SOURCE_1_plus_EXT_SOURCE_3"] = test["EXT_SOURCE_1"] + test["EXT_SOURCE_3"]
test["EXT_SOURCE_1_plus_EXT_SOURCE_2"] = test["EXT_SOURCE_1"] + test["EXT_SOURCE_2"]
test["DAYS_BIRTH_divided_by_EXT_SOURCE_1"] = test["DAYS_BIRTH"] / test["EXT_SOURCE_1"]

After multiple experiments, these are the columns that produce good result.

Okay, now let's try to do feature engineering using domain knowledge!

In [143]:
train['CREDIT_INCOME_PERCENT'] = train['AMT_CREDIT'] / train['AMT_INCOME_TOTAL']
train['ANNUITY_INCOME_PERCENT'] = train['AMT_ANNUITY'] / train['AMT_INCOME_TOTAL']
train['CREDIT_TERM'] = train['AMT_ANNUITY'] / train['AMT_CREDIT']
train['DAYS_EMPLOYED_PERCENT'] = train['DAYS_EMPLOYED'] / train['DAYS_BIRTH']

test['CREDIT_INCOME_PERCENT'] = test['AMT_CREDIT'] / test['AMT_INCOME_TOTAL']
test['ANNUITY_INCOME_PERCENT'] = test['AMT_ANNUITY'] / test['AMT_INCOME_TOTAL']
test['CREDIT_TERM'] = test['AMT_ANNUITY'] / test['AMT_CREDIT']
test['DAYS_EMPLOYED_PERCENT'] = test['DAYS_EMPLOYED'] / test['DAYS_BIRTH']

In [144]:
y_train_label = train["TARGET"]
train = train.drop("TARGET", axis=1)
train, test = train.align(test, join='inner', axis=1)
train["TARGET"] = y_train_label

In [145]:
# train.to_csv("Darrel_Dataset/train_fe_little_domain.csv", index=False)
# test.to_csv("Darrel_Dataset/test_fe_little_domain.csv", index=False)

This dataset is getting considerably higher score, this means that we are on the right path. Next up, we want to do another feature engineering that is often used by kaggle grandmaster which is column aggregation. See you on the next notebook!