In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Read the train and test datasets

In [2]:
x_train = pd.read_parquet('/content/drive/MyDrive/Project/American Express Default/Dataset/x_train.parquet')
x_test = pd.read_parquet('/content/drive/MyDrive/Project/American Express Default/Dataset/x_test.parquet')
y_train = pd.read_csv('/content/drive/MyDrive/Project/American Express Default/Dataset/y_train.csv')
y_test = pd.read_csv('/content/drive/MyDrive/Project/American Express Default/Dataset/y_test.csv')

In [3]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321239 entries, 0 to 321238
Columns: 168 entries, Day to D_68_6
dtypes: float32(26), int16(9), int64(3), int8(75), uint8(55)
memory usage: 84.6 MB


In [4]:
x_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137674 entries, 0 to 137673
Columns: 168 entries, Day to D_68_6
dtypes: float32(26), int16(9), int64(3), int8(75), uint8(55)
memory usage: 36.2 MB


In [5]:
y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321239 entries, 0 to 321238
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   target  321239 non-null  int64
dtypes: int64(1)
memory usage: 2.5 MB


In [6]:
y_test.head(n = 5)

Unnamed: 0,target
0,0
1,1
2,1
3,0
4,1


## Split into numerical and categorical variables

In [7]:
categorical_variable = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
                        'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# use a regular expression to match columns that begin with the categories
one_hot_columns = x_train.filter(regex='^(' + '|'.join(categorical_variable) + ')')

# print the matching columns
cat_cols = list(one_hot_columns.columns)
x_train_categorical = x_train[cat_cols]

# get the names of the numerical columns
num_cols = list(set(x_train.columns) - set(one_hot_columns.columns))
x_train_numerical = x_train[num_cols]

## Feature Selection

Getter function for mutual information (numerical variables)

In [8]:
from sklearn.feature_selection import mutual_info_classif

def mutual_info_df(x_train_variable, y_train, variable_type):
  # Compute mutual information between each feature in X and y
  mutual_info = mutual_info_classif(x_train_variable, y_train, random_state=42)

  # Create a DataFrame to store the mutual information scores for each feature
  mi_df = pd.DataFrame({'feature': variable_type, 'mi_score': mutual_info})

  # Sort the features by their mutual information scores in descending order
  mi_df = mi_df.sort_values(by='mi_score', ascending=False)

  # REturn the mutual information scores for each feature
  return mi_df

Getter function for chi-square (categorical variables)

In [25]:
from sklearn.feature_selection import chi2, SelectKBest

# feature selection
def chi2_select_features(X_train, y_train):
  # Set seed to 42
  np.random.seed(42)
  fs = SelectKBest(score_func=chi2, k='all')
  fs.fit(X_train, y_train)
  return fs

def chi_square_df(x_train_categorical,fs1):
  # Create dataframe to store results
  results_df = pd.DataFrame(columns=['feature', 'Chi-Square Score'])

  # Loop through each feature and store score
  for i in range(len(x_train_categorical.columns)):
      feature = x_train_categorical.columns[i]
      score = fs1.scores_[i]
      results_df = pd.concat([results_df,
                              pd.DataFrame({'feature': [feature],
                                                        'Chi-Square Score': [score]})])
  # Sort results dataframe by Score column in descending order
  results_df = results_df.sort_values(by='Chi-Square Score', ascending=False)

  # Reset the index without adding the old index as a column
  results_df = results_df.reset_index(drop=True)

  # Return results dataframe
  return results_df

#### Numerical variables

Check for pairwise correlation between the numerical variables

In [11]:
# Compute the correlation matrix using numpy
corr_matrix = np.corrcoef(x_train_numerical, rowvar=False)

In [12]:
corr_matrix

array([[ 1.00000000e+00,  2.80184110e-02,  2.00878831e-01, ...,
         8.36539624e-02,  1.54148809e-02,  1.09710102e-01],
       [ 2.80184110e-02,  1.00000000e+00,  9.05980557e-02, ...,
         2.50781984e-01, -5.69105646e-04, -2.68832312e-02],
       [ 2.00878831e-01,  9.05980557e-02,  1.00000000e+00, ...,
         1.81510459e-01,  1.68986262e-02,  1.18604929e-02],
       ...,
       [ 8.36539624e-02,  2.50781984e-01,  1.81510459e-01, ...,
         1.00000000e+00,  6.71581339e-03,  3.24243816e-02],
       [ 1.54148809e-02, -5.69105646e-04,  1.68986262e-02, ...,
         6.71581339e-03,  1.00000000e+00,  3.05555281e-03],
       [ 1.09710102e-01, -2.68832312e-02,  1.18604929e-02, ...,
         3.24243816e-02,  3.05555281e-03,  1.00000000e+00]])

In [14]:
# Mask the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(60,60))
# Create a heatmap of the correlation matrix using seaborn
sns.heatmap(corr_matrix, mask = mask,
            cmap='coolwarm', center=0,
            annot=True, fmt='.2f',
            xticklabels=x_train_numerical.columns,
            yticklabels=x_train_numerical.columns,
            cbar = False)

Output hidden; open in https://colab.research.google.com to view.

In [15]:
# Find the indices of the elements in the corr_matrix array that have an absolute value > 0.7
high_corr_indices = np.where(np.abs(corr_matrix) >= 0.7)

# Create a set of variable names with high collinearity
high_corr_vars = set()
for i, j in zip(high_corr_indices[0], high_corr_indices[1]):
    if i != j and np.abs(corr_matrix[i, j]) > 0.7:
        high_corr_vars.add(x_train_numerical.columns[i])
        high_corr_vars.add(x_train_numerical.columns[j])
        
# Print the set of variable names with high collinearity
len(high_corr_vars)

36

In [34]:
listhigh_corr_vars

{'B_1',
 'B_11',
 'B_14',
 'B_16',
 'B_18',
 'B_20',
 'B_23',
 'B_28',
 'B_33',
 'B_36',
 'B_4',
 'B_7',
 'D_103',
 'D_107',
 'D_111',
 'D_123',
 'D_125',
 'D_135',
 'D_136',
 'D_137',
 'D_138',
 'D_139',
 'D_143',
 'D_44',
 'D_58',
 'D_74',
 'D_75',
 'R_13',
 'R_2',
 'R_21',
 'R_4',
 'R_5',
 'R_8',
 'S_15',
 'S_20',
 'S_8'}

Drop these collinear columns

In [35]:
x_train_numerical = x_train_numerical.drop(high_corr_vars, axis=1)

In [41]:
# Subtract high_corr_vars from num_cols to get the remaining columns
remaining_num_cols = list(set(num_cols) - high_corr_vars)

# Convert remaining_cols back to a list and print
print(list(remaining_num_cols))

['D_78', 'D_91', 'D_108', 'D_71', 'D_96', 'B_19', 'D_83', 'D_65', 'R_22', 'D_92', 'B_12', 'D_122', 'D_86', 'R_18', 'D_145', 'D_39', 'S_19', 'S_6', 'D_81', 'D_59', 'R_25', 'R_24', 'D_84', 'S_17', 'D_51', 'B_21', 'D_87', 'D_82', 'B_24', 'S_5', 'D_113', 'D_80', 'D_109', 'B_10', 'D_79', 'D_94', 'D_49', 'B_5', 'R_1', 'R_16', 'Month', 'R_15', 'R_28', 'P_4', 'R_6', 'S_11', 'D_89', 'D_127', 'R_19', 'B_9', 'D_106', 'B_41', 'Day', 'R_11', 'D_47', 'S_18', 'B_22', 'D_140', 'D_70', 'B_31', 'D_93', 'D_129', 'R_9', 'S_16', 'B_32', 'R_20', 'D_72', 'R_17', 'R_3', 'Year', 'S_13', 'S_12', 'R_10', 'D_60', 'D_124', 'R_23', 'R_26']


In [None]:
x_train_numerical.info()

Mutual information gain

In [42]:
mf1 = mutual_info_df(x_train_numerical, y_train, remaining_num_cols)

  y = column_or_1d(y, warn=True)


In [45]:
mf1.tail(n = 10)

Unnamed: 0,feature,mi_score
4,D_96,0.002513
8,R_22,0.002222
32,D_109,0.001465
42,R_28,0.000783
20,R_25,0.000541
75,R_23,0.000429
13,R_18,0.000402
16,S_19,0.000176
60,D_93,0.000164
55,S_18,0.0


Drop S_18 as well

In [47]:
remaining_num_cols.remove('S_18')

#### Categorical variables

###### Mutual information

In [None]:
mf2 = mutual_info_df(x_train_categorical, y_train['target'], cat_cols)

###### Chi-square test

In [None]:
fs1 = chi2_select_features(x_train_categorical,y_train)
cat_df = chi_square_df(x_train_categorical,fs1)

In [28]:
cat_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   feature           55 non-null     object 
 1   Chi-Square Score  55 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1008.0+ bytes


In [27]:
mf2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 1 to 4
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   feature   55 non-null     object 
 1   mi_score  55 non-null     float64
dtypes: float64(1), object(1)
memory usage: 1.3+ KB


In [None]:
cat_df1 = pd.merge(cat_df,mf2, on = 'feature')
cat_df2 = cat_df1.sort_values(by = ['Chi-Square Score', 'mi_score'])

In [49]:
cat_df2.head(n = 10)

Unnamed: 0,feature,Chi-Square Score,mi_score
54,D_64_1,0.328338,0.0
53,D_63_2,0.923026,9.1e-05
51,B_38_-1,1.910925,0.0
52,B_30_-1,1.910925,0.00011
50,D_68_0,5.021705,0.0
49,D_68_5,15.854065,0.003779
48,D_63_5,17.219361,0.0
47,D_66_0,32.696543,0.0
46,D_63_1,39.388909,0.0
45,D_117_4,60.226377,0.004252


Drop D_64_1, D_63_2, B_38_-1, B_30_-1, D_68_0	columns due to low mi and chi-square scores

In [50]:
cat_cols.remove('D_64_1')
cat_cols.remove('D_63_2')
cat_cols.remove('B_38_-1')
cat_cols.remove('B_30_-1')
cat_cols.remove('D_68_0')

In [64]:
len(cat_cols)

50

Combine the cat_cols and the numerical cols again

In [None]:
final_cols = cat_cols + remaining_num_cols

Recreate the x_train and x_test dataframe again

In [55]:
x_train1 = x_train[final_cols]
x_test1 = x_test[final_cols]

In [62]:
x_train1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321239 entries, 0 to 321238
Columns: 126 entries, B_30_0 to R_26
dtypes: float32(17), int16(7), int64(3), int8(49), uint8(50)
memory usage: 62.8 MB


In [56]:
x_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137674 entries, 0 to 137673
Columns: 126 entries, B_30_0 to R_26
dtypes: float32(17), int16(7), int64(3), int8(49), uint8(50)
memory usage: 26.9 MB


## Adasyn the x_train dataset

Feature selection must happen first before the upsampling method.

https://stackoverflow.com/questions/63375860/sampling-before-or-after-feature-selection

In [57]:
from imblearn.over_sampling import ADASYN
from collections import Counter

# 238059, 83180
# define the ADASYN model with a ratio of 0.65 for minority class
adasyn = ADASYN(sampling_strategy=0.65)

# fit and resample the training data
X_resampled, y_resampled = adasyn.fit_resample(x_train1, y_train)

In [61]:
X_resampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 396347 entries, 0 to 396346
Columns: 126 entries, B_30_0 to R_26
dtypes: float32(17), int16(7), int64(3), int8(49), uint8(50)
memory usage: 77.5 MB


## Export the dataset

In [58]:
import os

In [59]:
x_train1.to_parquet('/content/drive/MyDrive/Project/American Express Default/Dataset/x_train_baseline.parquet', index = False)
x_test1.to_parquet('/content/drive/MyDrive/Project/American Express Default/Dataset/x_test_baseline.parquet', index = False)

In [60]:
X_resampled.to_parquet('/content/drive/MyDrive/Project/American Express Default/Dataset/x_train_adasyn.parquet', index = False)
y_resampled.to_parquet('/content/drive/MyDrive/Project/American Express Default/Dataset/y_train_adasyn.parquet', index = False)