In [None]:
## IMPORTS ##

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib for plotting
import matplotlib.pyplot as plt

# garbage collector
import gc

# Data-loading helper
def load_credit_data(data_path, data_dir="data"):
    """Read a CSV file from the data directory into a DataFrame.

    Parameters
    ----------
    data_path : str
        File name (or path relative to ``data_dir``) of the CSV to read.
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to ``"data"``, which is
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_csv`` returns for the joined path.
    """
    csv_path = os.path.join(data_dir, data_path)
    return pd.read_csv(csv_path)

# Read the preprocessed training set and report its (rows, columns) shape
training_df = load_credit_data("training_merged_preprocessed_id.csv")
print(training_df.shape)

# Read the preprocessed test set and report its (rows, columns) shape
testing_df = load_credit_data("testing_merged_preprocessed_id.csv")
print(testing_df.shape)

# Read the table of per-feature importances
feature_importances = load_credit_data("feature_importances_merged_sorted.csv")

# Collect the names of features whose importance is below the
# (arbitrarily chosen) 0.0005 cutoff
low_importance_rows = feature_importances[feature_importances['importance'] < 0.0005]
poor_features = list(low_importance_rows['feature'])
print('There are %d features with less 0.0005 importance' % len(poor_features))

# Drop those columns from both the training and the test frame
top_training_df = training_df.drop(poor_features, axis=1)
top_testing_df = testing_df.drop(poor_features, axis=1)

print('Training shape: ', top_training_df.shape)
print('Testing shape: ', top_testing_df.shape)

# Threshold for removing correlated variables: a column is dropped when its
# absolute correlation with an earlier column exceeds this value
threshold = 0.9

# Absolute value correlation matrix, computed on the training frame only
corr_matrix = top_training_df.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so every
# pair of columns is examined once and no column is compared with itself.
# The builtin `bool` is used as the dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Select columns with correlations above threshold; NaN entries (the masked
# lower triangle and diagonal) compare as False and are therefore ignored
corr_columns = [column for column in upper.columns if (upper[column] > threshold).any()]

print('There are %d columns to remove.' % (len(corr_columns)))

# The columns chosen from the training correlations are removed from both
# frames so that training and test keep the same feature set
top_training_df = top_training_df.drop(columns = corr_columns)
top_testing_df = top_testing_df.drop(columns = corr_columns)

print('Training shape: ', top_training_df.shape)
print('Testing shape: ', top_testing_df.shape)



(307511, 510)
(48744, 510)
There are 175 features with less 0.0005 importance
Training shape:  (307511, 335)
Testing shape:  (48744, 335)
There are 67 columns to remove.
Training shape:  (307511, 268)
Testing shape:  (48744, 268)


In [None]:
# Write the reduced training and test frames to CSV files in the current
# working directory, without the index column
for frame, out_name in (
    (top_training_df, 'training_top_id.csv'),
    (top_testing_df, 'testing_top_id.csv'),
):
    frame.to_csv(out_name, index=False)