In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore")

### Import annual feature data; Extract X and y

In [2]:
annual_features = pd.read_csv('annual_features.csv')

In [3]:
X = annual_features.drop(['response'], axis = 1)
y = annual_features['response']

### Split train and test set and compute the completeness of X_train

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=2/3, random_state=1147)

print("The percentage of non-zero elements in X_train is: {:.2f} %".format(np.count_nonzero(X_train)/ X_train.size * 100))

The percentage of non-zero elements in X_train is: 63.66 %


In [5]:
colnames = X_train.columns
completeness = []
for i in range(len(X_train.columns)):
    completeness.append(np.count_nonzero(X_train.iloc[:,i]) /len(X_train.iloc[:,i]) * 100)
    # Since the dataset doesnot contain NaN value, count completeness using the non-zero counts

### Standardize the train and test set

In [6]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# rf = RandomForestClassifier(random_state=1148)
# params = {'n_estimators':[20,50,200,500],'max_features':["auto", "sqrt", "log2"]}
# # Gridsearch best hyperparameters
# grid = GridSearchCV(rf, param_grid=params, scoring='accuracy', cv=5)
# grid.fit(X_train, y_train)
# grid.best_params_

In [8]:
rf = RandomForestClassifier(n_estimators = 500, max_features="auto", random_state=1148)
rf.fit(X_train, y_train) 
importances = rf.feature_importances_

Using GridSearchCV tuning the best parameters of random forest classifier, and fit the model get the feature importance.

### Create the completeness-feature importance graph

In [9]:
feature_importance = pd.DataFrame(importances, index=colnames, columns=['Feature Importance'])
feature_importance = feature_importance.sort_values(by=['Feature Importance'], ascending=False)
feature_importance.head(10)

Unnamed: 0,Feature Importance
2014_ann_txn_amt_sum,0.048159
2013_ann_txn_amt_sum,0.036371
2014_ann_txn_amt_cnt,0.029129
2013_ann_txn_amt_ave,0.02713
2014_ann_txn_amt_ave,0.026992
2012_ann_txn_amt_ave,0.024073
2013_ann_txn_amt_sem,0.023371
2012_ann_txn_amt_sem,0.023228
2014_ann_txn_amt_max,0.021767
2013_ann_txn_amt_var,0.021751


In [10]:
Completeness = pd.DataFrame(log_completeness, index=colnames, columns=['Completeness'])
Completeness = Completeness.sort_values(by=['Completeness'], ascending=False)
Completeness.head(10)

NameError: name 'log_completeness' is not defined

In [None]:
plt.figure(figsize=(8,6))
log_completeness = np.log(np.array(completeness))
plt.plot(log_completeness, importances, 'o', color='black')
plt.xlabel('Completeness(%)-log scale')
plt.ylabel('Feature Importance')
plt.title('Completeness vs Feature Importance')
plt.show()

#### Comment

There is no missing value in this dataset, as in previous assignments the missing values are filled with 0. This the completeness is computed by count the non-zero element in each column of the training set.   
The completeness-feature importance graph above show s a strong correlation between completeness of training set in log-scale and feature importance. As the completeness of feature increases, the feature importance grows from 0.00 to 0.05. Features with completeness greater than 4.5 (log scale) are have higher feature importance, the highest would be approximate 0.05.  
From the graph, higher completeness tends to have higher feature importance. Thus, the data should be redesigned with the features with higher completeness, which means there should be less missing values in the data. When data collecting, all useful information should be collected.