In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn as sns
import numpy as np
import warnings
import statsmodels.api as sm #Cross-sectional models and methods.
import statsmodels.formula.api as smf #A convenience interface for specifying models using formula strings and DataFrames.
from statsmodels.stats.outliers_influence import variance_inflation_factor
import sklearn
# Function to deal with missing values via imputation
from sklearn.impute import SimpleImputer
# Function that converts categorical values into numerical values via ordinal encoding or one-hot encoding
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
# Function to split data into different groups
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import *
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import random
import math
from scipy.stats import pointbiserialr, spearmanr

from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest

from sklearn.metrics import accuracy_score, roc_auc_score

# for logit regression. 
# statsmodel is chosen because it outputs descriptive stats for the model
import statsmodels.api as sm

# for SVM
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Statistics functions
from scipy.stats import norm
from scipy import stats
from scipy.stats import chi2_contingency
# Silence every warning raised from here on: the "ignore" action is installed
# with no category or module filter.
# NOTE(review): this also hides deprecation warnings from the modelling
# libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore") 

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline 

# Apply seaborn's "whitegrid" style to all subsequent plots.
sns.set_style("whitegrid")

#### 1.1.2. Loading the Data sets 

In [None]:
# Read the three pre-split CSV files (training, validation, testing) from the
# working directory, in that order.
df_train, df_validation, df_test = map(
    pd.read_csv,
    (
        'nedgroup_training_data.csv',
        'nedgroup_validation_data.csv',
        'nedgroup_testing_data.csv',
    ),
)

Some features have null values in almost half of their rows, so we will drop them. We drop them not only because of the null values but also because, after researching this project, we concluded that these features — `GENDER`, `SPOUSE_GENDER`, `SPOUSE_RETIREMENT_AGE`, and `SPOUSE_DATE_OF_BIRTH` — add no value to the prediction of `RETIREMENT_FUND_VALUE`.

In [None]:
def drop_features(df):
  """Return a new DataFrame without the columns excluded from modelling.

  The input frame is not modified.

  Parameters
  ----------
  df : pandas.DataFrame
      Raw data set; must contain every column listed below.

  Returns
  -------
  pandas.DataFrame
      ``df`` minus the excluded columns.

  Raises
  ------
  KeyError
      If any of the listed columns is missing from ``df``.
  """
  excluded_columns = [
      'PERCENTAGE_SUCCESS',
      'LA_EAC_PA_INCL_VAT',
      'CONFIDENCE_LEVEL',
      'RETIREMENT_AGE',
      'SPOUSE_RETIREMENT_AGE',
      'Unnamed: 0',
      'SPOUSE_DATE_OF_BIRTH',
      'SPOUSE_GENDER',
      'HAS_EMERGENCY_SAVINGS',
      'CRITICAL_ILLNESS',
      'GENDER',
      'FINANCIALLY_SUPPORT_PARTNER',
      'FINANCIALLY_SUPPORT_CHILDREN',
  ]
  # Pass the labels directly instead of slicing out a sub-frame just to hand
  # its column names to drop().
  return df.drop(columns=excluded_columns)

# Remove the excluded columns from each split.
df_train, df_test, df_validation = (
    drop_features(frame) for frame in (df_train, df_test, df_validation)
)

In [None]:
# Replace every remaining missing value with 0 in each split.
df_train, df_validation, df_test = (
    frame.fillna(0) for frame in (df_train, df_validation, df_test)
)

In [None]:
def accumulated_value(df):
  """Replace monthly support/income columns with annual or accumulated totals.

  Adds three derived columns and removes the columns they were computed from:

  * ``ACCUMULATED_CHILD_VALUE``  = CHILD_MONTHLY_SUPPORTING_VALUE * 12 * YEARS_SUPPORTING_CHILD
  * ``ACCUMULATED_OTHER_VALUE``  = OTHER_MONTHLY_SUPPORTING_VALUE * 12 * YEARS_SUPPORTING_SOMEONE_ELSE
  * ``AANUAL_NET_INCOME``        = CURRENT_NET_MONTHLY_INCOME * 12

  The input frame is left untouched; a new frame is returned.

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the five source columns named above.

  Returns
  -------
  pandas.DataFrame
      Copy of ``df`` with the derived columns added and the source columns
      removed.

  Raises
  ------
  KeyError
      If any source column is missing.
  """
  annual_child = df['CHILD_MONTHLY_SUPPORTING_VALUE'] * 12
  annual_other = df['OTHER_MONTHLY_SUPPORTING_VALUE'] * 12

  # assign() returns a new frame, so the caller's DataFrame is not mutated.
  # NOTE(review): 'AANUAL_NET_INCOME' looks like a typo for 'ANNUAL_NET_INCOME';
  # the name is kept because it is part of the output schema.
  derived = df.assign(
      ACCUMULATED_CHILD_VALUE=annual_child * df['YEARS_SUPPORTING_CHILD'],
      ACCUMULATED_OTHER_VALUE=annual_other * df['YEARS_SUPPORTING_SOMEONE_ELSE'],
      AANUAL_NET_INCOME=df['CURRENT_NET_MONTHLY_INCOME'] * 12,
  )

  # The five source columns were all read above, so a missing one has already
  # raised KeyError. errors='ignore' only matters for the two ANNUAL_* scratch
  # names, which are removed if the input happens to carry them.
  return derived.drop(
      columns=[
          'CURRENT_NET_MONTHLY_INCOME',
          'ANNUAL_CHILD_VALUE',
          'CHILD_MONTHLY_SUPPORTING_VALUE',
          'YEARS_SUPPORTING_CHILD',
          'ANNUAL_OTHER_VALUE',
          'OTHER_MONTHLY_SUPPORTING_VALUE',
          'YEARS_SUPPORTING_SOMEONE_ELSE',
      ],
      errors='ignore',
  )

In [None]:
# Derive the accumulated/annual columns for each split.
df_train, df_test, df_validation = (
    accumulated_value(frame) for frame in (df_train, df_test, df_validation)
)

In [None]:
def get_real_values(df):
    """Convert percentage columns into amounts of ``RETIREMENT_FUND_VALUE``.

    Each listed column is overwritten with
    ``round(column / 100 * RETIREMENT_FUND_VALUE)``.

    The frame is modified **in place** and the same object is returned, so
    calling this twice on the same frame applies the conversion twice.

    NOTE(review): after this step every listed column has
    RETIREMENT_FUND_VALUE as a factor, and the notebook later uses
    RETIREMENT_FUND_VALUE as the prediction target. That looks like target
    leakage — confirm it is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``RETIREMENT_FUND_VALUE`` and every column listed below.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns rewritten.

    Raises
    ------
    KeyError
        If ``RETIREMENT_FUND_VALUE`` or a listed column is missing.
    """
    percentage_columns = (
        'ONGOING_COACHING_FEE',
        'INITIAL_PLANNER_FEE_INCL_VAT_UT',
        'INITIAL_PLANNER_FEE_INCL_VAT_LA_AND_LAP',
        'ONGOING_PLANNER_FEE_INCL_VAT_UT',
        'ONGOING_PLANNER_FEE_INCL_VAT_LA_AND_LAP',
        'SA_EQUITY_UNIT_TRUST',
        'SA_BOND_UNIT_TRUST',
        'SA_CASH_UNIT_TRUST',
        'INTERNATIONAL_EQUITY_UNIT_TRUST',
        'INTERNATIONAL_BOND_UNIT_TRUST',
        'INTERNATIONAL_CASH_UNIT_TRUST',
        'SA_EQUITY_LAP',
        'SA_BOND_LAP',
        'SA_CASH_LAP',
        'INTERNATIONAL_EQUITY_LAP',
        'INTERNATIONAL_BOND_LAP',
        'INTERNATIONAL_CASH_LAP',
        'LAP_EAC_PA_INCL_VAT',
        'UNIT_TRUST_EAC_PA_INCL_VAT',
    )
    # The fund value itself is never rewritten, so it is safe to read it once.
    fund_value = df['RETIREMENT_FUND_VALUE']
    for column in percentage_columns:
        df[column] = (df[column] / 100 * fund_value).round()
    return df

In [None]:
# Convert the percentage columns of each split into fund-value amounts.
df_train, df_test, df_validation = (
    get_real_values(frame) for frame in (df_train, df_test, df_validation)
)

In [None]:
# Target: the retirement fund value; predictors: every other remaining column.
y_train = df_train['RETIREMENT_FUND_VALUE']
y_test = df_test['RETIREMENT_FUND_VALUE']
X_train = df_train.drop(columns='RETIREMENT_FUND_VALUE')
X_test = df_test.drop(columns='RETIREMENT_FUND_VALUE')

In [None]:
# Keep an unscaled copy of the training predictors; its column labels are
# reused below because the scaler returns a bare array.
X_train2 = X_train.copy()

# Fit the standardiser on the training predictors only, then wrap the scaled
# array back into a DataFrame with the original column names.
scaler = StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train2.columns)
X_train.shape

(23944, 25)

In [None]:
# Keep an unscaled copy of the test predictors for their column labels.
X_test2 = X_test.copy()

# Reuse the scaler already fitted on the training data (transform only, no
# refit), then restore the column names.
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test2.columns)
X_test.shape

(3420, 25)

In [None]:
!pip install catboost

In [None]:
# ---------------------------------------------------------------------------
# Estimator: an XGBoost regressor with the library's default settings.
# ---------------------------------------------------------------------------
import xgboost as xg
from operator import itemgetter
from sklearn.feature_selection import RFE

xgb = xg.XGBRegressor()

# ---------------------------------------------------------------------------
# Recursive Feature Elimination down to a single feature. Eliminating all
# the way to one survivor gives every feature its own rank.
# ---------------------------------------------------------------------------
n_features_to_select = 1
rfe = RFE(xgb, n_features_to_select=n_features_to_select)
rfe.fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Print "<rank> <feature>" pairs sorted by ascending rank.
# ---------------------------------------------------------------------------
features = X_train.columns.to_list()
ranked_features = sorted(zip(rfe.ranking_, features), key=itemgetter(0))
for rank, feature_name in ranked_features:
    print(rank, feature_name)

In [None]:
# ---------------------------------------------------------------------------
# Refit RFE, this time keeping the 13 best-ranked features for the model.
# ---------------------------------------------------------------------------
n_features_to_select = 13
rfe = RFE(xgb, n_features_to_select=n_features_to_select).fit(X_train, y_train)

# ---------------------------------------------------------------------------
# Predict the target for the test predictors with the reduced-feature model.
# ---------------------------------------------------------------------------
predictions = rfe.predict(X_test)

In [None]:
def rmse(y_test, y_predict):
    """Return the root-mean-squared error between observed and predicted values.

    Parameters
    ----------
    y_test : array-like
        Observed target values.
    y_predict : array-like
        Predicted target values, aligned with ``y_test``.

    Returns
    -------
    float
        Square root of ``mean_squared_error(y_test, y_predict)``.
    """
    return mean_squared_error(y_test, y_predict) ** 0.5


# `predictions` comes from the RFE-wrapped XGBoost regressor, not a random
# forest, so the metrics are labelled accordingly.
print(f'R2 For XGBoost Regressor (test):  {r2_score(y_test, predictions)}')
print(f'XGBoost Regressor RMSE (test):  {rmse(y_test, predictions)}')

In [None]:
# Score the same RFE-wrapped XGBoost model on the data it was fitted on, to
# compare against the test metrics. Labels name the actual model and split.
predictions1 = rfe.predict(X_train)
print(f'XGBoost Regressor RMSE (train):  {rmse(y_train, predictions1)}')
print(f'R2 For XGBoost Regressor (train):  {r2_score(y_train, predictions1)}')

## Hyperparameter Tuning

**XGBoost Regressor Hyperparameter Tuning**

In [None]:
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. `xgb1` was not defined in any earlier cell,
# so the cell failed with NameError on a fresh kernel; create it here.
xgb1 = xg.XGBRegressor()

# Grid of hyperparameters to search.
# NOTE(review): per the XGBoost docs, max_depth, min_child_weight, subsample
# and colsample_bytree are tree-booster parameters, so the 'gblinear' half of
# this grid presumably repeats equivalent fits — consider a separate, smaller
# grid for that booster.
param_grid = {
    'nthread': [4],  # when use hyperthread, xgboost may become slower
    # 'reg:linear' is the deprecated name of the squared-error objective.
    'objective': ['reg:squarederror'],
    'learning_rate': [.03, 0.05, .07],  # so called `eta` value
    'max_depth': [5, 6, 7, 10],
    'min_child_weight': [2, 3, 4],
    'booster': ['gblinear', 'gbtree'],
    # `silent` was replaced by `verbosity`; 0 keeps XGBoost quiet.
    'verbosity': [0],
    'subsample': [0.7],
    'colsample_bytree': [0.7],
    'n_estimators': [100, 500, 900, 1100, 1500],
}

# 10-fold cross-validated exhaustive search, running 5 fits in parallel.
xgb_grid = GridSearchCV(
    xgb1,
    param_grid,
    cv=10,
    n_jobs=5,
    verbose=True,
)

In [None]:
# Run the exhaustive search on the scaled training data. With the grid defined
# above (3 * 4 * 3 * 2 * 5 = 360 combinations) and cv=10 this is 3600 model
# fits plus the final refit, so expect this cell to run for a long time.
xgb_grid.fit(X_train,y_train)

In [None]:
# Report the best mean cross-validated score and the parameter combination
# that achieved it, one per line.
print(xgb_grid.best_score_, xgb_grid.best_params_, sep='\n')