## The purpose of this notebook: 
- Is to find the best way to impute the lagged variables
- From past experience, this has been helpful with finding categorical encoders. 

Note: when there is a null, this is significant because this means that either the company did not yet exist or the company did not submit financials. Therefore a zero fill would most representative of truth. However, since we are engineering features that are relevant to gaps in financial submission dates, if we get any increase in results by using imputation, we could use this

Results: confirmation of what we logically understood - zero filling data is more representative of reality and this shows up in the modeling

In [1]:
import pandas as pd
import numpy as np
import joblib
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score	
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')
base_dir = #insert
data_dir = base_dir + 'Data/'
credit_df = pd.read_csv(data_dir+'train.csv')

In [2]:
load_path = base_dir + '/Artifacts/col_groups_dct.joblib'
col_grps_dct = joblib.load(load_path)

In [3]:
credit_df.sample()

Unnamed: 0,company_id,payment_note_date,payment_note_amount,financials_date,financials_date-1,financials_date-2,financials_date-3,financials_date-4,revenue,revenue-1,...,profit_margin-1,profit_margin-2,profit_margin-3,profit_margin-4,cash_ratio,cash_ratio-1,cash_ratio-2,cash_ratio-3,cash_ratio-4,Rating
7840,7841,,0.0,Dec-17,Dec-16,Dec-15,Dec-14,Dec-13,17491,10382.0,...,7.2,13.4,21.0,-1.2,405.8,432.9,313.8,312.0,267.0,AAA


In [4]:
cols_to_drop = ['Rating']+col_grps_dct['financials_cols']+['company_id','payment_note_date','payment_note_amount']
X = credit_df.drop(cols_to_drop,axis=1)
y = credit_df['Rating']

y = preprocessing.LabelEncoder().fit_transform(y)


selected_model = XGBClassifier(tree_method = "gpu_hist",single_precision_histogram=True, gpu_id=0)

#impute_methods_list = ['pad','nearest','zero','slinear','quadratic','cubic','spline','barycentric','polynomial','krogh','piecewise_polynomial','pchip','akima','cubicspline']

impute_results_df = pd.DataFrame(columns=['fill_method','AUC', 'F1 macro','F1 weighted', 'Accuracy'])

In [5]:
del col_grps_dct['financials_cols']


## 1. Fill zero 

In [6]:
X_cpy = X.copy()
X_cpy=X_cpy.fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X_cpy, y, test_size=0.2)
model = selected_model.fit(X_train, y_train)
y_pred = model.predict(X_test)
pred_prob = model.predict_proba(X_test)

row = {
    'fill_method': 'zero',
    'AUC': roc_auc_score(y_test, pred_prob, multi_class='ovo', average='macro'),
    'F1 macro': f1_score(y_test, y_pred, average='macro'),
    'F1 weighted': f1_score(y_test, y_pred, average='weighted'),
    'Accuracy': accuracy_score(y_test, y_pred)
}

impute_results_df = impute_results_df.append(row, ignore_index=True)



In [7]:
impute_results_df

Unnamed: 0,fill_method,AUC,F1 macro,F1 weighted,Accuracy
0,zero,0.905147,0.697361,0.740924,0.742818


## Mean fill

In [8]:
X_cpy = X.copy()

for grp_name, grp_cols in col_grps_dct.items():
    rows_mean_dct = X_cpy[grp_cols].apply(lambda row: row.mean(),axis=1).to_dict()
    for col in grp_cols:
        X_cpy[col] = X_cpy[col].fillna(value=rows_mean_dct)
X_cpy

Unnamed: 0,revenue,revenue-1,revenue-2,revenue-3,revenue-4,net_sales,net_sales-1,net_sales-2,net_sales-3,net_sales-4,...,profit_margin,profit_margin-1,profit_margin-2,profit_margin-3,profit_margin-4,cash_ratio,cash_ratio-1,cash_ratio-2,cash_ratio-3,cash_ratio-4
0,5677,8673.0,8532.0,7825.0,5385.0,5677,8663.0,8532.0,7811.0,5385.0,...,14.7,13.6,10.3,16.8,-5.4,395.5,232.4,184.7,236.4,148.1
1,10617,8266.0,9713.0,9428.0,7394.0,10506,8254.0,9452.0,9349.0,7328.0,...,10.9,9.1,15.9,23.1,16.8,229.0,280.7,296.1,234.0,213.0
2,7201,7201.0,7201.0,7201.0,7201.0,7201,7201.0,7201.0,7201.0,7201.0,...,6.0,6.0,6.0,6.0,6.0,128.1,128.1,128.1,128.1,128.1
3,22629,20668.0,24591.0,23754.0,23656.0,22619,20667.0,24215.0,23397.0,23512.0,...,2.3,0.2,2.9,0.1,1.9,97.8,89.2,82.2,72.4,92.9
4,10221,8358.0,5865.0,4038.0,5128.0,10216,8358.0,5823.0,3998.0,5128.0,...,8.6,14.6,-0.4,-0.6,9.7,159.9,183.1,112.5,215.5,134.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10959,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,308.8,310.4,311.9,20.9,5196.2
10960,8710,8710.0,8710.0,8710.0,8710.0,8710,8710.0,8710.0,8710.0,8710.0,...,0.3,0.3,0.3,0.3,0.3,70.0,70.0,70.0,70.0,70.0
10961,6320,6529.0,5803.0,6250.0,4446.0,6320,6529.0,5803.0,5766.0,4412.0,...,20.8,21.6,19.4,17.7,6.1,142.2,110.7,98.3,132.5,60.8
10962,5811,5811.0,5811.0,5811.0,5811.0,5811,5811.0,5811.0,5811.0,5811.0,...,49.4,49.4,49.4,49.4,49.4,380.1,380.1,380.1,380.1,380.1


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_cpy, y, test_size=0.2)
model = selected_model.fit(X_train, y_train)
y_pred = model.predict(X_test)
pred_prob = model.predict_proba(X_test)

row = {
    'fill_method': 'mean',
    'AUC': roc_auc_score(y_test, pred_prob, multi_class='ovo', average='macro'),
    'F1 macro': f1_score(y_test, y_pred, average='macro'),
    'F1 weighted': f1_score(y_test, y_pred, average='weighted'),
    'Accuracy': accuracy_score(y_test, y_pred)
}

impute_results_df = impute_results_df.append(row, ignore_index=True)



In [10]:
impute_results_df

Unnamed: 0,fill_method,AUC,F1 macro,F1 weighted,Accuracy
0,zero,0.905147,0.697361,0.740924,0.742818
1,mean,0.883219,0.663191,0.712569,0.716826


## Pad

In [11]:
X_cpy = X.copy()

for grp_name, grp_cols in col_grps_dct.items():
    X_cpy[grp_cols]=X_cpy[grp_cols].fillna(method='pad',axis=1)
    
X_cpy

Unnamed: 0,revenue,revenue-1,revenue-2,revenue-3,revenue-4,net_sales,net_sales-1,net_sales-2,net_sales-3,net_sales-4,...,profit_margin,profit_margin-1,profit_margin-2,profit_margin-3,profit_margin-4,cash_ratio,cash_ratio-1,cash_ratio-2,cash_ratio-3,cash_ratio-4
0,5677.0,8673.0,8532.0,7825.0,5385.0,5677.0,8663.0,8532.0,7811.0,5385.0,...,14.7,13.6,10.3,16.8,-5.4,395.5,232.4,184.7,236.4,148.1
1,10617.0,8266.0,9713.0,9428.0,7394.0,10506.0,8254.0,9452.0,9349.0,7328.0,...,10.9,9.1,15.9,23.1,16.8,229.0,280.7,296.1,234.0,213.0
2,7201.0,7201.0,7201.0,7201.0,7201.0,7201.0,7201.0,7201.0,7201.0,7201.0,...,6.0,6.0,6.0,6.0,6.0,128.1,128.1,128.1,128.1,128.1
3,22629.0,20668.0,24591.0,23754.0,23656.0,22619.0,20667.0,24215.0,23397.0,23512.0,...,2.3,0.2,2.9,0.1,1.9,97.8,89.2,82.2,72.4,92.9
4,10221.0,8358.0,5865.0,4038.0,5128.0,10216.0,8358.0,5823.0,3998.0,5128.0,...,8.6,14.6,-0.4,-0.6,9.7,159.9,183.1,112.5,215.5,134.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,308.8,310.4,311.9,20.9,5196.2
10960,8710.0,8710.0,8710.0,8710.0,8710.0,8710.0,8710.0,8710.0,8710.0,8710.0,...,0.3,0.3,0.3,0.3,0.3,70.0,70.0,70.0,70.0,70.0
10961,6320.0,6529.0,5803.0,6250.0,4446.0,6320.0,6529.0,5803.0,5766.0,4412.0,...,20.8,21.6,19.4,17.7,6.1,142.2,110.7,98.3,132.5,60.8
10962,5811.0,5811.0,5811.0,5811.0,5811.0,5811.0,5811.0,5811.0,5811.0,5811.0,...,49.4,49.4,49.4,49.4,49.4,380.1,380.1,380.1,380.1,380.1


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_cpy, y, test_size=0.2)
model = selected_model.fit(X_train, y_train)
y_pred = model.predict(X_test)
pred_prob = model.predict_proba(X_test)

row = {
    'fill_method': 'pad',
    'AUC': roc_auc_score(y_test, pred_prob, multi_class='ovo', average='macro'),
    'F1 macro': f1_score(y_test, y_pred, average='macro'),
    'F1 weighted': f1_score(y_test, y_pred, average='weighted'),
    'Accuracy': accuracy_score(y_test, y_pred)
}

impute_results_df = impute_results_df.append(row, ignore_index=True)
impute_results_df



Unnamed: 0,fill_method,AUC,F1 macro,F1 weighted,Accuracy
0,zero,0.905147,0.697361,0.740924,0.742818
1,mean,0.883219,0.663191,0.712569,0.716826
2,pad,0.891379,0.668757,0.712588,0.716826
