In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from scipy import stats
from scipy.stats import mstats
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

In [None]:
# Read the workbook with its first column as the index, then immediately move that
# index back into an ordinary column so the frame carries a fresh 0..n-1 index.
data = pd.read_excel('Final dataset for real3.xlsx', index_col=0).reset_index()

In [None]:
# Cast the nominal columns to pandas' categorical dtype before any splitting.
for nominal_column in ['Country ISO code', 'Size', 'Accounting practice', 'NACE Rev. 2']:
    data[nominal_column] = data[nominal_column].astype('category')

# Column groups; each group is pre-processed differently below.
CON_COLUMNS = ["Nr. of Tax Treaties", "Operating revenue", "Number of employees",
               "PBT", "Total assets", "ROA", "No of subsidiaries", "Solvency ratio",
               "Long term debt", "Debtors", "Current liabilities"]      # scaled, then imputed
NA_COLUMNS = ["Degree", "Closeness", "Betweenness", "Eigenvector"]      # scaled only, never imputed
CAT_COLUMNS = ['NACE Rev. 2', 'Size', 'Accounting practice']            # one-hot encoded

X = data[["Nr. of Tax Treaties", "NACE Rev. 2", "Operating revenue", "Number of employees",
          "PBT", "Total assets", "ROA", "No of subsidiaries", "Solvency ratio", "Long term debt",
          "Debtors", "Current liabilities", "Size", "Accounting practice",
          "Degree", "Closeness", "Betweenness", "Eigenvector"]]
y = data["ETR"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2031996, shuffle=True
)

# ---- training rows -------------------------------------------------------------------
X_train_con = X_train[CON_COLUMNS]
X_train_na = X_train[NA_COLUMNS]
X_train_cat = X_train[CAT_COLUMNS]

# Both scalers are fitted on training rows only and reused unchanged for the test rows.
scaler_con = StandardScaler().fit(X_train_con)
scaler_na = StandardScaler().fit(X_train_na)

X_train_con = scaler_con.transform(X_train_con)
X_train_na = pd.DataFrame(scaler_na.transform(X_train_na),
                          index=X_train.index, columns=NA_COLUMNS)

iim = IterativeImputer(initial_strategy='median',
                       max_iter=10,
                       missing_values=np.nan,
                       random_state=2031996)

imputed_X_train_con = pd.DataFrame(iim.fit_transform(X_train_con),
                                   index=X_train.index, columns=CON_COLUMNS)
imputed_X_train_cat = pd.get_dummies(X_train_cat)

# Only CON_COLUMNS were imputed, so dropna() removes the rows that still have a gap in
# NA_COLUMNS, in a dummy column or in the target.
drop_na = pd.concat(
    [imputed_X_train_con, imputed_X_train_cat, X_train_na, y_train], axis=1
).dropna()

# "Eigenvector" is the last feature column; the target column comes right after it.
X_train_final = drop_na.loc[:, :"Eigenvector"]
y_train_final = drop_na['ETR']

# ---- test rows: transform only, with the objects fitted above ------------------------
X_test_con = X_test[CON_COLUMNS]
X_test_na = X_test[NA_COLUMNS]
X_test_cat = X_test[CAT_COLUMNS]

X_test_con = scaler_con.transform(X_test_con)
X_test_na = pd.DataFrame(scaler_na.transform(X_test_na),
                         index=X_test.index, columns=NA_COLUMNS)

imputed_X_test_con = pd.DataFrame(iim.transform(X_test_con),
                                  index=X_test.index, columns=CON_COLUMNS)
imputed_X_test_cat = pd.get_dummies(X_test_cat)

drop_na = pd.concat(
    [imputed_X_test_con, imputed_X_test_cat, X_test_na, y_test], axis=1
).dropna()

X_test_final = drop_na.loc[:, :"Eigenvector"]
y_test_final = drop_na['ETR']

print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)
print('Imputed', X_train_final.shape, y_train_final.shape)
print('Imputed_test', X_test_final.shape, y_test_final.shape)

In [None]:
def winsorize(dataframe, upper, lower, column_to_winsor):
    """Return the rows of ``dataframe`` that lie inside a percentile band of one column.

    Despite its name this function *trims*: rows outside the band are removed, no value
    is clipped. The frame is sorted by ``column_to_winsor`` in DESCENDING order, so

    * ``lower`` is the percentage of rows removed from the high-value end, and
    * ``100 - upper`` is the percentage of rows removed from the low-value end.

    Percentages are taken over the non-missing values of the column. Rows whose value in
    ``column_to_winsor`` is missing sort last and are never returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    upper, lower : int or float
        Percentages between 0 and 100 (see above).
    column_to_winsor : str
        Column whose values decide which rows are kept.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, sorted descending by ``column_to_winsor``. The original index
        is discarded; labels are positions in the sorted frame.

    Raises
    ------
    ValueError
        If ``upper`` or ``lower`` is outside 0..100.
    """
    if not (0 <= lower <= 100 and 0 <= upper <= 100):
        raise ValueError("upper and lower must be percentages between 0 and 100")

    ranked = dataframe.sort_values(column_to_winsor, ascending=False, ignore_index=True)
    amount = ranked[column_to_winsor].count()   # non-missing values only
    up = int(amount * (upper / 100))
    low = int(amount * (lower / 100))
    # Half-open positional slice. The previous label slice `.loc[low:up]` was inclusive
    # at `up`, so it kept one row too many at the low-value end and, with upper=100, let
    # exactly one missing-value row through.
    return ranked.iloc[low:up]

# Row trimming: apply winsorize (defined earlier in this notebook) to one column after
# another with the same 99 / 1 settings; each call receives the rows kept by the previous one.
# NOTE(review): the trimming uses the target (ETR) and happens before the train/test split,
# so the hold-out rows are filtered on their own target values - confirm this is intended.
data_winsor = data
for trim_column in ['ETR', "Operating revenue", "Total assets", "Current liabilities",
                    "Betweenness", "Long term debt", "PBT"]:
    data_winsor = winsorize(data_winsor, 99, 1, trim_column)

# Column groups; each group is pre-processed differently below.
CON_COLUMNS = ["Nr. of Tax Treaties", "Operating revenue", "Number of employees",
               "PBT", "Total assets", "ROA", "No of subsidiaries", "Solvency ratio",
               "Long term debt", "Debtors", "Current liabilities"]      # scaled, then imputed
NA_COLUMNS = ["Degree", "Closeness", "Betweenness", "Eigenvector"]      # scaled only, never imputed
CAT_COLUMNS = ['NACE Rev. 2', 'Size', 'Accounting practice']            # one-hot encoded

X_winsor = data_winsor[["Nr. of Tax Treaties", "NACE Rev. 2", "Operating revenue", "Number of employees",
                        "PBT", "Total assets", "ROA", "No of subsidiaries", "Solvency ratio",
                        "Long term debt", "Debtors", "Current liabilities", "Size",
                        "Accounting practice", "Degree", "Closeness", "Betweenness", "Eigenvector"]]
y_winsor = data_winsor["ETR"]

X_train_winsor, X_test_winsor, y_train_winsor, y_test_winsor = train_test_split(
    X_winsor, y_winsor, test_size=0.20, random_state=2031996, shuffle=True
)

# ---- training rows -------------------------------------------------------------------
X_train_con_winsor = X_train_winsor[CON_COLUMNS]
X_train_na_winsor = X_train_winsor[NA_COLUMNS]
X_train_cat_winsor = X_train_winsor[CAT_COLUMNS]

# Scalers are fitted on training rows only and reused unchanged for the test rows.
scaler_con_winsor = StandardScaler().fit(X_train_con_winsor)
scaler_na_winsor = StandardScaler().fit(X_train_na_winsor)

X_train_con_winsor = scaler_con_winsor.transform(X_train_con_winsor)
X_train_na_winsor = pd.DataFrame(scaler_na_winsor.transform(X_train_na_winsor),
                                 index=X_train_winsor.index, columns=NA_COLUMNS)

# Rebinds the notebook-level name `iim`; from here on it is the imputer fitted in this cell.
iim = IterativeImputer(initial_strategy='median',
                       max_iter=10,
                       missing_values=np.nan,
                       random_state=2031996)

imputed_X_train_con_winsor = pd.DataFrame(iim.fit_transform(X_train_con_winsor),
                                          index=X_train_winsor.index, columns=CON_COLUMNS)
imputed_X_train_cat_winsor = pd.get_dummies(X_train_cat_winsor)

# Only CON_COLUMNS were imputed, so dropna() removes rows that still contain a gap.
drop_na_winsor = pd.concat(
    [imputed_X_train_con_winsor, imputed_X_train_cat_winsor, X_train_na_winsor, y_train_winsor],
    axis=1,
).dropna()

# "Eigenvector" is the last feature column; the target column comes right after it.
X_train_final_winsor = drop_na_winsor.loc[:, :'Eigenvector']
y_train_final_winsor = drop_na_winsor['ETR']

# ---- test rows: transform only, with the objects fitted above ------------------------
X_test_con_winsor = X_test_winsor[CON_COLUMNS]
X_test_na_winsor = X_test_winsor[NA_COLUMNS]
X_test_cat_winsor = X_test_winsor[CAT_COLUMNS]

X_test_con_winsor = scaler_con_winsor.transform(X_test_con_winsor)
X_test_na_winsor = pd.DataFrame(scaler_na_winsor.transform(X_test_na_winsor),
                                index=X_test_winsor.index, columns=NA_COLUMNS)

imputed_X_test_con_winsor = pd.DataFrame(iim.transform(X_test_con_winsor),
                                         index=X_test_winsor.index, columns=CON_COLUMNS)
imputed_X_test_cat_winsor = pd.get_dummies(X_test_cat_winsor)

# Fix: incomplete rows are now dropped from the hold-out frame as well. The training frame
# above and the untrimmed pipeline both do this; without it, missing values in NA_COLUMNS
# or in the target would reach predict() and the error metrics.
drop_na_winsor = pd.concat(
    [imputed_X_test_con_winsor, imputed_X_test_cat_winsor, X_test_na_winsor, y_test_winsor],
    axis=1,
).dropna()

X_test_final_winsor = drop_na_winsor.loc[:, :'Eigenvector']
y_test_final_winsor = drop_na_winsor['ETR']

print('Train', X_train_winsor.shape, y_train_winsor.shape)
print('Test', X_test_winsor.shape, y_test_winsor.shape)
print('Imputed', X_train_final_winsor.shape, y_train_final_winsor.shape)
# Label changed from a second 'Imputed' so the train and test lines can be told apart.
print('Imputed_test', X_test_final_winsor.shape, y_test_final_winsor.shape)

In [None]:
# NOTE(review): `winsorize` is also defined in an earlier cell; this later definition is
# the one in effect from here on.
def winsorize(dataframe, upper, lower, column_to_winsor):
    """Return the rows of ``dataframe`` that lie inside a percentile band of one column.

    Despite its name this function *trims*: rows outside the band are removed, no value
    is clipped. The frame is sorted by ``column_to_winsor`` in DESCENDING order, so

    * ``lower`` is the percentage of rows removed from the high-value end, and
    * ``100 - upper`` is the percentage of rows removed from the low-value end.

    Percentages are taken over the non-missing values of the column. Rows whose value in
    ``column_to_winsor`` is missing sort last and are never returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    upper, lower : int or float
        Percentages between 0 and 100 (see above).
    column_to_winsor : str
        Column whose values decide which rows are kept.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, sorted descending by ``column_to_winsor``. The original index
        is discarded; labels are positions in the sorted frame.

    Raises
    ------
    ValueError
        If ``upper`` or ``lower`` is outside 0..100.
    """
    if not (0 <= lower <= 100 and 0 <= upper <= 100):
        raise ValueError("upper and lower must be percentages between 0 and 100")

    ranked = dataframe.sort_values(column_to_winsor, ascending=False, ignore_index=True)
    amount = ranked[column_to_winsor].count()   # non-missing values only
    up = int(amount * (upper / 100))
    low = int(amount * (lower / 100))
    # Half-open positional slice. The previous label slice `.loc[low:up]` was inclusive
    # at `up`, so it kept one row too many at the low-value end and, with upper=100, let
    # exactly one missing-value row through.
    return ranked.iloc[low:up]

# Row trimming with per-column settings: (column, upper, lower) as passed to winsorize,
# which is defined earlier in this notebook. Each call receives the rows kept by the
# previous one.
# NOTE(review): the trimming uses the target (ETR) and happens before the train/test split,
# so the hold-out rows are filtered on their own target values - confirm this is intended.
TRIM_PLAN = [
    ('ETR', 97.5, 4),
    ("Operating revenue", 100, 4),
    ("Total assets", 100, 4),
    ("Current liabilities", 100, 4),
    ("Betweenness", 100, 4),
    ("Long term debt", 100, 4),
    ("PBT", 97.5, 2.5),
]
data_winsor2 = data
for trim_column, upper_pct, lower_pct in TRIM_PLAN:
    data_winsor2 = winsorize(data_winsor2, upper_pct, lower_pct, trim_column)

# Column groups; each group is pre-processed differently below.
CON_COLUMNS = ["Nr. of Tax Treaties", "Operating revenue", "Number of employees",
               "PBT", "Total assets", "ROA", "No of subsidiaries", "Solvency ratio",
               "Long term debt", "Debtors", "Current liabilities"]      # scaled, then imputed
NA_COLUMNS = ["Degree", "Closeness", "Betweenness", "Eigenvector"]      # scaled only, never imputed
CAT_COLUMNS = ['NACE Rev. 2', 'Size', 'Accounting practice']            # one-hot encoded

X_winsor2 = data_winsor2[["Nr. of Tax Treaties", "NACE Rev. 2", "Operating revenue", "Number of employees",
                          "PBT", "Total assets", "ROA", "No of subsidiaries", "Solvency ratio",
                          "Long term debt", "Debtors", "Current liabilities", "Size",
                          "Accounting practice", "Degree", "Closeness", "Betweenness", "Eigenvector"]]
y_winsor2 = data_winsor2["ETR"]

X_train_winsor2, X_test_winsor2, y_train_winsor2, y_test_winsor2 = train_test_split(
    X_winsor2, y_winsor2, test_size=0.20, random_state=2031996, shuffle=True
)

# ---- training rows -------------------------------------------------------------------
X_train_con_winsor2 = X_train_winsor2[CON_COLUMNS]
X_train_na_winsor2 = X_train_winsor2[NA_COLUMNS]
X_train_cat_winsor2 = X_train_winsor2[CAT_COLUMNS]

# Scalers are fitted on training rows only and reused unchanged for the test rows.
scaler_con_winsor2 = StandardScaler().fit(X_train_con_winsor2)
scaler_na_winsor2 = StandardScaler().fit(X_train_na_winsor2)

X_train_con_winsor2 = scaler_con_winsor2.transform(X_train_con_winsor2)
X_train_na_winsor2 = pd.DataFrame(scaler_na_winsor2.transform(X_train_na_winsor2),
                                  index=X_train_winsor2.index, columns=NA_COLUMNS)

# Rebinds the notebook-level name `iim`; from here on it is the imputer fitted in this cell.
iim = IterativeImputer(initial_strategy='median',
                       max_iter=10,
                       missing_values=np.nan,
                       random_state=2031996)

imputed_X_train_con_winsor2 = pd.DataFrame(iim.fit_transform(X_train_con_winsor2),
                                           index=X_train_winsor2.index, columns=CON_COLUMNS)
imputed_X_train_cat_winsor2 = pd.get_dummies(X_train_cat_winsor2)

# Only CON_COLUMNS were imputed, so dropna() removes rows that still contain a gap.
drop_na_winsor2 = pd.concat(
    [imputed_X_train_con_winsor2, imputed_X_train_cat_winsor2, X_train_na_winsor2, y_train_winsor2],
    axis=1,
).dropna()

# "Eigenvector" is the last feature column; the target column comes right after it.
X_train_final_winsor2 = drop_na_winsor2.loc[:, :'Eigenvector']
y_train_final_winsor2 = drop_na_winsor2['ETR']

# ---- test rows: transform only, with the objects fitted above ------------------------
X_test_con_winsor2 = X_test_winsor2[CON_COLUMNS]
X_test_na_winsor2 = X_test_winsor2[NA_COLUMNS]
X_test_cat_winsor2 = X_test_winsor2[CAT_COLUMNS]

X_test_con_winsor2 = scaler_con_winsor2.transform(X_test_con_winsor2)
X_test_na_winsor2 = pd.DataFrame(scaler_na_winsor2.transform(X_test_na_winsor2),
                                 index=X_test_winsor2.index, columns=NA_COLUMNS)

imputed_X_test_con_winsor2 = pd.DataFrame(iim.transform(X_test_con_winsor2),
                                          index=X_test_winsor2.index, columns=CON_COLUMNS)
imputed_X_test_cat_winsor2 = pd.get_dummies(X_test_cat_winsor2)

# Fix: incomplete rows are now dropped from the hold-out frame as well. The training frame
# above and the untrimmed pipeline both do this; without it, missing values in NA_COLUMNS
# or in the target would reach predict() and the error metrics.
drop_na_winsor2 = pd.concat(
    [imputed_X_test_con_winsor2, imputed_X_test_cat_winsor2, X_test_na_winsor2, y_test_winsor2],
    axis=1,
).dropna()

X_test_final_winsor2 = drop_na_winsor2.loc[:, :'Eigenvector']
y_test_final_winsor2 = drop_na_winsor2['ETR']

print('Train', X_train_winsor2.shape, y_train_winsor2.shape)
print('Test', X_test_winsor2.shape, y_test_winsor2.shape)
print('Imputed', X_train_final_winsor2.shape, y_train_final_winsor2.shape)
# Label changed from a second 'Imputed' so the train and test lines can be told apart.
print('Imputed_test', X_test_final_winsor2.shape, y_test_final_winsor2.shape)

In [None]:
# Candidate values for the randomised hyper-parameter search over RandomForestRegressor.
n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=5)]
# Fix: 'auto' was removed from RandomForestRegressor in scikit-learn 1.3. For regressors it
# meant "consider every feature", which 1.0 (a fraction of the features) expresses on old
# and new versions alike. The entry keeps its position in the list.
max_features = [1.0, 'sqrt', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num=5)]
max_depth.append(None)   # None = no depth limit
min_samples_split = [2, 20, 50]
min_samples_leaf = [1, 20, 80]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Randomised search (200 sampled candidates, 5-fold CV) on the untrimmed training data.
# Fix: the forest is now seeded. Before, only the candidate sampling was seeded, so the
# CV scores - and possibly the selected model - changed from run to run.
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=200,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random.fit(X_train_final, y_train_final)
# Despite the name, this is the refitted estimator object, not a parameter dict;
# rf_random.best_params_ holds the dict.
best_parameters = rf_random.best_estimator_
print(best_parameters)

In [None]:
# Randomised search (200 sampled candidates, 5-fold CV) on the first trimmed training set.
# Fix: the forest is now seeded. Before, only the candidate sampling was seeded, so the
# CV scores - and possibly the selected model - changed from run to run.
rf_winsor = RandomForestRegressor(random_state=42)
rf_random_winsor = RandomizedSearchCV(
    estimator=rf_winsor,
    param_distributions=random_grid,
    n_iter=200,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random_winsor.fit(X_train_final_winsor, y_train_final_winsor)
# Despite the name, this is the refitted estimator object, not a parameter dict.
best_parameters_winsor = rf_random_winsor.best_estimator_
print(best_parameters_winsor)

In [None]:
# Randomised search (200 sampled candidates, 5-fold CV) on the second trimmed training set.
# Fix: the forest is now seeded. Before, only the candidate sampling was seeded, so the
# CV scores - and possibly the selected model - changed from run to run.
rf_winsor2 = RandomForestRegressor(random_state=42)
rf_random_winsor2 = RandomizedSearchCV(
    estimator=rf_winsor2,
    param_distributions=random_grid,
    n_iter=200,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random_winsor2.fit(X_train_final_winsor2, y_train_final_winsor2)
# Despite the name, this is the refitted estimator object, not a parameter dict.
best_parameters_winsor2 = rf_random_winsor2.best_estimator_
print(best_parameters_winsor2)

In [None]:
# Narrower grid for an exhaustive search on the second trimmed training set.
# NOTE(review): the values presumably bracket the result of the randomised search - confirm.
# This rebinds `random_grid` and the individual value lists used by the earlier search cells.
n_estimators = [1350, 1550, 1750]
# Fix: 'auto' was removed from RandomForestRegressor in scikit-learn 1.3. For regressors it
# meant "consider every feature", which 1.0 expresses on old and new versions alike.
max_features = [1.0, 'sqrt', 'log2']
max_depth = [78, 85, 92]
min_samples_split = [35, 50, 65]
min_samples_leaf = [50, 80, 110]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Fix: GridSearchCV has no `random_state` parameter (the grid is enumerated exhaustively),
# so passing it raised TypeError before any fit started. The seed now sits on the forest,
# which is the only stochastic component left.
rf_winsor2 = RandomForestRegressor(random_state=42)
rf_random_winsor2 = GridSearchCV(
    estimator=rf_winsor2,
    param_grid=random_grid,   # 3*3*3*3*3*2 = 486 candidates, each fitted on 5 folds
    cv=5,
    verbose=2,
    n_jobs=-1,
)
rf_random_winsor2.fit(X_train_final_winsor2, y_train_final_winsor2)
# Despite the name, this is the refitted estimator object, not a parameter dict.
best_parameters_winsor2 = rf_random_winsor2.best_estimator_
print(best_parameters_winsor2)

In [None]:
# Final model on the untrimmed data, evaluated on the hold-out rows.
# NOTE(review): the hyper-parameters are hard-coded, presumably copied from the search
# output - confirm they are still the selected ones.
# Fix: the forest is seeded so the metrics below (which are exported later) are reproducible.
rf_for_pred = RandomForestRegressor(max_depth=110, max_features='log2', min_samples_leaf=80,
                                    min_samples_split=50, n_estimators=200, random_state=42)

rf_for_pred.fit(X_train_final, y_train_final)

y_pred = rf_for_pred.predict(X_test_final)

mae = mean_absolute_error(y_test_final, y_pred)
mse = mean_squared_error(y_test_final, y_pred)
rmse = np.sqrt(mse)
# One-column table; the column label identifies this variant in the combined results.
d_nonan = {'With outliers': [mse, rmse, mae]}
df_nonan = pd.DataFrame(d_nonan, index=['MSE', 'RMSE', 'MAE'])
print(df_nonan)

In [None]:
# First candidate model for the first trimmed data set.
# NOTE: the following cell rebinds rf_for_pred_winsor, y_pred_winsor and df_nonan_winsor,
# so in a top-to-bottom run these values are overwritten before the results are combined.
# Fix: the forest is seeded so the printed metrics are reproducible.
rf_for_pred_winsor = RandomForestRegressor(bootstrap=False, max_depth=60, max_features='sqrt',
                                           n_estimators=650, random_state=42)

rf_for_pred_winsor.fit(X_train_final_winsor, y_train_final_winsor)

y_pred_winsor = rf_for_pred_winsor.predict(X_test_final_winsor)

mae_winsor = mean_absolute_error(y_test_final_winsor, y_pred_winsor)
mse_winsor = mean_squared_error(y_test_final_winsor, y_pred_winsor)
rmse_winsor = np.sqrt(mse_winsor)
d_nonan_winsor = {'1% outliers removed': [mse_winsor, rmse_winsor, mae_winsor]}
df_nonan_winsor = pd.DataFrame(d_nonan_winsor, index=['MSE', 'RMSE', 'MAE'])
print(df_nonan_winsor)

In [None]:
# Second candidate model for the first trimmed data set (defaults except for the tree count).
# It rebinds the names assigned in the previous cell, so in a top-to-bottom run these are
# the values that reach the combined results table and the feature-importance listing.
# Fix: the forest is seeded so the exported metrics are reproducible.
rf_for_pred_winsor = RandomForestRegressor(n_estimators=1100, random_state=42)

rf_for_pred_winsor.fit(X_train_final_winsor, y_train_final_winsor)

y_pred_winsor = rf_for_pred_winsor.predict(X_test_final_winsor)

mae_winsor = mean_absolute_error(y_test_final_winsor, y_pred_winsor)
mse_winsor = mean_squared_error(y_test_final_winsor, y_pred_winsor)
rmse_winsor = np.sqrt(mse_winsor)
d_nonan_winsor = {'1% outliers removed': [mse_winsor, rmse_winsor, mae_winsor]}
df_nonan_winsor = pd.DataFrame(d_nonan_winsor, index=['MSE', 'RMSE', 'MAE'])
print(df_nonan_winsor)

In [None]:
# First candidate model for the second trimmed data set.
# NOTE: the following cell rebinds rf_for_pred_winsor2, y_pred_winsor2 and df_nonan_winsor2,
# so in a top-to-bottom run these values are overwritten before the results are combined.
# Fix: the forest is seeded so the printed metrics are reproducible.
rf_for_pred_winsor2 = RandomForestRegressor(bootstrap=False, max_depth=60, max_features='sqrt',
                                            n_estimators=650, random_state=42)

rf_for_pred_winsor2.fit(X_train_final_winsor2, y_train_final_winsor2)

y_pred_winsor2 = rf_for_pred_winsor2.predict(X_test_final_winsor2)

mae_winsor2 = mean_absolute_error(y_test_final_winsor2, y_pred_winsor2)
mse_winsor2 = mean_squared_error(y_test_final_winsor2, y_pred_winsor2)
rmse_winsor2 = np.sqrt(mse_winsor2)
d_nonan_winsor2 = {'5% outliers removed': [mse_winsor2, rmse_winsor2, mae_winsor2]}
df_nonan_winsor2 = pd.DataFrame(d_nonan_winsor2, index=['MSE', 'RMSE', 'MAE'])
print(df_nonan_winsor2)

In [None]:
# Second candidate model for the second trimmed data set.
# It rebinds the names assigned in the previous cell, so in a top-to-bottom run these are
# the values used by the combined results table, the feature-importance listing and the
# error-correlation heatmap.
# Fix: the forest is seeded so the exported metrics are reproducible.
rf_for_pred_winsor2 = RandomForestRegressor(max_depth=85, n_estimators=1550, random_state=42)

rf_for_pred_winsor2.fit(X_train_final_winsor2, y_train_final_winsor2)

y_pred_winsor2 = rf_for_pred_winsor2.predict(X_test_final_winsor2)

mae_winsor2 = mean_absolute_error(y_test_final_winsor2, y_pred_winsor2)
mse_winsor2 = mean_squared_error(y_test_final_winsor2, y_pred_winsor2)
rmse_winsor2 = np.sqrt(mse_winsor2)
d_nonan_winsor2 = {'5% outliers removed': [mse_winsor2, rmse_winsor2, mae_winsor2]}
df_nonan_winsor2 = pd.DataFrame(d_nonan_winsor2, index=['MSE', 'RMSE', 'MAE'])
print(df_nonan_winsor2)

In [None]:
# Put the three one-column metric tables side by side (rows: MSE, RMSE, MAE).
results_LR_RF = pd.concat(
    objs=[df_nonan, df_nonan_winsor, df_nonan_winsor2],
    axis='columns',
)
print(results_LR_RF)

In [None]:
# Persist the combined metric table next to the notebook.
results_LR_RF.to_excel(excel_writer="results_RF_NA.xlsx")

In [None]:
# Feature importances of the untrimmed model, largest first.
feature_list = list(X_train_final.columns)
importances = list(rf_for_pred.feature_importances_)
# Pair every column with its importance rounded to 2 decimals, then rank on that rounded
# value; the sort is stable, so ties keep column order.
feature_importances = sorted(
    ((name, round(score, 2)) for name, score in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
for pair in feature_importances:
    print('Feature: {:20} Importance: {}'.format(*pair))

In [None]:
# Feature importances of the model fitted on the first trimmed data set, largest first.
feature_list = list(X_train_final_winsor.columns)
importances = list(rf_for_pred_winsor.feature_importances_)
# Pair every column with its importance rounded to 2 decimals, then rank on that rounded
# value; the sort is stable, so ties keep column order.
feature_importances = sorted(
    ((name, round(score, 2)) for name, score in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
for pair in feature_importances:
    print('Feature: {:20} Importance: {}'.format(*pair))

In [None]:
# Feature importances of the model fitted on the second trimmed data set, largest first.
feature_list = list(X_train_final_winsor2.columns)
importances = list(rf_for_pred_winsor2.feature_importances_)
# Pair every column with its importance rounded to 3 decimals (the two listings before
# this one use 2), then rank on that rounded value; ties keep column order.
feature_importances = sorted(
    ((name, round(score, 3)) for name, score in zip(feature_list, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
for pair in feature_importances:
    print('Feature: {:20} Importance: {}'.format(*pair))

In [None]:
# Turn the most recent (feature, importance) listing into a two-column frame; in a
# top-to-bottom run that is the listing for the second trimmed data set.
# The name is rebound from a list of tuples to a DataFrame.
feature_importances = pd.DataFrame(data=feature_importances)
feature_importances

In [None]:
# Persist the feature-importance table next to the notebook.
feature_importances.to_excel(excel_writer="feature_importances_RF.xlsx")

In [None]:
from matplotlib.pyplot import figure

# Signed prediction error (prediction minus observed ETR) per hold-out row. The result is a
# Series indexed like y_test_final_winsor2; it keeps the column name 'ETR' until it is
# relabelled for the plot below.
y_error = (y_pred_winsor2 - y_test_final_winsor2).rename('ETR').to_frame()
# Join the error to the raw (unscaled, un-encoded) hold-out predictors on the row index.
y_error = pd.concat([X_test_winsor2, y_error], axis=1)

# Fix: correlate numeric columns only. The frame contains categorical columns, and
# DataFrame.corr() raises on non-numeric data from pandas 2.0 on (older versions dropped
# such columns silently, which is what select_dtypes reproduces here).
cormat = y_error.select_dtypes(include='number').corr()

# Keep only the correlations with the error, relabel the column, and remove the trivial
# self-correlation row.
ETR_heatmap = (
    cormat[['ETR']]
    .rename(columns={'ETR': 'ETR error'})
    .drop('ETR')
)

figure(figsize=(4, 10), dpi=200)
sns.heatmap(ETR_heatmap, annot=True);

In [None]:
# Show the error correlations from most negative to most positive.
ETR_heatmap.sort_values(by='ETR error')

In [None]:
from matplotlib.pyplot import figure

# Correlation overview of the raw training rows, including the target.
# NOTE: this cell rebinds X, y, X_train, X_test, y_train, y_test, y_error and cormat, which
# earlier cells also assign; re-running earlier cells after this one mixes the two versions.
# NOTE(review): "Hubs" is not part of the feature set used for modelling above - confirm the
# column exists in the workbook.
X = data[["Nr. of Tax Treaties", "NACE Rev. 2", "Operating revenue", "Number of employees",
          "PBT", "Total assets", "ROA", "No of subsidiaries", "Solvency ratio", "Long term debt",
          "Debtors", "Current liabilities", "Size", "Accounting practice",
          "Degree", "Closeness", "Betweenness", "Eigenvector", "Hubs"]]
y = data["ETR"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2031996, shuffle=True
)

# Despite the name, this frame holds the training predictors plus the observed target,
# not prediction errors.
y_error = pd.concat([X_train, y_train], axis=1)

# Fix: correlate numeric columns only. The frame contains categorical columns, and
# DataFrame.corr() raises on non-numeric data from pandas 2.0 on (older versions dropped
# such columns silently, which is what select_dtypes reproduces here).
cormat = y_error.select_dtypes(include='number').corr()

figure(figsize=(12, 9), dpi=200)
sns.heatmap(cormat)