In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from scipy import stats
from scipy.stats import mstats
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

  import pandas.util.testing as tm


In [2]:
data = pd.read_excel('Final dataset for real3.xlsx', index_col = 0)

In [3]:
types = data.dtypes
print(types)

Country ISO code        object
Nr. of Tax Treaties      int64
NACE Rev. 2            float64
Operating revenue      float64
Number of employees    float64
PBT                    float64
Total assets           float64
ROA                    float64
No of subsidiaries       int64
Solvency ratio         float64
Long term debt         float64
Debtors                float64
Current liabilities    float64
Size                    object
Accounting practice     object
Degree                 float64
Closeness              float64
Betweenness            float64
Eigenvector            float64
Hubs                   float64
ETR                    float64
dtype: object


In [23]:
X = data[["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities","Degree", "Closeness", "Betweenness", "Eigenvector"]]
y = data["ETR"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state= 2031996, shuffle = True)

X_train_con = X_train[["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]]
X_train_na = X_train[["Degree", "Closeness", "Betweenness", "Eigenvector"]]

scaler_con = StandardScaler()
scaler_con.fit(X_train_con)
scaler_na = StandardScaler()
scaler_na.fit(X_train_na)
StandardScaler()
X_train_con = scaler_con.transform(X_train_con)

X_train_na = pd.DataFrame(scaler_na.transform(X_train_na), index = X_train.index)
X_train_na.columns = ["Degree", "Closeness", "Betweenness", "Eigenvector"]

iim=IterativeImputer(
initial_strategy='median',
max_iter=10,
missing_values= np.nan,
random_state=2031996)

imputed_X_train_con = pd.DataFrame(iim.fit_transform(X_train_con), index = X_train.index)
imputed_X_train_con.columns = ["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]

drop_na = pd.concat([imputed_X_train_con, X_train_na, y_train], axis = 1)
drop_na = drop_na.dropna()
drop_na = pd.DataFrame(drop_na)

X_train_final = drop_na.loc[:,:"Eigenvector"]
y_train_final = drop_na['ETR']

X_test_con = X_test[["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]]
X_test_na = X_test[["Degree", "Closeness", "Betweenness", "Eigenvector"]]

X_test_con = scaler_con.transform(X_test_con)

X_test_na = pd.DataFrame(scaler_na.transform(X_test_na), index = X_test.index)
X_test_na.columns = ["Degree", "Closeness", "Betweenness", "Eigenvector"]


imputed_X_test_con = pd.DataFrame(iim.transform(X_test_con), index = X_test.index)
imputed_X_test_con.columns = ["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]

drop_na = pd.concat([imputed_X_test_con, X_test_na, y_test], axis = 1)
drop_na = drop_na.dropna()
drop_na = pd.DataFrame(drop_na)

X_test_final = drop_na.loc[:,:"Eigenvector"]
y_test_final = drop_na['ETR']

print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)
print('Imputed' , X_train_final.shape, y_train_final.shape)

Train (16986, 13) (16986,)
Test (4247, 13) (4247,)
Imputed (14904, 13) (14904,)


In [22]:
def winsorize(dataframe, upper, lower, column_to_winsor):
    dataframe = dataframe.sort_values(column_to_winsor, ascending=False, ignore_index = True)
    amount = dataframe[column_to_winsor].count()
    up = int(amount * (upper/100))
    low = int(amount * (lower/100))
    data_winsor = dataframe.loc[low:up]
    return data_winsor

data_winsor = winsorize(data, 99,  1, 'ETR') 
data_winsor = winsorize(data_winsor, 99, 1, "Operating revenue")
data_winsor = winsorize(data_winsor, 99, 1, "Total assets")
data_winsor = winsorize(data_winsor, 99, 1, "Current liabilities")
data_winsor = winsorize(data_winsor, 99, 1, "Betweenness")
data_winsor = winsorize(data_winsor, 99, 1, "Long term debt")
data_winsor = winsorize(data_winsor, 99, 1, "PBT")

X_winsor = data_winsor[["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities","Degree", "Closeness", "Betweenness", "Eigenvector"]]
y_winsor = data_winsor["ETR"]

X_train_winsor, X_test_winsor, y_train_winsor, y_test_winsor = train_test_split(X_winsor, y_winsor, test_size=0.20, random_state= 2031996, shuffle = True)

X_train_con_winsor = X_train_winsor[["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]]
X_train_na_winsor = X_train_winsor[["Degree", "Closeness", "Betweenness", "Eigenvector"]]

scaler_con_winsor = StandardScaler()
scaler_con_winsor.fit(X_train_con_winsor)
scaler_na_winsor = StandardScaler()
scaler_na_winsor.fit(X_train_na_winsor)
StandardScaler()
X_train_con_winsor = scaler_con_winsor.transform(X_train_con_winsor)

X_train_na_winsor = pd.DataFrame(scaler_na_winsor.transform(X_train_na_winsor), index = X_train_winsor.index)
X_train_na_winsor.columns = ["Degree", "Closeness", "Betweenness", "Eigenvector"]

iim=IterativeImputer(
initial_strategy='median',
max_iter=10,
missing_values= np.nan,
random_state=2031996)

imputed_X_train_con_winsor = pd.DataFrame(iim.fit_transform(X_train_con_winsor), index = X_train_winsor.index)
imputed_X_train_con_winsor.columns = ["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]

drop_na_winsor = pd.concat([imputed_X_train_con_winsor, X_train_na_winsor, y_train_winsor], axis = 1)
drop_na_winsor = drop_na_winsor.dropna()
drop_na_winsor = pd.DataFrame(drop_na_winsor)

X_train_final_winsor = drop_na_winsor.loc[:,:'Eigenvector']
y_train_final_winsor = drop_na_winsor['ETR']

X_test_con_winsor = X_test_winsor[["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]]
X_test_na_winsor = X_test_winsor[["Degree", "Closeness", "Betweenness", "Eigenvector"]]

X_test_con_winsor = scaler_con_winsor.transform(X_test_con_winsor)

X_test_na_winsor = pd.DataFrame(scaler_na_winsor.transform(X_test_na_winsor), index = X_test_winsor.index)
X_test_na_winsor.columns = ["Degree", "Closeness", "Betweenness", "Eigenvector"]

imputed_X_test_con_winsor = pd.DataFrame(iim.transform(X_test_con_winsor), index = X_test_winsor.index)
imputed_X_test_con_winsor.columns = ["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]

drop_na_winsor = pd.concat([imputed_X_test_con_winsor, X_test_na_winsor, y_test_winsor], axis = 1)
drop_na_winsor = pd.DataFrame(drop_na_winsor)

X_test_final_winsor = drop_na_winsor.loc[:,:'Eigenvector']
y_test_final_winsor = drop_na_winsor['ETR']

print('Train', X_train_winsor.shape, y_train_winsor.shape)
print('Test', X_test_winsor.shape, y_test_winsor.shape)
print('Imputed' , X_train_final_winsor.shape, y_train_final_winsor.shape)
print('Imputed' , X_test_final_winsor.shape, y_test_final_winsor.shape)

Train (11108, 13) (11108,)
Test (2778, 13) (2778,)
Imputed (11108, 13) (11108,)
Imputed (2778, 13) (2778,)


In [21]:
def winsorize(dataframe, upper, lower, column_to_winsor):
    dataframe = dataframe.sort_values(column_to_winsor, ascending=False, ignore_index = True)
    amount = dataframe[column_to_winsor].count()
    up = int(amount * (upper/100))
    low = int(amount * (lower/100))
    data_winsor = dataframe.loc[low:up]
    return data_winsor

data_winsor2 = winsorize(data, 97.5,  4, 'ETR') 
data_winsor2 = winsorize(data_winsor2, 100,  4, "Operating revenue")
data_winsor2 = winsorize(data_winsor2, 100,  4, "Total assets")
data_winsor2 = winsorize(data_winsor2, 100,  4, "Current liabilities")
data_winsor2 = winsorize(data_winsor2, 100,  4, "Betweenness")
data_winsor2 = winsorize(data_winsor2, 100,  4, "Long term debt")
data_winsor2 = winsorize(data_winsor2, 97.5,  2.5, "PBT")

X_winsor2 = data_winsor2[["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities","Degree", "Closeness", "Betweenness", "Eigenvector"]]
y_winsor2 = data_winsor2["ETR"]

X_train_winsor2, X_test_winsor2, y_train_winsor2, y_test_winsor2 = train_test_split(X_winsor2, y_winsor2, test_size=0.20, random_state= 2031996, shuffle = True)

X_train_con_winsor2 = X_train_winsor2[["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]]
X_train_na_winsor2 = X_train_winsor2[["Degree", "Closeness", "Betweenness", "Eigenvector"]]

scaler_con_winsor2 = StandardScaler()
scaler_con_winsor2.fit(X_train_con_winsor2)
scaler_na_winsor2 = StandardScaler()
scaler_na_winsor2.fit(X_train_na_winsor2)
StandardScaler()
X_train_con_winsor2 = scaler_con_winsor2.transform(X_train_con_winsor2)

X_train_na_winsor2 = pd.DataFrame(scaler_na_winsor2.transform(X_train_na_winsor2), index = X_train_winsor2.index)
X_train_na_winsor2.columns = ["Degree", "Closeness", "Betweenness", "Eigenvector"]

iim=IterativeImputer(
initial_strategy='median',
max_iter=10,
missing_values= np.nan,
random_state=2031996)

imputed_X_train_con_winsor2 = pd.DataFrame(iim.fit_transform(X_train_con_winsor2), index = X_train_winsor2.index)
imputed_X_train_con_winsor2.columns = ["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]

drop_na_winsor2 = pd.concat([imputed_X_train_con_winsor2, X_train_na_winsor2, y_train_winsor2], axis = 1)
drop_na_winsor2 = drop_na_winsor2.dropna()
drop_na_winsor2 = pd.DataFrame(drop_na_winsor2)

X_train_final_winsor2 = drop_na_winsor2.loc[:,:'Eigenvector']
y_train_final_winsor2 = drop_na_winsor2['ETR']

X_test_con_winsor2 = X_test_winsor2[["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]]
X_test_na_winsor2 = X_test_winsor2[["Degree", "Closeness", "Betweenness", "Eigenvector"]]

X_test_con_winsor2 = scaler_con_winsor2.transform(X_test_con_winsor2)

X_test_na_winsor2 = pd.DataFrame(scaler_na_winsor2.transform(X_test_na_winsor2), index = X_test_winsor2.index)
X_test_na_winsor2.columns = ["Degree", "Closeness", "Betweenness", "Eigenvector"]

imputed_X_test_con_winsor2 = pd.DataFrame(iim.transform(X_test_con_winsor2), index = X_test_winsor2.index)
imputed_X_test_con_winsor2.columns = ["Operating revenue","Number of employees",
          "PBT","Total assets","ROA","Solvency ratio","Long term debt","Debtors",
          "Current liabilities"]

drop_na_winsor2 = pd.concat([imputed_X_test_con_winsor2, X_test_na_winsor2, y_test_winsor2], axis = 1)
drop_na_winsor2 = pd.DataFrame(drop_na_winsor2)

X_test_final_winsor2 = drop_na_winsor2.loc[:,:'Eigenvector']
y_test_final_winsor2 = drop_na_winsor2['ETR']

print('Train', X_train_winsor2.shape, y_train_winsor2.shape)
print('Test', X_test_winsor2.shape, y_test_winsor2.shape)
print('Imputed' , X_train_final_winsor2.shape, y_train_final_winsor2.shape)
print('Imputed' , X_test_final_winsor2.shape, y_test_final_winsor2.shape)

Train (8877, 13) (8877,)
Test (2220, 13) (2220,)
Imputed (8876, 13) (8876,)
Imputed (2220, 13) (2220,)


In [24]:
reg = LinearRegression().fit(X_train_final, y_train_final)
reg.score(X_test_final, y_test_final)
y_pred_final = reg.predict(X_test_final)
mae = mean_absolute_error(y_test_final, y_pred_final)
mse = mean_squared_error(y_test_final, y_pred_final)
rmse = np.sqrt(mse)
d_nonan = {'With outliers': [mse, rmse, mae]}
df_nonan = pd.DataFrame(d_nonan, index = ['MSE','RMSE','MAE'])
print(df_nonan)

      With outliers
MSE   144913.925597
RMSE     380.675617
MAE       78.948933


In [25]:
reg = LinearRegression().fit(X_train_final_winsor, y_train_final_winsor)
reg.score(X_test_final_winsor, y_test_final_winsor)
y_pred_final_winsor = reg.predict(X_test_final_winsor)
mae_winsor = mean_absolute_error(y_test_final_winsor, y_pred_final_winsor)
mse_winsor = mean_squared_error(y_test_final_winsor, y_pred_final_winsor)
rmse_winsor = np.sqrt(mse_winsor)
d_winsor = {'1% outliers removed': [mse_winsor, rmse_winsor, mae_winsor]}
df_winsor = pd.DataFrame(d_winsor, index = ['MSE','RMSE','MAE'])
print(df_winsor)

      1% outliers removed
MSE            856.055934
RMSE            29.258434
MAE             17.516713


In [26]:
reg = LinearRegression().fit(X_train_final_winsor2, y_train_final_winsor2)
reg.score(X_test_final_winsor2, y_test_final_winsor2)
y_pred_final_winsor2 = reg.predict(X_test_final_winsor2)
mae_winsor2 = mean_absolute_error(y_test_final_winsor2, y_pred_final_winsor2)
mse_winsor2 = mean_squared_error(y_test_final_winsor2, y_pred_final_winsor2)
rmse_winsor2 = np.sqrt(mse_winsor2)
d_winsor2 = {'5% outliers removed': [mse_winsor2, rmse_winsor2, mae_winsor2]}
df_winsor2 = pd.DataFrame(d_winsor2, index = ['MSE','RMSE','MAE'])
print(df_winsor2)

      5% outliers removed
MSE            275.178606
RMSE            16.588508
MAE             12.730574


In [27]:
results_LR = pd.concat([df_nonan, df_winsor, df_winsor2], axis = 1)
print(results_LR)

      With outliers  1% outliers removed  5% outliers removed
MSE   144913.925597           856.055934           275.178606
RMSE     380.675617            29.258434            16.588508
MAE       78.948933            17.516713            12.730574


In [28]:
results_LR.to_excel("results_LR_with_NA.xlsx")