In [1]:
# Computations
import numpy as np
import pandas as pd

# sklearn
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import StackingRegressor, GradientBoostingRegressor, RandomForestRegressor,\
                             BaggingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.compose import make_column_transformer
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config('diagram')

# Visualisation libraries

## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output

## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})

## progressbar
import progressbar

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline


<div class="alert alert-block alert-info">
<font size="+3"><b>
Liver Disorders Dataset
</b></font>
</div>

In this article, we analyze the 
[Liver Disorders Dataset](https://archive.ics.uci.edu/ml/datasets/liver+disorders) from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). 


<img src='https://www.niddk.nih.gov/-/media/Images/Research-Areas/liver_sized.jpg?h=300&la=en&w=200&hash=3D132E712774BA63344950C19D492C4A' height='500' align="center"/>

**Picture Source**: [niddk.nih.gov](https://www.niddk.nih.gov/about-niddk/research-areas/liver-disease)


## Dataset Information:
The first five variables are all blood tests thought to be sensitive to liver disorders that might arise from excessive alcohol consumption. Each line in the dataset constitutes the record of a single male individual.

Important note: The 7th field (selector) has been widely misinterpreted in the past as a dependent variable representing the presence or absence of a liver disorder. This is incorrect [1]. The 7th field was created by BUPA researchers as a train/test selector. It is not suitable as a dependent variable for classification. The dataset does not contain any variable representing the presence or absence of a liver disorder. Researchers who wish to use this dataset as a classification benchmark should follow the method used in experiments by the donor (Forsyth & Rada, 1986, Machine learning: applications in expert systems and information retrieval) and others (e.g. Turney, 1995, Cost-sensitive classification: Empirical evaluation of a hybrid genetic decision tree induction algorithm), who used the 6th field (drinks), after dichotomizing, as a dependent variable for classification. Because of widespread misinterpretation in the past, researchers should take care to state their method clearly.



## Attribute Information:

| Attribute | Information                                                          |
|-----------|----------------------------------------------------------------------|
| MCV       | Mean corpuscular volume                                              |
| AlkPhos   | Alkaline Phosphotase                                                 |
| Sgpt      | Alamine Aminotransferase                                             |
| Sgot      | Aspartate Aminotransferase                                           |
| GammaGT   | Gamma-Glutamyl Transpeptidase                                        |
| Drinks    | Number of half-pint equivalents of alcoholic beverages drunk per day |
| Selector  | Field used to split data into two sets                               |

In [2]:
Data = np.genfromtxt('liver-disorders/bupa.data', delimiter=',', dtype = int)
Attributes = ['MCV', 'AlkPhos', 'Sgpt', 'Sgot', 'GammaGT', 'Drinks', 'Selector']
Data = pd.DataFrame(data = Data, columns = Attributes)
display(Data.head())

Unnamed: 0,MCV,AlkPhos,Sgpt,Sgot,GammaGT,Drinks,Selector
0,85,92,45,27,31,0,1
1,85,64,59,32,23,0,2
2,86,54,33,16,54,0,2
3,91,78,34,24,36,0,2
4,87,70,12,28,10,0,2


# Problem Description

In this article, the dependent variable is the number of drinks. Note that, Selector column is intended to split the data into train and test subsets for one particular experiment.

# Train and Test sets

In [3]:
def DataSize(Inp):
    return pd.DataFrame({'Number of Instances': [Inp.shape[0]], 'Number of Attributes': [Inp.shape[1]]}).style.hide_index()
def Header(Text, L = 100, C1 = Back.BLUE, C2 = Fore.BLUE):
    print(C1 + Fore.WHITE + Style.NORMAL + Text + Style.RESET_ALL + ' ' + C2 +
          Style.NORMAL +  (L- len(Text) - 1)*'=' + Style.RESET_ALL)
def Line(L=100, C = Fore.BLUE): print(C + Style.NORMAL + L*'=' + Style.RESET_ALL)

    
Target = 'Drinks'

# X and y sets
X = Data.drop(columns = 'Selector')
y = X.pop(Target)

# Train Set
Header(Text = 'Train')
Train = Data.loc[Data.Selector == 2].drop(columns = 'Selector').reset_index(drop = True)
display(DataSize(Train))
X_train = Train.copy()
y_train = X_train.pop(Target)

# Test Set
Header(Text = 'Test', C1 = Back.GREEN, C2 = Fore.GREEN)
Test = Data.loc[Data.Selector == 1].drop(columns = 'Selector').reset_index(drop = True)
display(DataSize(Test))
X_test = Test.copy()
y_test = X_test.pop(Target)
Line()

display(pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
               'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).set_index('Set').T)



Number of Instances,Number of Attributes
200,6




Number of Instances,Number of Attributes
145,6




Set,X_train,X_test,y_train,y_test
Shape,"(200, 5)","(145, 5)","(200,)","(145,)"


# Regressors

In this section, we test a number of efficient scikit-learn regressors. Then from those that have performed well, a stacked model can be formed. In particular, we use the following models:

| Regressor                   | Link                                                                                                                                                               |
|-----------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| AdaBoost Regressor          | [sklearn.ensemble.AdaBoostRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html#sklearn.ensemble.AdaBoostRegressor) |
| Bagging Regressor          | [sklearn.ensemble.BaggingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)                                      |
| Decision Tree Regressor     | [sklearn.tree.DecisionTreeRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)                                    |
| Gradient Boosting Regressor | [sklearn.ensemble.GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)                    |
| MLP Regressor               | [sklearn.neural_network.MLPRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html)                                  |
| Random Forest Regressor     | [sklearn.ensemble.RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)                            |

In [41]:
import xgboost as xgb
data_dmatrix = xgb.DMatrix(data=X,label=y)

xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [42]:
xg_reg.fit(X_train,y_train)

preds = xg_reg.predict(X_test)



In [43]:
rmse = np.sqrt(metrics.mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 4.078785


In [44]:
 metrics.r2_score(y_test, preds)

-0.007913061479751349

In [36]:
# creating an empty dictionary
params = {}
params["silent"] = 1
params["eta"] = 0.11
params["min_child_weight"] = 11
params["scale_pos_weight"] = 0.7
params["seed"] = 0
params["base_score"] = 1800
params["subsample"] = 0.6
params["colsample_bytree"] = 0.6

parm_list = list(params.items())
Steps = 1500

In [37]:
xgtrain = xgb.DMatrix(X_train, label=y_train)
model = xgb.train(parm_list, xgtrain, Steps)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [40]:
Predictions = model.predict(xgb.DMatrix(X_test))
display(pd.DataFrame(np.array([[metrics.mean_squared_error(y_test, Predictions), metrics.r2_score(y_test, Predictions)]]),
                       columns=['MSE', 'Variance score']))
print('Minimum = %f, Maximum = %f'% (Predictions.min(), Predictions.max()))

Unnamed: 0,MSE,Variance score
0,16.78403,-0.016852


Minimum = -2.845947, Maximum = 10.615723


***
# References
1. McDermott, J. and Forsyth, R.S., 2016. Diagnosing a disorder in a classification benchmark. Pattern Recognition Letters, 73, pp.41-43.
***