In [1]:
!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/1d/e7/5258cb787dc036f419ec57491decf8bfa89ab52c401b08b4b9228e43dc4c/xgboost-0.81-py2.py3-none-win_amd64.whl (7.4MB)


You are using pip version 18.1, however version 19.0.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the white-wine data from the repository's data folder into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../data/white-wine.csv')

In [3]:
# Peek at the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


<img src="../images/icon/ppt-icons.png" alt="Technical-Stuff" style="width: 100px;float:left; margin-right:15px"/>
<br />

### Mini Challenge - 1
***
Split the white-wine dataset into train and test set with `test_size = 0.3`

In [4]:
from sklearn.model_selection import train_test_split as tts

# Features are every column except the last one (the `quality` target).
# The CSV's unnamed first column (loaded by pandas as 'Unnamed: 0') looks like
# a saved row index rather than a wine measurement, so it is kept out of the
# feature matrix. errors='ignore' keeps this cell working if the column is
# absent (e.g. when the file is read with index_col=0).
X = df.iloc[:, :-1].drop(columns=['Unnamed: 0'], errors='ignore')
y = df.iloc[:, -1]

# Hold out 30% of the rows for testing. The fixed seed makes the split, and
# therefore every score reported below, reproducible on Restart & Run All.
x_train, x_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=42)

<img src="../images/icon/ppt-icons.png" alt="Technical-Stuff" style="width: 100px;float:left; margin-right:15px"/>
<br />

### Mini Challenge - 2
***
For this challenge, you will have to install the `xgboost` library. Import `xgboost`, instantiate `XGBClassifier()` and fit it on the training set

In [5]:
import xgboost as xgb

# Baseline classifier: every hyperparameter is left at the library default.
model1 = xgb.XGBClassifier()

# fit() is the last expression so the fitted estimator is shown as cell output.
model1.fit(X=x_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

<img src="../images/icon/ppt-icons.png" alt="Technical-Stuff" style="width: 100px;float:left; margin-right:15px"/>
<br />

### Mini Challenge - 3
***
Use the model to predict on the test set and check the accuracy score

In [6]:
from sklearn.metrics import accuracy_score

# Predicted quality labels for the held-out rows.
y_pred = model1.predict(x_test)

# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.5755102040816327

<img src="../images/icon/ppt-icons.png" alt="Technical-Stuff" style="width: 100px;float:left; margin-right:15px"/>
<br />

### Mini Challenge - 4
***
Now let's do some hyperparameter tuning by defining the parameters `'colsample_bytree': np.linspace(0.5, 0.9, 5)`, `'n_estimators':[5, 10]`, `'max_depth': [10, 15, 20, 25]` and then passing them to a GridSearchCV model with `scoring = 'neg_mean_squared_error'` and the cross-validation parameter `cv = 5`<br/><br/>
Feel free to experiment on the hyperparameters with the help of the table mentioned in the **XGBoost Model Configuration** part 

In [7]:
from sklearn.model_selection import GridSearchCV

# Search space: 5 column-sampling ratios x 2 ensemble sizes x 4 tree depths,
# i.e. 40 candidate configurations.
gbm_param_grid = dict(
    colsample_bytree=np.linspace(0.5, 0.9, 5),
    n_estimators=[5, 10],
    max_depth=[10, 15, 20, 25],
)

# 5-fold grid search over model1's hyperparameters, ranked by negated mean
# squared error as the exercise requires. Nothing is fitted in this cell.
grid_mse = GridSearchCV(
    estimator=model1,
    param_grid=gbm_param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)

<img src="../images/icon/ppt-icons.png" alt="Technical-Stuff" style="width: 100px;float:left; margin-right:15px"/>
<br />

### Mini Challenge - 5
***
Fit the GridSearchCV model on the train dataset and get the best parameters and the lowest RMSE value

In [8]:
# Run the full grid search on the training split.
grid_mse.fit(x_train, y_train)

best_params = grid_mse.best_params_
# best_score_ holds the negated MSE; take its magnitude and square-root it
# to report an RMSE.
lowest_rmse = np.sqrt(np.abs(grid_mse.best_score_))

print("Best parameters found: ", best_params)
print("Lowest RMSE found: ", lowest_rmse)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 200 out of 200 | elapsed:   57.4s finished


Best parameters found:  {'colsample_bytree': 0.5, 'max_depth': 25, 'n_estimators': 10}
Lowest RMSE found:  0.7308424511808784


<img src="../images/icon/ppt-icons.png" alt="Technical-Stuff" style="width: 100px;float:left; margin-right:15px"/>
<br />

### Mini Challenge - 6
***
Make a prediction on the test dataset and print the accuracy

In [9]:
# Predict on the held-out rows with the fitted grid search object.
pred = grid_mse.predict(x_test)

# Accuracy of the tuned model on the same held-out rows.
accuracy_score(y_true=y_test, y_pred=pred)

0.6374149659863946