# ANA680_Week_3

### Building XGB Model

## Set up environment

In [1]:
import pandas as pd
#import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, confusion_matrix, 
                             precision_score, recall_score, f1_score)
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import pickle
from ucimlrepo import fetch_ucirepo

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


## Load and inspect data

In [2]:
# Download the Wine Quality dataset (UCI repository id 186) and keep the
# returned bundle around; later cells read both its features and its targets.
wine_quality = fetch_ucirepo(id=186)

# Preview the feature table (pandas truncates the display on its own).
features_preview = wine_quality.data.features
features_preview

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8


In [3]:
# Pull out the label table and tally how many rows carry each quality score,
# to see how balanced the classes are.
target = wine_quality.data.targets
quality_counts = target.value_counts()
quality_counts

quality
6          2836
5          2138
7          1079
4           216
8           193
3            30
9             5
dtype: int64

## Split into Train (80%) and Test (20%)

In [4]:
# Separate predictors from the label. squeeze() collapses the single-column
# target frame into a 1-D Series, which is the shape the estimators expect.
dataset = wine_quality.data
X, y = dataset.features, dataset.targets.squeeze()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified, and the rarest quality score has
# only a handful of rows -- consider stratify=y so every class is guaranteed to
# appear in both partitions (this would change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=680,
)

## XGBOOST

In [5]:
# Map the quality scores onto consecutive integer class ids; the classifier is
# trained on these ids rather than on the raw scores.
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
# Encoding the test labels with the train-fitted encoder doubles as a sanity
# check: scikit-learn raises ValueError if the test split contains a score
# that never occurred in the training split.
y_test_encoded = le.transform(y_test)

# Train XGBoost
# The target has more than two classes, so the multiclass log loss
# ('mlogloss') is the matching metric; 'logloss' is its binary counterpart.
model_xgb = XGBClassifier(eval_metric='mlogloss')
model_xgb.fit(X_train, y_train_encoded)
y_pred_encoded = model_xgb.predict(X_test)

# Decode predictions back to the original quality scores so they can be
# compared directly with y_test.
y_pred = le.inverse_transform(y_pred_encoded)

# Evaluate
# Precision, recall and F1 are averaged over classes weighted by support;
# zero_division=0 scores a class with no predicted samples as 0 instead of
# emitting a warning.
weighted_kwargs = {"average": "weighted", "zero_division": 0}
xgb_accuracy = accuracy_score(y_test, y_pred)
xgb_precision = precision_score(y_test, y_pred, **weighted_kwargs)
xgb_recall = recall_score(y_test, y_pred, **weighted_kwargs)
xgb_f1 = f1_score(y_test, y_pred, **weighted_kwargs)

# Report every metric rounded to four decimals.
for metric_name, metric_value in (
    ("Accuracy", xgb_accuracy),
    ("Precision", xgb_precision),
    ("Recall", xgb_recall),
    ("F1-score", xgb_f1),
):
    print(f"XGBoost {metric_name}: {round(metric_value, 4)}")

XGBoost Accuracy: 0.6592
XGBoost Precision: 0.6572
XGBoost Recall: 0.6592
XGBoost F1-score: 0.6482


## Pickle It

In [6]:
# Persist everything needed to serve predictions. The model was trained on
# LabelEncoder ids, so the fitted encoder is stored next to it; without it a
# consumer of the bundle could not translate predicted ids back into quality
# scores. The existing "model" key is unchanged, so current readers keep working.
# NOTE: only unpickle this file from a trusted source -- pickle.load can
# execute arbitrary code.
xgb_bundle = {
    "model": model_xgb,
    "label_encoder": le,
}
with open("xgb_bundle.pkl", "wb") as f:
    pickle.dump(xgb_bundle, f)

# END