# Video 1: First Model

In [2]:
import pandas as pd

from ucimlrepo import fetch_ucirepo

# Fetch dataset 464 from the UCI ML repository (this needs network access).
superconductivty_data = fetch_ucirepo(id=464)

# Features and targets are exposed as pandas DataFrames; unpack both at once.
X, y = superconductivty_data.data.features, superconductivty_data.data.targets

# Join features and targets into a single frame for inspection.
df = X.join(y)

df.head()

IncompleteRead: IncompleteRead(23408565 bytes read)

The download failed with an `IncompleteRead` error (the transfer stopped partway through). I'm not sure why, so I will use the heart disease dataset instead.

In [3]:
import pandas as pd

# Load the heart disease CSV directly from its hosted URL (needs network access).
df = pd.read_csv(
    'https://media.githubusercontent.com/media/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/refs/heads/master/Chapter02/heart_disease.csv'
)

# The last column is the label; every column before it is a feature.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Baseline: 5-fold cross-validated accuracy of XGBoost with default hyperparameters.
model = XGBClassifier()
scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)

In [5]:
# Accuracy on each of the 5 cross-validation folds.
print(scores)

[0.81967213 0.80327869 0.7704918  0.78333333 0.76666667]


In [7]:
# Mean accuracy across the cross-validation folds.
print(scores.mean())

0.7886885245901639


# Video 2: Comparing Models

In [8]:
def clf(model):
    """Cross-validate ``model`` on the notebook-level ``X`` / ``y`` and print the result.

    Runs 5-fold cross-validation scored by accuracy, then prints the
    per-fold score array followed by its mean. Nothing is returned.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier to evaluate.
    """
    fold_accuracy = cross_val_score(model, X, y, scoring='accuracy', cv=5)
    for line in (fold_accuracy, fold_accuracy.mean()):
        print(line)

In [9]:
# XGBoost with default hyperparameters, scored through the clf() helper.
clf(XGBClassifier())

[0.81967213 0.80327869 0.7704918  0.78333333 0.76666667]
0.7886885245901639


In [10]:
from sklearn.linear_model import LogisticRegression

# Logistic regression for comparison. max_iter is set explicitly to 5000,
# presumably so the solver converges on these unscaled features (TODO confirm).
clf(LogisticRegression(max_iter = 5000))

[0.80327869 0.86885246 0.85245902 0.86666667 0.75      ]
0.8282513661202187


In [11]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default settings. The estimator is seeded so the
# cross-validated scores are the same on every run of the notebook.
clf(DecisionTreeClassifier(random_state=42))

[0.75409836 0.81967213 0.73770492 0.73333333 0.7       ]
0.7489617486338798


In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default settings. Bootstrapping and feature sampling are
# random, so the estimator is seeded to make the scores reproducible.
clf(RandomForestClassifier(random_state=42))

[0.83606557 0.85245902 0.81967213 0.78333333 0.78333333]
0.8149726775956283


In [13]:
# XGBoost restricted to depth-1 trees, to compare against the default-depth run.
clf(XGBClassifier(max_depth = 1))

[0.86885246 0.86885246 0.80327869 0.83333333 0.76666667]
0.8281967213114754


In [19]:
# Shallow decision tree (depth 3), seeded so the scores are reproducible.
clf(DecisionTreeClassifier(max_depth=3, random_state=42))

[0.80327869 0.85245902 0.80327869 0.81666667 0.75      ]
0.8051366120218578


In [20]:
# Shallow random forest (depth 2), seeded so the scores are reproducible.
clf(RandomForestClassifier(max_depth=2, random_state=42))

[0.85245902 0.90163934 0.86885246 0.86666667 0.76666667]
0.8512568306010928


# Video 3: GridSearchCV

In [22]:
from sklearn.model_selection import GridSearchCV

# Search space: tree depth crossed with the row-subsampling ratio.
params = dict(
    max_depth=[1, 2, 3, 4],
    subsample=[1, 0.85, 0.7, 0.55],
)

model = XGBClassifier()

# Exhaustive search over every combination in `params`, each scored by
# 5-fold cross-validated accuracy on the full X / y.
grid_clf = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy')
grid_clf.fit(X, y)

# Keep the winning combination and its mean cross-validated accuracy.
best_params, best_score = grid_clf.best_params_, grid_clf.best_score_

print(best_params)
print(best_score)

{'max_depth': 1, 'subsample': 1}
0.8281967213114754


In [23]:
def grid(params: dict[str, list], model: XGBClassifier) -> None:
    """Grid-search ``model`` over ``params`` and print the best result.

    Fits a ``GridSearchCV`` (5-fold, scored by accuracy) on the
    notebook-level ``X`` / ``y``, then prints the best parameter
    combination followed by its mean cross-validated accuracy.

    Parameters
    ----------
    params : dict[str, list]
        Maps each hyperparameter name to the list of values to try.
    model : XGBClassifier
        Unfitted estimator to tune.
    """
    grid_clf = GridSearchCV(
        model,
        params,
        scoring = 'accuracy',
        cv = 5
    )
    grid_clf.fit(X, y)

    print(grid_clf.best_params_)
    print(grid_clf.best_score_)

In [None]:
# Search tree depth together with per-tree and per-node column sampling.
# Each key must be distinct: a repeated key in a dict literal silently
# overwrites the earlier entry, which would drop one dimension of the search.
params = {
    'max_depth':[1,2],
    'colsample_bytree':[1,0.8,0.6],
    'colsample_bynode':[1,0.8,0.6]
}
# Start from a fresh default XGBoost classifier and search the grid defined above.
model = XGBClassifier()
grid(params, model)

{'colsample_bynode': 1, 'colsample_bytree': 1, 'max_depth': 1}
0.8281967213114754


In [None]:
# Search tree depth together with the number of estimators.
params = dict(max_depth=[1, 2], n_estimators=[50, 100, 200])
# Start from a fresh default XGBoost classifier and search the grid defined above.
model = XGBClassifier()
grid(params, model)

{'max_depth': 1, 'n_estimators': 100}
0.8281967213114754


# Video 4: Fine Tuning

In [None]:
# One candidate value per hyperparameter, so this "grid" holds a single combination.
new_params = dict(
    max_depth=[4],
    subsample=[0.8],
    colsample_bytree=[0.6],
    colsample_bynode=[0.4],
    colsample_bylevel=[0.8],
    learning_rate=[0.1],
    min_child_weight=[10],
    gamma=[1.0],
)
# Fresh XGBoost classifier with default settings, to be tuned by grid().
model = XGBClassifier()

In [46]:
# Score the candidate values currently listed in new_params with 5-fold cross-validation.
grid(new_params, model)

{'colsample_bylevel': 0.8, 'colsample_bynode': 0.4, 'colsample_bytree': 0.6, 'gamma': 1.0, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 10, 'subsample': 0.8}
0.8513114754098361


In [None]:
# Tuned a couple of hyperparameters at a time; this records the best value found for each.
best_params_result = dict(
    max_depth=[4],
    subsample=[0.8],
    colsample_bytree=[0.6],
    colsample_bynode=[0.4],
    colsample_bylevel=[0.8],
    learning_rate=[0.1],
    min_child_weight=[10],
    gamma=[1.0],
)

# Video 5: RandomizedSearchCV

In [49]:
# Videos 4 and 5 may be in the wrong order: video 4 seemed to skip a step,
# and video 5 appears to cover the material that should have come before it.
from sklearn.model_selection import RandomizedSearchCV

# Wide search space for the randomized search: several candidates per hyperparameter.
params = dict(
    max_depth=[1, 2, 3, 4, 5, 6, 7],
    subsample=[0.2, 0.4, 0.6, 0.8, 1],
    colsample_bytree=[0.2, 0.4, 0.6, 0.8, 1],
    colsample_bynode=[0.2, 0.4, 0.6, 0.8],
    colsample_bylevel=[0.4, 0.6, 0.8, 1],
    learning_rate=[0.05, 0.1, 0.2, 0.4],
    min_child_weight=[1, 2, 3, 4, 10],
    gamma=[0.2, 0.4, 0.6, 0.8, 1.0],
)

def grid(
    params: dict[str, list],
    model: XGBClassifier,
    random: bool = False,
    n_iter: int = 20,
    random_state = None,
) -> None:
    """Tune ``model`` over ``params`` and print the best result.

    Fits either an exhaustive ``GridSearchCV`` or a ``RandomizedSearchCV``
    (both 5-fold, scored by accuracy) on the notebook-level ``X`` / ``y``,
    then prints the best parameter combination followed by its mean
    cross-validated accuracy. Nothing is returned.

    Parameters
    ----------
    params : dict[str, list]
        Maps each hyperparameter name to the list of candidate values.
    model : XGBClassifier
        Unfitted estimator to tune.
    random : bool, default False
        If True, sample candidates with ``RandomizedSearchCV`` instead of
        trying every combination.
    n_iter : int, default 20
        Number of candidates sampled when ``random`` is True; ignored otherwise.
    random_state : int or None, default None
        Seed for the randomized sampling. Pass an integer to get the same
        candidates (and therefore the same printed result) on every run;
        ignored when ``random`` is False.
    """
    # The search object is named `search`, not `clf`, so it is not confused
    # with the notebook's clf() scoring helper.
    if random:
        search = RandomizedSearchCV(
            model,
            params,
            scoring = 'accuracy',
            cv = 5,
            n_iter = n_iter,
            random_state = random_state
        )
    else:
        search = GridSearchCV(
            model,
            params,
            scoring = 'accuracy',
            cv = 5
        )
    search.fit(X, y)

    print(search.best_params_)
    print(search.best_score_)

In [50]:
# Randomized search over the wide `params` space, using the estimator bound to
# `model` by an earlier cell. Candidates are sampled rather than enumerated.
grid(params, model, random = True)

{'subsample': 0.8, 'min_child_weight': 10, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.6, 'colsample_bynode': 0.4, 'colsample_bylevel': 0.8}
0.8546448087431694


In [57]:
from sklearn.model_selection import train_test_split

# Rebuild the classifier with the hyperparameters reported by the randomized search.
model = XGBClassifier(
    subsample = 0.8,
    min_child_weight = 10,
    max_depth = 6,
    learning_rate = 0.1,
    gamma = 0.2,
    colsample_bytree = 0.6,
    colsample_bynode = 0.4,
    colsample_bylevel = 0.8
)

# Seed the split so the reported score is reproducible, and stratify on the
# label so train and test keep the same class balance.
# NOTE: the hyperparameters above were selected by cross-validating on all of
# X / y, which includes these test rows, so this hold-out score is optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, random_state = 42
)

model.fit(X_train,y_train)
model.score(X_test,y_test)

0.881578947368421