In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# Step 1: Load the data
data = pd.read_csv('ac.csv')

# Step 2: Separate features and labels
X = data.iloc[:, :-1]  # Features (all columns except the last one)
y = data.iloc[:, -1]   # Labels (the last column)

# Adjust labels to start from 0, if necessary
y = y - 1

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Define the XGBoost classifier and parameter grid
clf = XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss')

param_grid = {
    'max_depth': [3, 4, 5],
    'alpha': [1, 10],
    'learning_rate': [0.01, 0.1, 1.0],
    'n_estimators': [100, 200]
}

# Step 5: Perform GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Step 6: Evaluate the best estimator found by GridSearchCV
best_clf = grid_search.best_estimator_

# Perform cross-validation on the training set
cv_scores = cross_val_score(best_clf, X_train, y_train, cv=5, scoring='accuracy')

# Print cross-validation scores
print(f'Cross-validation scores: {cv_scores}')
print(f'Mean cross-validation score: {cv_scores.mean():.2f}')

# Step 7: Evaluate the classifier on the test set
y_pred = best_clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy on test set: {accuracy:.2f}')

# Print detailed classification report
print(classification_report(y_test, y_pred))

# Print the best parameters found by GridSearchCV
print(f'Best parameters: {grid_search.best_params_}')


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Cross-validation scores: [0.54609929 0.56028369 0.55319149 0.57446809 0.52857143]
Mean cross-validation score: 0.55
Accuracy on test set: 0.59
              precision    recall  f1-score   support

           0       0.54      0.71      0.61        82
           1       0.65      0.48      0.55        94

    accuracy                           0.59       176
   macro avg       0.60      0.59      0.58       176
weighted avg       0.60      0.59      0.58       176

Best parameters: {'alpha': 10, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 100}
