In [60]:
# Import needed libraries
import pandas as pd
import numpy as np
import xgboost as xgb

In [61]:
# Read the train and test CSV files (in that order) into DataFrames,
# using the first column of each file as the row index.
train_data, test_data = (
    pd.read_csv(csv_name, index_col=0)
    for csv_name in ('playground_train.csv', 'playground_test.csv')
)

In [62]:
# 'Class' is the label column; every other training column is a model input.
y_train = train_data['Class']
X_train = train_data.drop('Class', axis=1)

# The test frame is used unchanged as the prediction input.
# Note: this is the same object as test_data, not a copy.
X_test = test_data

In [63]:
# Sanity check: show (rows, columns) for the training features, the training
# target, and the test features, in that order.
tuple(frame.shape for frame in (X_train, y_train, X_test))

((219129, 30), (219129,), (146087, 30))

In [103]:
# Import necessary modules from scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Two-step estimator: standardize the features, then classify with XGBoost.
# The step names ("scale", "xgb") are the prefixes used to address each step's
# parameters, e.g. "xgb__max_depth" in the hyperparameter grid.
pipeline = Pipeline(
    steps=[
        ("scale", StandardScaler()),
        ("xgb", xgb.XGBClassifier()),
    ]
)

# Fit scaler and classifier (default hyperparameters) on the full training set.
# NOTE(review): GridSearchCV clones the estimator it is given, so this fit is
# not reused by the later hyperparameter search.
pipeline.fit(X=X_train, y=y_train)


In [107]:
# Import necessary module for grid search
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters for the "xgb" pipeline step. The "xgb__" prefix
# routes each value to the XGBoost classifier inside the pipeline.
# 2 depths x 3 tree counts x 2 learning rates = 12 combinations.
param_grid = dict(
    xgb__max_depth=[4, 5],
    xgb__n_estimators=[500, 600, 700],
    xgb__learning_rate=[0.01, 0.015],
)

In [108]:
# Exhaustive cross-validated search over every combination in param_grid,
# applied to the whole pipeline. Cross-validation strategy and scoring metric
# are left at GridSearchCV's defaults.
search = GridSearchCV(estimator=pipeline, param_grid=param_grid)

In [109]:
# Run the grid search on the training data: every parameter combination in
# param_grid is evaluated with cross-validation to find the best one.
# This is the expensive step of the notebook (many full pipeline fits).
search.fit(X_train, y_train)

In [110]:
# Show the hyperparameter combination that scored best during the grid search
search.best_params_

{'xgb__learning_rate': 0.01, 'xgb__max_depth': 5, 'xgb__n_estimators': 700}

In [112]:
# Take the best estimator found by the grid search. Because the search was run
# over `pipeline`, this is the whole pipeline (scaler + XGBoost classifier)
# configured with the best parameters, not just the XGBoost step.
final_xgb = search.best_estimator_

In [114]:
# Predict the target for the test data with the tuned pipeline; the pipeline
# applies its scaling step to X_test before the XGBoost classifier predicts.
predictions = final_xgb.predict(X_test)

In [115]:
# Build the submission table: one row per test record, pairing the test
# DataFrame's index ("Id") with the predicted "Class" value.
output = pd.DataFrame(
    data={
        "Id": test_data.index,
        "Class": predictions,
    }
)

# Write the table to disk without the DataFrame's own positional index.
output.to_csv(path_or_buf='submission.csv', index=False)