In [1]:
# Install necessary libraries
!pip install xgboost pandas scikit-learn

Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl (190.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.1+cu121 requires nvidia-cublas-cu12==12.1.3.1; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.1+cu121 requires nvidia-cuda-cupti-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.1+cu121 requires nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", which is not installed

In [2]:


# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score
import numpy as np
import xgboost as xgb

# Load the dataset
path = 'Ready_to_train.csv'
data = pd.read_csv(path)

# Drop samples with missing values
data.dropna(inplace=True)

# Get names of indexes for which column difference_in_days has value -1
indexNames = data[data['difference_in_days'] == -1].index
# Delete these row indexes from DataFrame
data.drop(indexNames, inplace=True)

# Update 'difference_in_days' values
data.loc[data.difference_in_days == 0, 'difference_in_days'] = 1
data.loc[data.difference_in_days == 2, 'difference_in_days'] = 1
data.loc[data.difference_in_days == 3, 'difference_in_days'] = 1
data.loc[data.difference_in_days > 3, 'difference_in_days'] = 0

# Separate features and target variable
X = data.drop('difference_in_days', axis=1)
y = data['difference_in_days']

# Initialize XGBoost classifier
model = xgb.XGBClassifier(eval_metric='logloss')

# Define hyperparameters grid
param_grid = {
    'learning_rate': [0.01, 0.1, 0.3],  # 'eta' is now 'learning_rate'
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'n_estimators': [100, 200, 300]
}

# Set up GridSearchCV with 5-fold cross-validation for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1)

# Fit the model to find the best hyperparameters
grid_search.fit(X, y)

# Best parameters and best score
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Perform 5-fold cross-validation manually to calculate mean F1-score for the predictions
cv = KFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = []
accuracy_scores = []
precision_scores = []
recall_scores = []

best_estimator = grid_search.best_estimator_

for train_index, test_index in cv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    best_estimator.fit(X_train, y_train)
    y_pred = best_estimator.predict(X_test)

    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    f1_scores.append(f1)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)
    recall_scores.append(recall)

    # Optionally, print classification report for each fold
    print(classification_report(y_test, y_pred))

# Calculate mean and standard deviation of F1-scores, accuracy, precision, and recall
mean_f1 = np.mean(f1_scores)
mean_accuracy = np.mean(accuracy_scores)
mean_precision = np.mean(precision_scores)
mean_recall = np.mean(recall_scores)

print(f"Mean F1-score: {mean_f1}")
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean Precision: {mean_precision}")
print(f"Mean Recall: {mean_recall}")


Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best Parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.6}
Best Score: 0.9143989692699233


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4910
           1       0.91      1.00      0.95     51903

    accuracy                           0.91     56813
   macro avg       0.46      0.50      0.48     56813
weighted avg       0.83      0.91      0.87     56813



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4797
           1       0.92      1.00      0.96     52016

    accuracy                           0.92     56813
   macro avg       0.46      0.50      0.48     56813
weighted avg       0.84      0.92      0.88     56813



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4867
           1       0.91      1.00      0.96     51945

    accuracy                           0.91     56812
   macro avg       0.46      0.50      0.48     56812
weighted avg       0.84      0.91      0.87     56812



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4866
           1       0.91      1.00      0.96     51946

    accuracy                           0.91     56812
   macro avg       0.46      0.50      0.48     56812
weighted avg       0.84      0.91      0.87     56812

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      4876
           1       0.91      1.00      0.96     51936

    accuracy                           0.91     56812
   macro avg       0.46      0.50      0.48     56812
weighted avg       0.84      0.91      0.87     56812

Mean F1-score: 0.8735123670641511
Mean Accuracy: 0.9143989680306355
Mean Precision: 0.8361258917398077
Mean Recall: 0.9143989680306355


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
