# TFIDF XGBoost Model

In [56]:
from joblib import load, dump
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import os

np.random.seed(478)
X_train_tfidf = load("model_train_tfidf.pkl")
y_train_tfidf = load("target.pkl")

X_test_tfidf = load("model_test_tfidf.pkl")

test_data = pd.read_csv("Data/test.csv") 

# Split 80% train 20% validation
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, y_train_tfidf, test_size=0.2, random_state=42)

xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1.0,
    eval_metric='logloss', 
    early_stopping_rounds=50,
)

# Train model
xgb_model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              verbose=True)

# predictions on the val set
y_val_preds = xgb_model.predict(X_val)

f1 = f1_score(y_val, y_val_preds)


model_path = "xgb_model_tfidf.pkl" #Save
dump(xgb_model, model_path)

 
f1 # return F1

[0]	validation_0-logloss:0.66106
[1]	validation_0-logloss:0.64428
[2]	validation_0-logloss:0.63010
[3]	validation_0-logloss:0.62040
[4]	validation_0-logloss:0.61017
[5]	validation_0-logloss:0.60212
[6]	validation_0-logloss:0.59591
[7]	validation_0-logloss:0.58851
[8]	validation_0-logloss:0.58420
[9]	validation_0-logloss:0.57921
[10]	validation_0-logloss:0.57485
[11]	validation_0-logloss:0.57100
[12]	validation_0-logloss:0.56863
[13]	validation_0-logloss:0.56590
[14]	validation_0-logloss:0.56314
[15]	validation_0-logloss:0.56103
[16]	validation_0-logloss:0.55881
[17]	validation_0-logloss:0.55700
[18]	validation_0-logloss:0.55524
[19]	validation_0-logloss:0.55324
[20]	validation_0-logloss:0.55220
[21]	validation_0-logloss:0.55043
[22]	validation_0-logloss:0.54914
[23]	validation_0-logloss:0.54759
[24]	validation_0-logloss:0.54670
[25]	validation_0-logloss:0.54567
[26]	validation_0-logloss:0.54483
[27]	validation_0-logloss:0.54366
[28]	validation_0-logloss:0.54245
[29]	validation_0-loglos

0.7069243156199678

In [57]:

y_test_preds = xgb_model.predict(X_test_tfidf) # Predictions for submission

test_data_filtered = test_data.iloc[:len(y_test_preds)].copy()

submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})

submission.to_csv("xgboost_tfidf_submission.csv", index=False)

print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       1
3   9       0
4  11       1


# TFIDF Model With Optuna Hyperparameter Optimization

In [58]:
import optuna
from sklearn.model_selection import cross_val_score
def objective(trial):
    # Define search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0)
    }

    model = XGBClassifier(**params, eval_metric='logloss')
    f1 = cross_val_score(model, X_train, y_train, cv=3, scoring='f1').mean()
    
    return f1

# Optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

print("Best Hyperparameters:", study.best_params)

# Train final model using best parameters
best_params = study.best_params
xgb_model = XGBClassifier(**best_params, eval_metric='logloss')
xgb_model.fit(X_train_tfidf, y_train_tfidf)

dump(xgb_model, "xgb_model_optimized_tfidf.pkl")# Save


[I 2025-02-24 14:22:30,521] A new study created in memory with name: no-name-5f1bb6ec-fb2b-4627-9d77-528fd044c3bc
[I 2025-02-24 14:22:37,253] Trial 0 finished with value: 0.7150988299635624 and parameters: {'n_estimators': 262, 'learning_rate': 0.10307368823115903, 'max_depth': 8, 'subsample': 0.5665642529489734, 'colsample_bytree': 0.6194220704186375, 'reg_alpha': 0.5126035477065661, 'reg_lambda': 0.586066932168332}. Best is trial 0 with value: 0.7150988299635624.
[I 2025-02-24 14:22:53,700] Trial 1 finished with value: 0.7081595452156478 and parameters: {'n_estimators': 493, 'learning_rate': 0.07312723078783483, 'max_depth': 10, 'subsample': 0.7131103461538792, 'colsample_bytree': 0.6724431237853012, 'reg_alpha': 0.5293877953170274, 'reg_lambda': 1.4747587909863935}. Best is trial 0 with value: 0.7150988299635624.
[I 2025-02-24 14:22:57,088] Trial 2 finished with value: 0.6992887026140168 and parameters: {'n_estimators': 113, 'learning_rate': 0.2599097708149919, 'max_depth': 6, 'subs

Best Hyperparameters: {'n_estimators': 314, 'learning_rate': 0.050211457741447654, 'max_depth': 10, 'subsample': 0.9891736635930476, 'colsample_bytree': 0.7237150781374093, 'reg_alpha': 0.8255200485346037, 'reg_lambda': 1.8995621653010517}


['xgb_model_optimized_tfidf.pkl']

In [59]:
import pandas as pd
from joblib import load

xgb_model = load("xgb_model_optimized_tfidf.pkl")

y_test_preds = xgb_model.predict(X_test_tfidf)

submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})
submission.to_csv("xgboost_submission_optimized_tfidf.csv", index=False)

print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       1
3   9       0
4  11       1


# Count Vectorizer XGBoost Model

In [60]:
X_train_count = load("model_train_count.pkl")
y_train_count = load("target.pkl")

X_test_count = load("model_test_count.pkl")

# Split
X_train_c, X_val_c, y_train_c, y_val_c = train_test_split(X_train_count, y_train_count, test_size=0.2, random_state=42)

# Train model
xgb_model.fit(X_train_c, y_train_c,
              eval_set=[(X_val_c, y_val_c)],
              verbose=True)

# predictions on the validation set
y_val_preds_c = xgb_model.predict(X_val_c)

f1 = f1_score(y_val_c, y_val_preds_c)

model_path = "xgb_model_count.pkl" #save
dump(xgb_model, model_path)

# Return F1
f1

[0]	validation_0-logloss:0.67064
[1]	validation_0-logloss:0.66009
[2]	validation_0-logloss:0.65211
[3]	validation_0-logloss:0.64390
[4]	validation_0-logloss:0.63584
[5]	validation_0-logloss:0.62928
[6]	validation_0-logloss:0.62480
[7]	validation_0-logloss:0.61831
[8]	validation_0-logloss:0.61494
[9]	validation_0-logloss:0.60902
[10]	validation_0-logloss:0.60361
[11]	validation_0-logloss:0.59875
[12]	validation_0-logloss:0.59458
[13]	validation_0-logloss:0.59025
[14]	validation_0-logloss:0.58625
[15]	validation_0-logloss:0.58270
[16]	validation_0-logloss:0.58046
[17]	validation_0-logloss:0.57761
[18]	validation_0-logloss:0.57514
[19]	validation_0-logloss:0.57235
[20]	validation_0-logloss:0.56975
[21]	validation_0-logloss:0.56724
[22]	validation_0-logloss:0.56504
[23]	validation_0-logloss:0.56372
[24]	validation_0-logloss:0.56157
[25]	validation_0-logloss:0.55977
[26]	validation_0-logloss:0.55794
[27]	validation_0-logloss:0.55633
[28]	validation_0-logloss:0.55480
[29]	validation_0-loglos

0.7198067632850241

In [61]:
#Predictions for Submission
y_test_preds_c = xgb_model.predict(X_test_count)

# Create Submission DataFrame
submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds_c})

# Save Submission File
submission.to_csv("xgboost_count_submission.csv", index=False)

# Print first few rows for verification
print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       1
3   9       0
4  11       1


# Count Vectorizer Model With Optuna Hyperparameter Optimization

In [62]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0)
    }

    model = XGBClassifier(**params, eval_metric='logloss')
    f1 = cross_val_score(model, X_train_c, y_train_c, cv=3, scoring='f1').mean()
    
    return f1

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Best parameters
print("Best Hyperparameters:", study.best_params)

best_params = study.best_params
xgb_model = XGBClassifier(**best_params, eval_metric='logloss')
xgb_model.fit(X_train_c, y_train_c)

dump(xgb_model, "xgb_model_optimized_count.pkl")

[I 2025-02-24 14:25:48,178] A new study created in memory with name: no-name-b8e0a508-4139-4786-8c22-5e96e3c09d45
[I 2025-02-24 14:25:52,128] Trial 0 finished with value: 0.7119400205753651 and parameters: {'n_estimators': 174, 'learning_rate': 0.2789286926674607, 'max_depth': 8, 'subsample': 0.637137159220494, 'colsample_bytree': 0.6438740827009533, 'reg_alpha': 0.8105044747887631, 'reg_lambda': 3.713242306781134}. Best is trial 0 with value: 0.7119400205753651.
[I 2025-02-24 14:25:56,235] Trial 1 finished with value: 0.7134620034727607 and parameters: {'n_estimators': 286, 'learning_rate': 0.09650980532869877, 'max_depth': 4, 'subsample': 0.857836208156348, 'colsample_bytree': 0.8800688499906892, 'reg_alpha': 0.8677865015801322, 'reg_lambda': 1.0841117102529445}. Best is trial 1 with value: 0.7134620034727607.
[I 2025-02-24 14:25:57,846] Trial 2 finished with value: 0.693370780330698 and parameters: {'n_estimators': 102, 'learning_rate': 0.13364660294138603, 'max_depth': 4, 'subsampl

Best Hyperparameters: {'n_estimators': 355, 'learning_rate': 0.05507949082153511, 'max_depth': 9, 'subsample': 0.8014383711577121, 'colsample_bytree': 0.6616298293018448, 'reg_alpha': 0.6337273665497689, 'reg_lambda': 0.6413851065087809}


['xgb_model_optimized_count.pkl']

In [63]:
import pandas as pd
from joblib import load

xgb_model = load("xgb_model_optimized_count.pkl")

y_test_preds = xgb_model.predict(X_test_count)


submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})
submission.to_csv("xgboost_submission_optimized_count.csv", index=False)

print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       1
1   2       1
2   3       1
3   9       0
4  11       1
