# TFIDF XGBoost Model

In [11]:
from joblib import load, dump
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
import os

# Inputs: joblib-serialized TF-IDF train/test matrices and training labels,
# plus the raw test CSV (read only so its 'id' column is available later).
X_train_tfidf = load("model_train_tfidf.pkl")
y_train_tfidf = load("target.pkl")
X_test_tfidf = load("model_test_tfidf.pkl")
test_data = pd.read_csv("Data/test.csv")

# Hold out 20% of the labelled rows for validation; random_state pins the split.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tfidf,
    y_train_tfidf,
    test_size=0.2,
    random_state=42,
)

# Baseline (untuned) hyperparameters for the first XGBoost model.
tfidf_baseline_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1.0,
    eval_metric='logloss',
)
xgb_model = XGBClassifier(**tfidf_baseline_params)

# Fit on the 80% training split only.
xgb_model.fit(X_train, y_train)

# Score the held-out 20% with F1.
y_val_preds = xgb_model.predict(X_val)
f1 = f1_score(y_val, y_val_preds)

# Persist the fitted baseline model next to the notebook.
model_path = "xgb_model_tfidf.pkl"
dump(xgb_model, model_path)

# Bare last expression so the notebook displays the validation F1.
f1

0.7069243156199678

In [12]:
# Predict test-set labels with whichever model `xgb_model` currently refers to.
y_test_preds = xgb_model.predict(X_test_tfidf)

# Keep only as many rows of test_data as there are predictions.
# NOTE(review): this assumes any surplus rows sit at the end of test.csv and
# that row order matches X_test_tfidf - confirm before trusting the ids.
n_preds = len(y_test_preds)
test_data_filtered = test_data.head(n_preds).copy()

# Build the two-column submission table and write it without the index.
submission = pd.DataFrame(
    {
        'id': test_data_filtered['id'],
        'target': y_test_preds,
    }
)
submission.to_csv("xgboost_tfidf_submission.csv", index=False)

# Show the first rows as a quick sanity check.
print("Submission file preview:", submission.head(), sep="\n")

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       1
3   9       0
4  11       1


# TFIDF Model With Optuna Hyperparameter Optimization

In [13]:
import optuna
from sklearn.model_selection import cross_val_score
# Seed shared by the Optuna sampler and the XGBoost models so the search is
# repeatable. The earlier `set.seed(478)` was R syntax: in Python `set` is the
# builtin type and has no `seed` attribute, so the call raised AttributeError.
SEED = 478


def objective(trial):
    """Score one Optuna trial on the TF-IDF training split.

    Samples an XGBoost hyperparameter set from the search space below and
    evaluates it with 3-fold cross-validation on the module-level
    ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean F1 over the 3 cross-validation folds (the study maximizes it).
    """
    # Define search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0)
    }

    # random_state pins the row/column subsampling done inside XGBoost.
    model = XGBClassifier(**params, eval_metric='logloss', random_state=SEED)
    f1 = cross_val_score(model, X_train, y_train, cv=3, scoring='f1').mean()

    return f1


# Optimize with Optuna; a seeded sampler makes the suggested trials repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=20)

# Best parameters
print("Best Hyperparameters:", study.best_params)

# Train final model using best parameters on the full labelled TF-IDF set
# (training split + validation split).
best_params = study.best_params
xgb_model = XGBClassifier(**best_params, eval_metric='logloss', random_state=SEED)
xgb_model.fit(X_train_tfidf, y_train_tfidf)

# Save the optimized model
dump(xgb_model, "xgb_model_optimized_tfidf.pkl")


[I 2025-02-19 16:16:51,997] A new study created in memory with name: no-name-ac916f15-13c1-412d-8ff4-89568df89558
[I 2025-02-19 16:16:56,398] Trial 0 finished with value: 0.677411206286573 and parameters: {'n_estimators': 218, 'learning_rate': 0.019632611406732395, 'max_depth': 5, 'subsample': 0.743051720575016, 'colsample_bytree': 0.6217964265129498, 'reg_alpha': 0.6892425127781228, 'reg_lambda': 4.551123878031604}. Best is trial 0 with value: 0.677411206286573.
[I 2025-02-19 16:17:12,239] Trial 1 finished with value: 0.6906221608915578 and parameters: {'n_estimators': 400, 'learning_rate': 0.23754863954927946, 'max_depth': 9, 'subsample': 0.6194251776704998, 'colsample_bytree': 0.6369168862727841, 'reg_alpha': 0.44478075028015673, 'reg_lambda': 4.829031693908829}. Best is trial 1 with value: 0.6906221608915578.
[I 2025-02-19 16:17:16,560] Trial 2 finished with value: 0.7111304805509343 and parameters: {'n_estimators': 149, 'learning_rate': 0.16545866320504052, 'max_depth': 6, 'subsam

Best Hyperparameters: {'n_estimators': 289, 'learning_rate': 0.15587134731818175, 'max_depth': 7, 'subsample': 0.862998612646322, 'colsample_bytree': 0.8259252447584499, 'reg_alpha': 0.2372918342531305, 'reg_lambda': 0.9282859574138145}


['xgb_model_optimized_tfidf.pkl']

In [18]:
import pandas as pd
from joblib import load

# Reload the tuned TF-IDF model and the TF-IDF test matrix from disk so this
# cell works from the saved artifacts rather than from in-memory objects.
xgb_model = load("xgb_model_optimized_tfidf.pkl")
X_test_tfidf = load("model_test_tfidf.pkl")

# Predict a label for every row of the test matrix.
y_test_preds = xgb_model.predict(X_test_tfidf)

# Re-read the raw test CSV for its 'id' column and keep only as many rows as
# there are predictions.
# NOTE(review): assumes row order in test.csv matches X_test_tfidf - confirm.
test_data = pd.read_csv("Data/test.csv")
n_preds = len(y_test_preds)
test_data_filtered = test_data.head(n_preds).copy()

# Build the submission table and write it without the index column.
submission = pd.DataFrame(
    {
        'id': test_data_filtered['id'],
        'target': y_test_preds,
    }
)
submission.to_csv("xgboost_submission_optimized_tfidf.csv", index=False)

# Show the first rows as a quick sanity check.
print("Submission file preview:", submission.head(), sep="\n")

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       1
3   9       0
4  11       1


# Count Vectorizer XGBoost Model

In [14]:
# Inputs: joblib-serialized count-vectorizer train/test matrices and training
# labels, plus the raw test CSV (read only so its 'id' column is available later).
X_train_count = load("model_train_count.pkl")
y_train_count = load("target.pkl")
X_test_count = load("model_test_count.pkl")
test_data = pd.read_csv("Data/test.csv")

# Hold out 20% of the labelled rows for validation; random_state pins the split.
X_train_c, X_val_c, y_train_c, y_val_c = train_test_split(
    X_train_count,
    y_train_count,
    test_size=0.2,
    random_state=42,
)

# Baseline (untuned) hyperparameters, identical to the TF-IDF baseline's.
count_baseline_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1.0,
    eval_metric='logloss',
)
xgb_model = XGBClassifier(**count_baseline_params)

# Fit on the 80% training split only.
xgb_model.fit(X_train_c, y_train_c)

# Score the held-out 20% with F1.
y_val_preds_c = xgb_model.predict(X_val_c)
f1 = f1_score(y_val_c, y_val_preds_c)

# Persist the fitted baseline model next to the notebook.
model_path = "xgb_model_count.pkl"
dump(xgb_model, model_path)

# Bare last expression so the notebook displays the validation F1.
f1

0.719482619240097

In [15]:
# Predict test-set labels from the count-vectorizer features with whichever
# model `xgb_model` currently refers to.
y_test_preds_c = xgb_model.predict(X_test_count)

# Keep only as many rows of test_data as there are predictions for X_test_count.
# NOTE(review): this assumes any surplus rows sit at the end of test.csv and
# that row order matches X_test_count - confirm before trusting the ids.
n_preds_c = len(y_test_preds_c)
test_data_filtered = test_data.head(n_preds_c).copy()

# Build the two-column submission table and write it without the index.
submission = pd.DataFrame(
    {
        'id': test_data_filtered['id'],
        'target': y_test_preds_c,
    }
)
submission.to_csv("xgboost_count_submission.csv", index=False)

# Show the first rows as a quick sanity check.
print("Submission file preview:", submission.head(), sep="\n")

Submission file preview:
   id  target
0   0       1
1   2       1
2   3       1
3   9       0
4  11       1


# Count Vectorizer Model With Optuna Hyperparameter Optimization

In [21]:
# Fixed seed for the Optuna sampler and the XGBoost models so that re-running
# this cell reproduces the same search and the same saved model.
SEED = 478


def objective(trial):
    """Score one Optuna trial on the count-vectorizer training split.

    NOTE: this redefines the notebook-level name ``objective``; after this
    cell runs, ``objective`` evaluates on ``X_train_c`` / ``y_train_c``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean F1 over the 3 cross-validation folds (the study maximizes it).
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0)
    }

    # random_state pins the row/column subsampling done inside XGBoost.
    model = XGBClassifier(**params, eval_metric='logloss', random_state=SEED)
    f1 = cross_val_score(model, X_train_c, y_train_c, cv=3, scoring='f1').mean()

    return f1


# Optimize with Optuna; a seeded sampler makes the suggested trials repeatable.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=20)

# Best parameters
print("Best Hyperparameters:", study.best_params)

# Train the final model on ALL labelled count-vectorizer rows (not only the
# 80% split used for tuning), the same way the optimized TF-IDF model is fit
# on X_train_tfidf / y_train_tfidf.
best_params = study.best_params
xgb_model = XGBClassifier(**best_params, eval_metric='logloss', random_state=SEED)
xgb_model.fit(X_train_count, y_train_count)

# Save the optimized model
dump(xgb_model, "xgb_model_optimized_count.pkl")

[I 2025-02-19 16:33:30,143] A new study created in memory with name: no-name-7c61a896-c655-4a35-a036-8a2a3bc6bf28
[I 2025-02-19 16:33:35,294] Trial 0 finished with value: 0.7048207566362833 and parameters: {'n_estimators': 324, 'learning_rate': 0.03260404183588316, 'max_depth': 6, 'subsample': 0.8087635913896936, 'colsample_bytree': 0.8853897625559173, 'reg_alpha': 0.5597749354847169, 'reg_lambda': 2.1727844095531204}. Best is trial 0 with value: 0.7048207566362833.
[I 2025-02-19 16:33:39,155] Trial 1 finished with value: 0.7053393624328775 and parameters: {'n_estimators': 386, 'learning_rate': 0.03506397512669123, 'max_depth': 4, 'subsample': 0.9323570087697401, 'colsample_bytree': 0.7297579072628659, 'reg_alpha': 0.8803378155994889, 'reg_lambda': 0.054430905773785176}. Best is trial 1 with value: 0.7053393624328775.
[I 2025-02-19 16:33:44,198] Trial 2 finished with value: 0.7197301443932876 and parameters: {'n_estimators': 305, 'learning_rate': 0.10177518057726331, 'max_depth': 7, 's

Best Hyperparameters: {'n_estimators': 490, 'learning_rate': 0.1650885294803864, 'max_depth': 9, 'subsample': 0.9999729126463341, 'colsample_bytree': 0.599342143572356, 'reg_alpha': 0.2946488601398795, 'reg_lambda': 3.1010803110061276}


['xgb_model_optimized_count.pkl']

In [22]:
import pandas as pd
from joblib import load

# Reload the tuned count-vectorizer model and its matching test matrix from
# disk. The matrix is bound to X_test_count (the name used for prediction
# below) so the cell no longer relies on a variable left over from an earlier
# cell, and no longer overwrites X_test_tfidf with count features.
xgb_model = load("xgb_model_optimized_count.pkl")
X_test_count = load("model_test_count.pkl")

# Predict a label for every row of the count-vectorizer test matrix.
y_test_preds = xgb_model.predict(X_test_count)

# Re-read the raw test CSV for its 'id' column and keep only as many rows as
# there are predictions.
# NOTE(review): assumes row order in test.csv matches X_test_count - confirm.
test_data = pd.read_csv("Data/test.csv")
test_data_filtered = test_data.iloc[:len(y_test_preds)].copy()

# Build the submission table and write it without the index column.
submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})
submission.to_csv("xgboost_submission_optimized_count.csv", index=False)

# Show the first rows as a quick sanity check.
print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       1
3   9       0
4  11       1
