# TFIDF MODEL

In [78]:
from joblib import load, dump
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import os

np.random.seed(478)
X_train_tfidf = load("../pkl_files/model_train_tfidf.pkl")
y_train_tfidf = load("../pkl_files/target.pkl")

X_test_tfidf = load("../pkl_files/model_test_tfidf.pkl")

test_data = pd.read_csv("../Data/test.csv") 

# Split 80% train 20% validation
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, y_train_tfidf, test_size=0.2, random_state=42)

xgb_model = XGBClassifier(
    n_estimators=800,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1.0,
    eval_metric='logloss', 
    early_stopping_rounds=50,
)

# Train model
xgb_model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              verbose=True)

# predictions on the val set
y_val_preds = xgb_model.predict(X_val)

f1 = f1_score(y_val, y_val_preds)


model_path = "../pkl_files/xgb_model_tfidf.pkl" #Save
dump(xgb_model, model_path)

 
f1 # return F1

[0]	validation_0-logloss:0.66106
[1]	validation_0-logloss:0.64428
[2]	validation_0-logloss:0.62967
[3]	validation_0-logloss:0.61931
[4]	validation_0-logloss:0.60922
[5]	validation_0-logloss:0.60114
[6]	validation_0-logloss:0.59487
[7]	validation_0-logloss:0.58803
[8]	validation_0-logloss:0.58332
[9]	validation_0-logloss:0.57904
[10]	validation_0-logloss:0.57497
[11]	validation_0-logloss:0.57103
[12]	validation_0-logloss:0.56835
[13]	validation_0-logloss:0.56594
[14]	validation_0-logloss:0.56344
[15]	validation_0-logloss:0.56139
[16]	validation_0-logloss:0.55924
[17]	validation_0-logloss:0.55755
[18]	validation_0-logloss:0.55547
[19]	validation_0-logloss:0.55292
[20]	validation_0-logloss:0.55142
[21]	validation_0-logloss:0.54932
[22]	validation_0-logloss:0.54782
[23]	validation_0-logloss:0.54652
[24]	validation_0-logloss:0.54522
[25]	validation_0-logloss:0.54422
[26]	validation_0-logloss:0.54337
[27]	validation_0-logloss:0.54175
[28]	validation_0-logloss:0.54065
[29]	validation_0-loglos

0.7296416938110749

In [79]:
y_test_preds = xgb_model.predict(X_test_tfidf) # Predictions for submission

test_data_filtered = test_data.iloc[:len(y_test_preds)].copy()

submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})

submission.to_csv("../csv_files/xgboost_tfidf_submission.csv", index=False)

print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       1
3   9       0
4  11       1


# TFIDF OPTIMIZED

In [80]:
import optuna
from sklearn.model_selection import cross_val_score
def objective(trial):
    # Define search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 700),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.1),
        'max_depth': trial.suggest_int('max_depth', 5, 5),
        'subsample': trial.suggest_float('subsample', 0.8, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 0.95),  
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.9, 1.0),
    }

    model = XGBClassifier(**params, eval_metric='logloss')
    f1 = cross_val_score(model, X_train, y_train, cv=10, scoring='f1').mean()
    
    return f1

# Optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print("Best Hyperparameters:", study.best_params)

# Train final model using best parameters
best_params = study.best_params
xgb_model = XGBClassifier(**best_params, eval_metric='logloss')
xgb_model.fit(X_train_tfidf, y_train_tfidf)

dump(xgb_model, "../pkl_files/xgb_model_optimized_tfidf.pkl")# Save

[I 2025-02-25 01:56:46,721] A new study created in memory with name: no-name-beec9dd3-a665-46a0-9da9-f844bed65c4c
[I 2025-02-25 01:57:35,583] Trial 0 finished with value: 0.7224481732789126 and parameters: {'n_estimators': 592, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.9454150930186873, 'colsample_bytree': 0.8753395188809447, 'scale_pos_weight': 0.9880763712973664}. Best is trial 0 with value: 0.7224481732789126.
[I 2025-02-25 01:58:30,469] Trial 1 finished with value: 0.7239069817389265 and parameters: {'n_estimators': 660, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.9417077754661733, 'colsample_bytree': 0.9357168902693248, 'scale_pos_weight': 0.9506087266821096}. Best is trial 1 with value: 0.7239069817389265.
[I 2025-02-25 01:59:26,514] Trial 2 finished with value: 0.7170307751462148 and parameters: {'n_estimators': 668, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.9353490852284938, 'colsample_bytree': 0.9287275152457559, 'scale_pos_weight': 0.95203590392

Best Hyperparameters: {'n_estimators': 660, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.9417077754661733, 'colsample_bytree': 0.9357168902693248, 'scale_pos_weight': 0.9506087266821096}


['../pkl_files/xgb_model_optimized_tfidf.pkl']

In [81]:
xgb_model = load("../pkl_files/xgb_model_optimized_tfidf.pkl")
888
y_test_preds = xgb_model.predict(X_test_tfidf)

submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})
submission.to_csv("../csv_files/xgboost_submission_optimized_tfidf.csv", index=False)

print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       1
3   9       0
4  11       1


# COUNT VECTORIZER MODEL

In [82]:
X_train_count = load("../pkl_files/model_train_count.pkl")
y_train_count = load("../pkl_files/target.pkl")

X_test_count = load("../pkl_files/model_test_count.pkl")

# Split
X_train_c, X_val_c, y_train_c, y_val_c = train_test_split(X_train_count, y_train_count, test_size=0.2, random_state=42)

# Train model
xgb_model.fit(X_train_c, y_train_c,
              eval_set=[(X_val_c, y_val_c)],
              verbose=True)

# predictions on the validation set
y_val_preds_c = xgb_model.predict(X_val_c)

f1 = f1_score(y_val_c, y_val_preds_c)

model_path = "../pkl_files/xgb_model_count.pkl" #save
dump(xgb_model, model_path)

# Return F1
f1

[0]	validation_0-logloss:0.66099
[1]	validation_0-logloss:0.64324
[2]	validation_0-logloss:0.62936
[3]	validation_0-logloss:0.61774
[4]	validation_0-logloss:0.60797
[5]	validation_0-logloss:0.59992
[6]	validation_0-logloss:0.59307
[7]	validation_0-logloss:0.58714
[8]	validation_0-logloss:0.58247
[9]	validation_0-logloss:0.57834
[10]	validation_0-logloss:0.57458
[11]	validation_0-logloss:0.57103
[12]	validation_0-logloss:0.56811
[13]	validation_0-logloss:0.56552
[14]	validation_0-logloss:0.56306
[15]	validation_0-logloss:0.56053
[16]	validation_0-logloss:0.55878
[17]	validation_0-logloss:0.55696
[18]	validation_0-logloss:0.55502
[19]	validation_0-logloss:0.55320
[20]	validation_0-logloss:0.55208
[21]	validation_0-logloss:0.54995
[22]	validation_0-logloss:0.54841
[23]	validation_0-logloss:0.54671
[24]	validation_0-logloss:0.54591
[25]	validation_0-logloss:0.54486
[26]	validation_0-logloss:0.54367
[27]	validation_0-logloss:0.54291
[28]	validation_0-logloss:0.54192
[29]	validation_0-loglos

0.7312186978297162

In [None]:
y_test_preds_c = xgb_model.predict(X_test_count)

submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds_c})

submission.to_csv("../csv_files/xgboost_count_submission.csv", index=False)

print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       1
1   2       1
2   3       1
3   9       0
4  11       1


# COUNT VECTORIZER OPTIMIZED

In [84]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 500, 700),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.1),
        'max_depth': trial.suggest_int('max_depth', 5, 5),
        'subsample': trial.suggest_float('subsample', 0.8, 0.95),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.8, 0.95),  
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.9, 1.0),
    }

    model = XGBClassifier(**params, eval_metric='logloss')
    f1 = cross_val_score(model, X_train_c, y_train_c, cv=10, scoring='f1').mean()
    
    return f1

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Best parameters
print("Best Hyperparameters:", study.best_params)

best_params = study.best_params
xgb_model = XGBClassifier(**best_params, eval_metric='logloss')
xgb_model.fit(X_train_c, y_train_c)

dump(xgb_model, "../pkl_files/xgb_model_optimized_count.pkl")

[I 2025-02-25 02:06:10,444] A new study created in memory with name: no-name-a3d3b619-2fe2-48c9-b695-7d398db7f596
[I 2025-02-25 02:06:42,100] Trial 0 finished with value: 0.7254936631043398 and parameters: {'n_estimators': 517, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.8775632973240751, 'colsample_bytree': 0.8002962137665731, 'scale_pos_weight': 0.9359328901160813}. Best is trial 0 with value: 0.7254936631043398.
[I 2025-02-25 02:07:18,338] Trial 1 finished with value: 0.7313696623302254 and parameters: {'n_estimators': 574, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.8627380732617186, 'colsample_bytree': 0.9319355214416476, 'scale_pos_weight': 0.9879800819632866}. Best is trial 1 with value: 0.7313696623302254.
[I 2025-02-25 02:07:55,501] Trial 2 finished with value: 0.7285023042281535 and parameters: {'n_estimators': 632, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.8269497229027484, 'colsample_bytree': 0.8847102330178336, 'scale_pos_weight': 0.95279652452

Best Hyperparameters: {'n_estimators': 574, 'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.8627380732617186, 'colsample_bytree': 0.9319355214416476, 'scale_pos_weight': 0.9879800819632866}


['../pkl_files/xgb_model_optimized_count.pkl']

In [85]:
xgb_model = load("../pkl_files/xgb_model_optimized_count.pkl")

y_test_preds = xgb_model.predict(X_test_count)


submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})
submission.to_csv("../csv_files/xgboost_submission_optimized_count.csv", index=False)

print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       1
1   2       1
2   3       1
3   9       0
4  11       1
