# TFIDF MODEL

In [7]:
from joblib import load, dump
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
import os

np.random.seed(478)
X_train_tfidf = load("../pkl_files/model_train_tfidf.pkl")
y_train_tfidf = load("../pkl_files/target.pkl")

X_test_tfidf = load("../pkl_files/model_test_tfidf.pkl")

test_data = pd.read_csv("../Data/test.csv") 

# Split 80% train 20% validation
X_train, X_val, y_train, y_val = train_test_split(X_train_tfidf, y_train_tfidf, test_size=0.2, random_state=42)

xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1.0,
    eval_metric='logloss', 
    early_stopping_rounds=50,
)

# Train model
xgb_model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              verbose=True)

# predictions on the val set
y_val_preds = xgb_model.predict(X_val)

f1 = f1_score(y_val, y_val_preds)


model_path = "../pkl_files/xgb_model_tfidf.pkl" #Save
dump(xgb_model, model_path)

 
f1 # return F1

[0]	validation_0-logloss:0.66106
[1]	validation_0-logloss:0.64428
[2]	validation_0-logloss:0.62967
[3]	validation_0-logloss:0.61931
[4]	validation_0-logloss:0.60922
[5]	validation_0-logloss:0.60114
[6]	validation_0-logloss:0.59487
[7]	validation_0-logloss:0.58803
[8]	validation_0-logloss:0.58332
[9]	validation_0-logloss:0.57904
[10]	validation_0-logloss:0.57497
[11]	validation_0-logloss:0.57103
[12]	validation_0-logloss:0.56835
[13]	validation_0-logloss:0.56594
[14]	validation_0-logloss:0.56344
[15]	validation_0-logloss:0.56139
[16]	validation_0-logloss:0.55924
[17]	validation_0-logloss:0.55755
[18]	validation_0-logloss:0.55547
[19]	validation_0-logloss:0.55292
[20]	validation_0-logloss:0.55142
[21]	validation_0-logloss:0.54932
[22]	validation_0-logloss:0.54782
[23]	validation_0-logloss:0.54652
[24]	validation_0-logloss:0.54522
[25]	validation_0-logloss:0.54422
[26]	validation_0-logloss:0.54337
[27]	validation_0-logloss:0.54175
[28]	validation_0-logloss:0.54065
[29]	validation_0-loglos

0.7105691056910569

In [8]:
y_test_preds = xgb_model.predict(X_test_tfidf) # Predictions for submission

test_data_filtered = test_data.iloc[:len(y_test_preds)].copy()

submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})

submission.to_csv("../csv_files/xgboost_tfidf_submission.csv", index=False)

print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       0
3   9       0
4  11       1


# TFIDF OPTIMIZED

In [9]:
import optuna
from sklearn.model_selection import cross_val_score
def objective(trial):
    # Define search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0)
    }

    model = XGBClassifier(**params, eval_metric='logloss')
    f1 = cross_val_score(model, X_train, y_train, cv=3, scoring='f1').mean()
    
    return f1

# Optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

print("Best Hyperparameters:", study.best_params)

# Train final model using best parameters
best_params = study.best_params
xgb_model = XGBClassifier(**best_params, eval_metric='logloss')
xgb_model.fit(X_train_tfidf, y_train_tfidf)

dump(xgb_model, "../pkl_files/xgb_model_optimized_tfidf.pkl")# Save

[I 2025-02-24 15:20:44,939] A new study created in memory with name: no-name-2fb1dcea-2306-432b-92c4-0a1ddd689a2b
[I 2025-02-24 15:20:51,692] Trial 0 finished with value: 0.7036094559504414 and parameters: {'n_estimators': 315, 'learning_rate': 0.07263317566886052, 'max_depth': 4, 'subsample': 0.6367246900591977, 'colsample_bytree': 0.6331887145819868, 'reg_alpha': 0.6034417186378427, 'reg_lambda': 2.0250970306905907}. Best is trial 0 with value: 0.7036094559504414.
[I 2025-02-24 15:21:05,212] Trial 1 finished with value: 0.7073616791449364 and parameters: {'n_estimators': 233, 'learning_rate': 0.22818474703744268, 'max_depth': 10, 'subsample': 0.8789223338144208, 'colsample_bytree': 0.8722310497881571, 'reg_alpha': 0.296505867756772, 'reg_lambda': 0.5483318836220363}. Best is trial 1 with value: 0.7073616791449364.
[I 2025-02-24 15:21:20,077] Trial 2 finished with value: 0.7022571820730535 and parameters: {'n_estimators': 443, 'learning_rate': 0.2477242877334673, 'max_depth': 5, 'subs

Best Hyperparameters: {'n_estimators': 286, 'learning_rate': 0.056123158458219115, 'max_depth': 9, 'subsample': 0.7784513412480358, 'colsample_bytree': 0.7152643216884059, 'reg_alpha': 0.39683245278601886, 'reg_lambda': 1.3171342104733543}


['../pkl_files/xgb_model_optimized_tfidf.pkl']

In [10]:
xgb_model = load("../pkl_files/xgb_model_optimized_tfidf.pkl")

y_test_preds = xgb_model.predict(X_test_tfidf)

submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})
submission.to_csv("../csv_files/xgboost_submission_optimized_tfidf.csv", index=False)

print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       1
3   9       0
4  11       1


# COUNT VECTORIZER MODEL

In [11]:
X_train_count = load("../pkl_files/model_train_count.pkl")
y_train_count = load("../pkl_files/target.pkl")

X_test_count = load("../pkl_files/model_test_count.pkl")

# Split
X_train_c, X_val_c, y_train_c, y_val_c = train_test_split(X_train_count, y_train_count, test_size=0.2, random_state=42)

# Train model
xgb_model.fit(X_train_c, y_train_c,
              eval_set=[(X_val_c, y_val_c)],
              verbose=True)

# predictions on the validation set
y_val_preds_c = xgb_model.predict(X_val_c)

f1 = f1_score(y_val_c, y_val_preds_c)

model_path = "../pkl_files/xgb_model_count.pkl" #save
dump(xgb_model, model_path)

# Return F1
f1

[0]	validation_0-logloss:0.66918
[1]	validation_0-logloss:0.65760
[2]	validation_0-logloss:0.64880
[3]	validation_0-logloss:0.64033
[4]	validation_0-logloss:0.63188
[5]	validation_0-logloss:0.62449
[6]	validation_0-logloss:0.61972
[7]	validation_0-logloss:0.61249
[8]	validation_0-logloss:0.60863
[9]	validation_0-logloss:0.60277
[10]	validation_0-logloss:0.59710
[11]	validation_0-logloss:0.59205
[12]	validation_0-logloss:0.58774
[13]	validation_0-logloss:0.58371
[14]	validation_0-logloss:0.57985
[15]	validation_0-logloss:0.57640
[16]	validation_0-logloss:0.57398
[17]	validation_0-logloss:0.57118
[18]	validation_0-logloss:0.56849
[19]	validation_0-logloss:0.56588
[20]	validation_0-logloss:0.56329
[21]	validation_0-logloss:0.56083
[22]	validation_0-logloss:0.55860
[23]	validation_0-logloss:0.55669
[24]	validation_0-logloss:0.55495
[25]	validation_0-logloss:0.55300
[26]	validation_0-logloss:0.55115
[27]	validation_0-logloss:0.55003
[28]	validation_0-logloss:0.54843
[29]	validation_0-loglos

0.7161290322580646

In [12]:
#Predictions for Submission
y_test_preds_c = xgb_model.predict(X_test_count)

# Create Submission DataFrame
submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds_c})

# Save Submission File
submission.to_csv("../csv_files/xgboost_count_submission.csv", index=False)

# Print first few rows for verification
print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       1
1   2       1
2   3       1
3   9       0
4  11       1


# COUNT VECTORIZER OPTIMIZED

In [13]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5.0)
    }

    model = XGBClassifier(**params, eval_metric='logloss')
    f1 = cross_val_score(model, X_train_c, y_train_c, cv=3, scoring='f1').mean()
    
    return f1

# Optimize with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Best parameters
print("Best Hyperparameters:", study.best_params)

best_params = study.best_params
xgb_model = XGBClassifier(**best_params, eval_metric='logloss')
xgb_model.fit(X_train_c, y_train_c)

dump(xgb_model, "../pkl_files/xgb_model_optimized_count.pkl")

[I 2025-02-24 15:26:10,139] A new study created in memory with name: no-name-9468c5e6-b548-4579-9da3-0bc47ef2c78f
[I 2025-02-24 15:26:15,538] Trial 0 finished with value: 0.7138454078947424 and parameters: {'n_estimators': 158, 'learning_rate': 0.2299293290126249, 'max_depth': 7, 'subsample': 0.8428736954912669, 'colsample_bytree': 0.8671639462157108, 'reg_alpha': 0.8135531602010828, 'reg_lambda': 3.0328280009149404}. Best is trial 0 with value: 0.7138454078947424.
[I 2025-02-24 15:26:32,048] Trial 1 finished with value: 0.7048645722572825 and parameters: {'n_estimators': 382, 'learning_rate': 0.15363480909974267, 'max_depth': 9, 'subsample': 0.7331949723881122, 'colsample_bytree': 0.7530441210577059, 'reg_alpha': 0.8941845470842565, 'reg_lambda': 2.504824976570476}. Best is trial 0 with value: 0.7138454078947424.
[I 2025-02-24 15:26:35,179] Trial 2 finished with value: 0.7156279014876977 and parameters: {'n_estimators': 208, 'learning_rate': 0.23659573516620258, 'max_depth': 5, 'subsa

Best Hyperparameters: {'n_estimators': 260, 'learning_rate': 0.2990938648240929, 'max_depth': 6, 'subsample': 0.9878636542786724, 'colsample_bytree': 0.6478015464987021, 'reg_alpha': 0.6465576519858698, 'reg_lambda': 1.0436671319531827}


['../pkl_files/xgb_model_optimized_count.pkl']

In [14]:
xgb_model = load("../pkl_files/xgb_model_optimized_count.pkl")

y_test_preds = xgb_model.predict(X_test_count)


submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})
submission.to_csv("../csv_files/xgboost_submission_optimized_count.csv", index=False)

print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       1
1   2       1
2   3       1
3   9       0
4  11       1
