# TFIDF XGBoost Model

In [23]:
from joblib import load, dump
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import pandas as pd
import os

# ---------------------------------------------------------------------------
# TF-IDF pipeline: load persisted features, hold out a validation split,
# fit an XGBoost classifier, score it with F1 and persist the fitted model.
# ---------------------------------------------------------------------------

# Persisted TF-IDF feature matrices and the training target.
# NOTE(review): joblib.load unpickles its input, so only load files you trust.
X_train_tfidf = load("model_train_tfidf.pkl")
y_train_tfidf = load("target.pkl")
X_test_tfidf = load("model_test_tfidf.pkl")

# The raw test CSV is read here only so its 'id' column is available later.
test_data = pd.read_csv("Data/test.csv")

# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split identical from run to run.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tfidf,
    y_train_tfidf,
    test_size=0.2,
    random_state=42,
)

# Initial (untuned) hyperparameters, gathered in one place for readability.
xgb_tfidf_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1.0,
    eval_metric='logloss',
)
xgb_model = XGBClassifier(**xgb_tfidf_params)

# Fit on the training portion only.
xgb_model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_val_preds = xgb_model.predict(X_val)
f1 = f1_score(y_val, y_val_preds)

# Persist the fitted estimator next to the notebook.
model_path = "xgb_model_tfidf.pkl"
dump(xgb_model, model_path)

# Bare expression so the notebook displays the validation F1 as the cell output.
f1

0.7069243156199678

In [24]:
# Generate Predictions for Submission
y_test_preds = xgb_model.predict(X_test_tfidf)

# Predictions are paired with ids purely by row position, so the feature
# matrix must contain exactly one row per row of test.csv. Truncating
# test_data to the prediction count (the previous behaviour) would silently
# attach ids to the wrong predictions whenever rows had been dropped anywhere
# but the very end, so fail loudly instead of writing a corrupt file.
if len(y_test_preds) != len(test_data):
    raise ValueError(
        f"Row count mismatch: {len(y_test_preds)} predictions vs "
        f"{len(test_data)} rows in Data/test.csv; cannot align ids by position."
    )

# Independent copy, so later edits to this frame never touch test_data.
test_data_filtered = test_data.copy()

# Create Submission DataFrame
submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})

# Save Submission File
submission.to_csv("xgboost_tfidf_submission.csv", index=False)

# Print first few rows for verification
print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       0
1   2       1
2   3       1
3   9       0
4  11       1


# Count Vectorizer XGBoost Model

In [25]:
# ---------------------------------------------------------------------------
# Count-vectorizer pipeline: load persisted features, hold out a validation
# split, fit an XGBoost classifier, score it with F1 and persist the model.
# ---------------------------------------------------------------------------

# Persisted count-vectorizer feature matrices and the training target.
# NOTE(review): joblib.load unpickles its input, so only load files you trust.
X_train_count = load("model_train_count.pkl")
y_train_count = load("target.pkl")
X_test_count = load("model_test_count.pkl")

# The raw test CSV is read here only so its 'id' column is available later.
test_data = pd.read_csv("Data/test.csv")

# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split identical from run to run.
X_train_c, X_val_c, y_train_c, y_val_c = train_test_split(
    X_train_count,
    y_train_count,
    test_size=0.2,
    random_state=42,
)

# Initial (untuned) hyperparameters, gathered in one place for readability.
xgb_count_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1.0,
    eval_metric='logloss',
)
# NOTE: this rebinds xgb_model; from here on the name refers to the
# count-vectorizer model built in this cell.
xgb_model = XGBClassifier(**xgb_count_params)

# Fit on the training portion only.
xgb_model.fit(X_train_c, y_train_c)

# Evaluate on the held-out rows.
y_val_preds_c = xgb_model.predict(X_val_c)
f1 = f1_score(y_val_c, y_val_preds_c)

# Persist the fitted estimator next to the notebook.
model_path = "xgb_model_count.pkl"
dump(xgb_model, model_path)

# Bare expression so the notebook displays the validation F1 as the cell output.
f1

0.719482619240097

In [26]:
# Generate Predictions for Submission
y_test_preds_c = xgb_model.predict(X_test_count)

# Predictions are paired with ids purely by row position, so X_test_count
# must contain exactly one row per row of test.csv. Truncating test_data to
# the prediction count (the previous behaviour) would silently attach ids to
# the wrong predictions whenever rows had been dropped anywhere but the very
# end, so fail loudly instead of writing a corrupt file.
if len(y_test_preds_c) != len(test_data):
    raise ValueError(
        f"Row count mismatch: {len(y_test_preds_c)} predictions vs "
        f"{len(test_data)} rows in Data/test.csv; cannot align ids by position."
    )

# Independent copy, so later edits to this frame never touch test_data.
test_data_filtered = test_data.copy()

# Create Submission DataFrame
submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds_c})

# Save Submission File
submission.to_csv("xgboost_count_submission.csv", index=False)

# Print first few rows for verification
print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       1
1   2       1
2   3       1
3   9       0
4  11       1
