In [12]:
import os
import warnings

import pandas as pd
from joblib import load, dump
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Load the pre-computed TF-IDF artifacts (paths are relative to the working directory)
X_train_tfidf, y_train_tfidf, X_test_tfidf = (
    load(artifact_path)
    for artifact_path in (
        "X_train_tfidf.joblib",
        "y_train_tfidf.joblib",
        "X_test_tfidf.joblib",
    )
)

# The raw test CSV is read here because its 'id' column is needed for the submission
test_data = pd.read_csv("Data/test.csv")

# Hold out 20% of the labelled rows for validation; the seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tfidf,
    y_train_tfidf,
    test_size=0.2,
    random_state=42,
)

# Initial (untuned) XGBoost hyperparameters, gathered in one place for easy editing
xgb_params = dict(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.9,
    colsample_bytree=0.9,
    scale_pos_weight=1.0,
    eval_metric='logloss',
)
xgb_model = XGBClassifier(**xgb_params)

# Fit on the training portion only
xgb_model.fit(X_train, y_train)

# Score the held-out portion with F1
y_val_preds = xgb_model.predict(X_val)
f1 = f1_score(y_true=y_val, y_pred=y_val_preds)

# Persist the fitted model next to the notebook
model_path = "xgb_model.joblib"
dump(xgb_model, filename=model_path)

# Last expression of the cell: display the validation F1
f1

0.6782133090246126

In [11]:
# Generate predictions for the submission file.
# NOTE: this cell relies on `xgb_model`, `X_test_tfidf` and `test_data` created by the
# training cell, so that cell must be run first on a fresh kernel.
y_test_preds = xgb_model.predict(X_test_tfidf)

# Ids are paired with predictions purely by row position, so the row counts matter.
n_preds = len(y_test_preds)
n_rows = len(test_data)
if n_rows < n_preds:
    # Previously this surfaced as an opaque length error inside the DataFrame constructor.
    raise ValueError(
        f"test_data has {n_rows} rows but {n_preds} predictions were produced; "
        "cannot assign an id to every prediction."
    )
if n_rows > n_preds:
    # Keep the original best-effort truncation, but make it visible instead of silent.
    warnings.warn(
        f"test_data has {n_rows} rows but only {n_preds} predictions were produced; "
        f"keeping the first {n_preds} ids. If rows were dropped anywhere other than the "
        "end when the features were built, ids and predictions are misaligned."
    )

# Trim test_data to the number of predictions (a no-op when the counts already match)
test_data_filtered = test_data.iloc[:n_preds].copy()

# Build the submission frame: one (id, target) pair per test row
submission = pd.DataFrame({'id': test_data_filtered['id'], 'target': y_test_preds})

# Write the submission file without the DataFrame index
submission.to_csv("xgboost_tfidf_submission.csv", index=False)

# Show the first few rows for a quick sanity check
print("Submission file preview:")
print(submission.head())

Submission file preview:
   id  target
0   0       0
1   2       0
2   3       1
3   9       0
4  11       0
