In [1]:
import gzip
import json
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, precision_recall_curve
from sklearn.metrics import auc
import numpy as np
import warnings
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer


In [2]:

# Load the training and test tables from the task1 directory.
train_df, test_df = map(
    pd.read_csv,
    ("task1/train_set.csv", "task1/test_set.csv"),
)


In [3]:
# Collapse rows that share the same (transcript_id, position, kmer, gene_id)
# key into a single row holding the mean of each feature and of the label.
#
# The column list used to be passed as `.mean([...])`. GroupBy.mean's first
# positional parameter is `numeric_only`, so the list was only interpreted as
# a truthy flag and never selected any columns. Selecting the columns
# explicitly states the intent and does not depend on that accident.
group_keys = ['transcript_id', 'position', 'kmer', 'gene_id']
pooled_cols = [f"feat{i}" for i in range(1, 10)] + ['label']

train_df = train_df.groupby(group_keys)[pooled_cols].mean().reset_index()
test_df = test_df.groupby(group_keys)[pooled_cols].mean().reset_index()

In [4]:
# Stack the pooled train and test rows, then display the ratio of
# label-0 rows to label-1 rows in the combined frame.
full_df = pd.concat([train_df, test_df])
n_negative = len(full_df[full_df['label'] == 0])
n_positive = len(full_df[full_df['label'] == 1])
n_negative / n_positive

21.25351598173516

In [5]:
# Model inputs are the columns feat1..feat9; the target is the `label` column.
feature_cols = [f"feat{i}" for i in range(1, 10)]
X_train, y_train = train_df[feature_cols], train_df["label"]
X_test, y_test = test_df[feature_cols], test_df["label"]

In [6]:

# NOTE: this installs a process-wide "ignore" filter, so every warning raised
# by later cells in this kernel is hidden as well.
warnings.filterwarnings("ignore")

# Base estimator for the hyperparameter search.
# `use_label_encoder=False` used to be passed here. Recent XGBoost releases no
# longer use that argument (it only produced an "unused parameter" warning),
# so it has been dropped.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
    eval_metric='logloss',  # metric XGBoost tracks while boosting; model selection below uses ROC AUC
    scale_pos_weight=21     # rounded from the ~21.25 label-0 : label-1 ratio of the combined train+test frame
)

# 3 x 3 x 3 = 27 candidate settings.
param_grid = {
    'n_estimators': [200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [4, 6, 8]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,                  # 5-fold cross-validation
    verbose=2,             # logs one line per fit (135 lines for this grid)
    n_jobs=-1              # use all CPU cores
)
grid_search.fit(X_train, y_train)

print("Best AUCROC: {:.4f}".format(grid_search.best_score_))
print("Best Parameters:", grid_search.best_params_)

# Retrieve best model
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=200; total time=   0.8s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=200; total time=   0.8s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=200; total time=   0.8s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=200; total time=   0.8s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=200; total time=   0.9s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=300; total time=   1.0s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=300; total time=   1.1s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=300; total time=   1.2s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=300; total time=   1.2s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=300; total time=   1.3s
[CV] END ..learning_rate=0.01, max_depth=4, n_estimators=500; total time=   1.8s
[CV] END ..learning_rate=0.01, max_depth=4, n_e

In [7]:
# Final classifier, fitted on the combined train+test frame (`full_df`).
# NOTE(review): the hyperparameters are hard-coded rather than read from
# grid_search.best_params_, and subsample / colsample_bytree were not part of
# the searched grid. Confirm these are the intended values.
feature_cols = [f"feat{i}" for i in range(1, 10)]
x_full, y_full = full_df[feature_cols], full_df['label']

final_params = dict(
    n_estimators=500,
    learning_rate=0.01,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss",
    scale_pos_weight=21,
)
full_model = xgb.XGBClassifier(**final_params)
full_model.fit(x_full, y_full)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [8]:
# ROC AUC of the final model on `x_full`.
# NOTE(review): `full_model` was fitted on this same `x_full`, so the value
# below is an in-sample score, not a held-out estimate.
class_proba = full_model.predict_proba(x_full)
y_score = class_proba[:, 1]  # second column = probability of class 1
auc_score = roc_auc_score(y_full, y_score)
auc_score

0.9590297736609698

In [9]:
# Precision-recall curve for the same scores, summarised as the area under
# that curve (`auc` integrates precision over recall).
pr_curve = precision_recall_curve(y_full, y_score)
precision, recall, thresholds = pr_curve
auc_precision_recall = auc(recall, precision)
auc_precision_recall

0.548550171270171

In [10]:
import joblib

# Persist the fitted classifier to disk so the prediction step can reload it.
joblib.dump(value=full_model, filename="full_xgb_model.pkl")


['full_xgb_model.pkl']

In [11]:
import os
import joblib
import pandas as pd

def predict_new_data(input_csv, model_path="full_xgb_model.pkl", output_dir="xgboost_predictions"):
    """Score a CSV with a saved model and write the predictions to disk.

    Rows of ``input_csv`` are pooled by (transcript_id, position) using the
    mean of feat1..feat9. The pooled features are passed to the model's
    ``predict_proba`` and the second output column (probability of class 1)
    is written as ``score``.

    Parameters
    ----------
    input_csv : str
        Path to a CSV containing ``transcript_id``, ``position`` and the
        columns ``feat1`` .. ``feat9``. Other columns are ignored.
    model_path : str, optional
        Path of the joblib-serialised model to load. Defaults to the file
        name used when the model was saved in this notebook.
    output_dir : str, optional
        Directory the predictions CSV is written to; created if missing.

    Returns
    -------
    None
        The result is written to
        ``<output_dir>/predictions_<basename of input_csv>`` with columns
        ``transcript_id``, ``transcript_position`` and ``score``.

    Raises
    ------
    KeyError
        If ``input_csv`` lacks any of the required columns.
    """
    key_cols = ["transcript_id", "position"]
    feature_cols = [f"feat{i}" for i in range(1, 10)]

    # Validate the input before loading the model, so a malformed file fails
    # with a message naming the missing columns.
    df = pd.read_csv(input_csv)
    missing = [col for col in key_cols + feature_cols if col not in df.columns]
    if missing:
        raise KeyError(f"{input_csv} is missing required column(s): {missing}")

    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only point model_path at files from a trusted source.
    model = joblib.load(model_path)

    # Pool features by mean, grouped by transcript_id and position.
    grouped = df.groupby(key_cols, as_index=False)[feature_cols].mean()

    # Predict probabilities for the positive class.
    preds = model.predict_proba(grouped[feature_cols])[:, 1]

    # Format output
    result = pd.DataFrame({
        "transcript_id": grouped["transcript_id"],
        "transcript_position": grouped["position"],
        "score": preds
    })

    # Save output CSV ("/" keeps the default message identical to before).
    os.makedirs(output_dir, exist_ok=True)
    out_path = f"{output_dir}/predictions_{os.path.basename(input_csv)}"
    result.to_csv(out_path, index=False)
    print(f"Predictions saved to: {out_path}")


In [12]:
# Input files to score with the saved model.
files = ["task1/parsed_dataset2.csv"]

# Score each file in turn; predict_new_data writes one predictions CSV per input.
for f in files:
    # The leading marker is U+1F539 (small blue diamond). It had been stored
    # as mis-decoded bytes and printed as garbage.
    print(f"🔹 Running predictions on {f}...")
    predict_new_data(f)

🔹 Running predictions on parsed_dataset2.csv...
Predictions saved to: xgboost_predictions/predictions_parsed_dataset2.csv
