In [None]:
# Install CatBoost (run this only once; uncomment the line below to install)
# %pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting matplotlib (from catboost)
  Downloading matplotlib-3.10.3-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting plotly (from catboost)
  Downloading plotly-6.2.0-py3-none-any.whl.metadata (8.5 kB)
Collecting contourpy>=1.0.1 (from matplotlib->catboost)
  Downloading contourpy-1.3.3-cp312-cp312-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib->catboost)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->catboost)
  Downloading fonttools-4.59.0-cp312-cp312-win_amd64.whl.metadata (110 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib->catboost)
  Downloading kiwisolver-1.4.8-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Collecting narwhals>=1.15.1 (from plotly->catboost)
  Downloading narwhals-2.0.1-py3-none-a


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import necessary libraries
import pandas as pd
from catboost import CatBoostClassifier

# Load the training dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data/train_tfidf_features.csv')

In [3]:
# Split the loaded dataset 80:20 into training and validation parts.
# The split is positional: the first 80% of rows train the model and the
# remaining rows are held out (rows are not shuffled here).
split_idx = int(len(df) * 0.8)
train_df = df.iloc[:split_idx]
test_df = df.iloc[split_idx:]

# Training features and labels ('id' and 'label' are not used as features)
X_train = train_df.drop(columns=["id", "label"])
y_train = train_df["label"]

# Validation features and labels ('id' and 'label' are not used as features)
X_test = test_df.drop(columns=["id", "label"])
y_test = test_df["label"]

In [4]:
# Create a wrapper class for the CatBoost classifier (to make it reusable and modular)
class CatBoostModel:
    """Store CatBoost hyperparameters and build/fit the classifier on demand.

    The underlying ``CatBoostClassifier`` is only constructed when ``train`` is
    called; until then ``self.model`` is ``None``.

    Parameters
    ----------
    learning_rate : float, default 0.1
        Step size shrinkage used in boosting (a lower value typically means
        slower training).
    iterations : int, default 100
        Number of boosting iterations (trees).
    depth : int, default 6
        Depth of the trees (a higher value gives a more complex model).
    random_seed : int, default 42
        Random seed, passed through for reproducibility.
    class_weights : dict or None, default None
        Per-class weights forwarded to CatBoost, used to handle class imbalance.
    verbose : int, default 0
        Verbosity level forwarded to CatBoost (0 = silent).
    """

    def __init__(self, learning_rate=0.1, iterations=100, depth=6, random_seed=42, class_weights=None, verbose=0):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.depth = depth
        self.random_seed = random_seed
        self.class_weights = class_weights
        self.verbose = verbose
        self.model = None  # set by train()

    def _require_trained(self):
        """Return the fitted classifier, or raise if ``train`` has not been called."""
        if self.model is None:
            raise RuntimeError("CatBoostModel has not been trained yet; call train() first.")
        return self.model

    def train(self, X_train, y_train):
        """Build a ``CatBoostClassifier`` from the stored parameters and fit it.

        Returns the fitted classifier, which is also kept in ``self.model``.
        """
        self.model = CatBoostClassifier(
            learning_rate=self.learning_rate,
            iterations=self.iterations,
            depth=self.depth,
            random_seed=self.random_seed,
            class_weights=self.class_weights,
            verbose=self.verbose
        )
        self.model.fit(X_train, y_train)
        return self.model

    def predict(self, X_test):
        """Predict class labels for ``X_test`` using the trained model.

        Raises
        ------
        RuntimeError
            If ``train`` has not been called yet.
        """
        return self._require_trained().predict(X_test)

    def predict_probability(self, X_test):
        """Predict class probabilities for ``X_test`` using the trained model.

        Delegates to the classifier's ``predict_proba`` method (the CatBoost API
        has no ``predict_probability`` method). Useful for stacking ensembles
        or threshold tuning.

        Raises
        ------
        RuntimeError
            If ``train`` has not been called yet.
        """
        return self._require_trained().predict_proba(X_test)
    
# Configure the wrapper for this run: 300 boosting iterations, depth-6 trees,
# and class weights chosen to match the logistic regression weights.
catboost_model = CatBoostModel(
    iterations=300,
    depth=6,
    learning_rate=0.1,
    verbose=0,
    class_weights={0: 0.81, 1: 1.31},
)

# Fit on the training split, then predict labels for the held-out split
catboost_model.train(X_train, y_train)
y_pred = catboost_model.predict(X_test)


In [5]:
# Evaluate the model and display classification report metrics
from sklearn import metrics

# Build the text report for the held-out labels vs. predictions, then print it
report = metrics.classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.78      0.75      0.76      2126
           1       0.62      0.66      0.64      1311

    accuracy                           0.72      3437
   macro avg       0.70      0.71      0.70      3437
weighted avg       0.72      0.72      0.72      3437

