In [None]:
import sys
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the training CSV from the Kaggle input directory into a DataFrame.
# Later cells read its 'Question' (LaTeX text) and 'label' (target) columns.
train = pd.read_csv('/kaggle/input/classification-of-math-problems-by-kasut-academy/train.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

The questions are LaTeX-formatted, so they need to be converted to plain text.

In [None]:
from pylatexenc.latex2text import LatexNodes2Text

# Strip the LaTeX markup from every training question with pylatexenc and keep
# the plain-text result in a new 'Question_Text' column.
latex_converter = LatexNodes2Text()
train['Question_Text'] = train['Question'].apply(latex_converter.latex_to_text)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# This function performs text preprocessing by: removing periods, tokenizing the text, converting tokens to lowercase, removing stopwords and lemmatizing tokens
# The cleaned tokens are then rejoined into a single string.

# Lazily filled cache for the NLTK resources used by clean_text. Building the
# stop-word set and the lemmatizer once (instead of on every call) matters
# because clean_text is applied row by row to whole DataFrame columns.
_CLEAN_TEXT_RESOURCES = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if 'stop_words' not in _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
    return _CLEAN_TEXT_RESOURCES['stop_words']


def _shared_lemmatizer():
    """Return one WordNetLemmatizer instance that is reused across calls."""
    if 'lemmatizer' not in _CLEAN_TEXT_RESOURCES:
        _CLEAN_TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_RESOURCES['lemmatizer']


def clean_text(text, remove_stopwords=True, lemmatize=True):
    """Normalize a plain-text question into a space-joined string of tokens.

    Steps, in order: delete every '.' character, tokenize with
    ``nltk.word_tokenize``, lowercase each token, optionally drop English
    stop words, optionally lemmatize each remaining token.

    Args:
        text: The string to clean.
        remove_stopwords: If True, drop tokens found in NLTK's English
            stop-word list (compared after lowercasing).
        lemmatize: If True, pass each token through ``WordNetLemmatizer``.

    Returns:
        The processed tokens joined by single spaces ('' if none remain).
    """
    # Remove only the period character; all other punctuation is left for the tokenizer.
    text = re.sub(r'\.', '', text)

    # Tokenize, then lowercase so the stop-word lookup is case-insensitive.
    tokens = [token.lower() for token in nltk.word_tokenize(text)]

    if remove_stopwords:
        stop_words = _english_stop_words()
        tokens = [token for token in tokens if token not in stop_words]

    if lemmatize:
        lemmatize_token = _shared_lemmatizer().lemmatize
        tokens = [lemmatize_token(token) for token in tokens]

    # Re-join tokens into a single string
    return ' '.join(tokens)

In [None]:
# Run clean_text with its defaults (stop-word removal and lemmatization both on)
# over every plain-text question and store the result in 'cleaned_question'.
train['cleaned_question'] = train['Question_Text'].apply(clean_text)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned questions into a TF-IDF matrix built from unigrams, bigrams
# and trigrams, keeping at most the 100,000 most frequent terms in the corpus.
# The vectorizer is fitted here on the training text and reused for the test set.
vectorizer = TfidfVectorizer(
    max_features=100000,
    ngram_range=(1, 3),
)
X = vectorizer.fit_transform(train['cleaned_question'])

In [None]:
# Target vector: the 'label' column of the training frame.
y = train.label

In [None]:
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

In [None]:
# This Optuna search tunes hyperparameters for a LightGBM model on a multiclass classification task.
# It uses 5-fold Stratified K-Fold cross-validation for evaluation and aims to maximize the micro-averaged F1 score.
# NOTE: the search is wrapped in a triple-quoted string, so it is NOT executed when this cell runs;
# the best hyperparameters from an earlier run are recorded in the markdown cell that follows.

'''import lightgbm as lgb

def objective_lgb(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1200),
        'max_depth': trial.suggest_int('max_depth', 3, 14),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.65, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'objective': 'multiclass',
        'verbosity': -1,
        'random_state': 2,
        'num_class': len(np.unique(y)),
    }

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
    f1_scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        score = f1_score(y_val, y_pred, average='micro')
        f1_scores.append(score)

    return np.mean(f1_scores)

study_lgb = optuna.create_study(direction='maximize', study_name="lgbm_optuna")
study_lgb.optimize(objective_lgb, n_trials=100)

print("Best trial:")
print(study_lgb.best_trial)'''

Best hyperparameters obtained:

{'n_estimators': 599, 'max_depth': 11, 'learning_rate': 0.032138984780093804, 'subsample': 0.7293348259364973, 'colsample_bytree': 0.7566540444923334, 'reg_lambda': 0.021337707070698972, 'reg_alpha': 0.06836150777087176}

In [None]:
# This Optuna search tunes the smoothing parameter `alpha` for the Multinomial Naive Bayes model.
# It uses 5-fold Stratified K-Fold cross-validation for evaluation and aims to maximize the micro-averaged F1 score.
# The goal is to find the best regularization strength to balance bias and variance in probabilistic text classification.
# NOTE: the search is wrapped in a triple-quoted string, so it is NOT executed when this cell runs;
# the best alpha from an earlier run is recorded in the markdown cell that follows.

'''from sklearn.naive_bayes import MultinomialNB

def objective_nb(trial):
    alpha = trial.suggest_float('alpha', 1e-3, 10.0, log=True)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
    f1_scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = MultinomialNB(alpha=alpha)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        score = f1_score(y_val, y_pred, average='micro')
        f1_scores.append(score)

    return np.mean(f1_scores)

study_nb = optuna.create_study(direction='maximize', study_name="nb_optuna")
study_nb.optimize(objective_nb, n_trials=50)

print("Best trial:")
print(study_nb.best_trial)'''

Best alpha value:

alpha = 0.061600028604369125

In [None]:
# This Optuna search tunes the regularization parameter `C` for the Linear Support Vector Classifier.
# It uses 5-fold Stratified K-Fold cross-validation for evaluation and aims to maximize the micro-averaged F1 score.
# NOTE: the search is wrapped in a triple-quoted string, so it is NOT executed when this cell runs;
# the best C from an earlier run is recorded in the markdown cell that follows.

'''from sklearn.svm import LinearSVC

def objective_svm(trial):
    C = trial.suggest_float('C', 1e-3, 100.0, log=True)

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)
    f1_scores = []

    for train_idx, val_idx in skf.split(X, y):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = LinearSVC(C=C, max_iter=10000, random_state=2)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        score = f1_score(y_val, y_pred, average='micro')
        f1_scores.append(score)

    return np.mean(f1_scores)

study_svm = optuna.create_study(direction='maximize', study_name="svm_optuna")
study_svm.optimize(objective_svm, n_trials=50)

print("Best trial:")
print(study_svm.best_trial)'''

Best C value:

C = 14.534588753905727

In [None]:
import lightgbm as lgb

# Define a LightGBM model with the best hyperparameters found by the Optuna search.
# `random_state` is pinned to 2, the seed the search objective used, so the final
# model is trained under the same configuration that was cross-validated.
# NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0; it is
# left at its default here, as in the search - confirm whether bagging was intended.
params = {
    'n_estimators': 599,
    'max_depth': 11,
    'learning_rate': 0.032138984780093804,
    'subsample': 0.7293348259364973,
    'colsample_bytree': 0.7566540444923334,
    'reg_lambda': 0.021337707070698972,
    'reg_alpha': 0.06836150777087176,
    'verbosity': -1,
    'random_state': 2,
}
lgbm = lgb.LGBMClassifier(**params)

In [None]:
from sklearn.svm import LinearSVC

# Define a Linear Support Vector Classifier with the best C found by the Optuna search.
# max_iter and random_state match the values used inside the search objective.
C = 14.534588753905727
lsvc = LinearSVC(C=C, max_iter=10000, random_state=2)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Define a Multinomial Naive Bayes model with the best smoothing value (alpha)
# found by the Optuna search.
alpha = 0.061600028604369125
mnb = MultinomialNB(alpha=alpha)

In [None]:
from sklearn.ensemble import VotingClassifier

# Combine the three tuned classifiers into a single voting ensemble.
# voting='hard' is scikit-learn's default (majority vote over the predicted
# labels); it is spelled out here so the choice is visible.
model = VotingClassifier(
    estimators=[('lgbm', lgbm), ('svm', lsvc), ('mnb', mnb)],
    voting='hard',
)

# Train every member of the ensemble on the full training set.
model.fit(X, y)

In [None]:
# Load the test CSV from the Kaggle input directory into a DataFrame.
# Later cells read its 'Question' and 'id' columns.
test = pd.read_csv('/kaggle/input/classification-of-math-problems-by-kasut-academy/test.csv')

In [None]:
# Put the test questions through the same pipeline as the training data:
# LaTeX -> plain text with pylatexenc, then clean_text with its defaults.
test_latex_converter = LatexNodes2Text()
test['Question_Text'] = test['Question'].apply(test_latex_converter.latex_to_text)
test['cleaned_question'] = test['Question_Text'].apply(clean_text)

In [None]:
# Transform the cleaned test questions into TF-IDF feature vectors using the fitted vectorizer.
# Only transform() is called here, so the vocabulary and IDF weights learned from the training text are reused.
X_test = vectorizer.transform(test['cleaned_question'])

In [None]:
# Label every test question with the fitted ensemble and attach the result to the frame.
preds = model.predict(X_test)
test['label'] = preds

# Write only the 'id' and predicted 'label' columns to the submission file.
test.to_csv('submission.csv', columns=['id', 'label'], index=False)