In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# Path to the competition files
DATA_PATH = "/kaggle/input/classification-of-math-problems-by-kasut-academy"
TRAIN_PATH = os.path.join(DATA_PATH, "train.csv")
TEST_PATH = os.path.join(DATA_PATH, "test.csv")
SAMPLE_SUBMISSION_PATH = os.path.join(DATA_PATH, "sample_submission.csv")

# Load the datasets
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# Display some sample data
print("\nSample training data:")
print(train_df.head())

# Check label distribution
print("\nLabel distribution in training data:")
print(train_df['label'].value_counts().sort_index())

# Define topic mapping for reference
topic_mapping = {
    0: "Algebra",
    1: "Geometry and Trigonometry",
    2: "Calculus and Analysis",
    3: "Probability and Statistics",
    4: "Number Theory",
    5: "Combinatorics and Discrete Math",
    6: "Linear Algebra",
    7: "Abstract Algebra and Topology"
}

# Function to preprocess text data
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Replace mathematical symbols with their word equivalents
    symbol_map = {
        '+': ' plus ',
        '-': ' minus ',
        '=': ' equals ',
        '≤': ' less than or equal to ',
        '≥': ' greater than or equal to ',
        '<': ' less than ',
        '>': ' greater than ',
        '×': ' times ',
        '÷': ' divided by ',
        '≠': ' not equal to ',
        '≈': ' approximately equal to ',
        '∞': ' infinity ',
        '∫': ' integral ',
        '∑': ' sum ',
        '∏': ' product ',
        '∂': ' partial derivative ',
        '∇': ' nabla ',
        '∆': ' delta ',
        '∈': ' element of ',
        '∉': ' not element of ',
        '∩': ' intersection ',
        '∪': ' union ',
        '⊂': ' subset of ',
        '⊃': ' superset of ',
        '⊆': ' subset or equal to ',
        '⊇': ' superset or equal to ',
        '∅': ' empty set ',
        '∀': ' for all ',
        '∃': ' there exists ',
        '∄': ' there does not exist ',
        '∴': ' therefore ',
        '∵': ' because ',
        '∝': ' proportional to ',
        '°': ' degrees ',
        '∠': ' angle ',
        '∥': ' parallel to ',
        '⊥': ' perpendicular to ',
        '√': ' square root ',
        'π': ' pi ',
        'θ': ' theta ',
        'α': ' alpha ',
        'β': ' beta ',
        'γ': ' gamma ',
        'δ': ' delta ',
        'ε': ' epsilon ',
        'λ': ' lambda ',
        'μ': ' mu ',
        'σ': ' sigma ',
        'τ': ' tau ',
        'φ': ' phi ',
        'ω': ' omega ',
    }
    
    for symbol, replacement in symbol_map.items():
        text = text.replace(symbol, replacement)
    
    # Handle superscripts (x², x³, etc.)
    text = re.sub(r'(\d+)²', r'\1 squared', text)
    text = re.sub(r'(\d+)³', r'\1 cubed', text)
    text = re.sub(r'(\d+)\^(\d+)', r'\1 raised to the power of \2', text)
    
    # Handle subscripts and special notations
    text = re.sub(r'_(\d+)', r' subscript \1', text)
    
    # Remove special characters and digits (keeping spaces)
    text = re.sub(r'[^\w\s]', ' ', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    # Keep some mathematical terms that might be in stopwords
    math_terms = {'not', 'and', 'or', 'if', 'then', 'all', 'any', 'no', 'sum', 'what', 'which', 'how', 'many'}
    filtered_tokens = [word for word in tokens if word not in stop_words or word in math_terms]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(lemmatized_tokens)
    
    return preprocessed_text

# Apply preprocessing to both train and test data
print("\nPreprocessing text data...")
train_df['processed_text'] = train_df['Question'].apply(preprocess_text)
test_df['processed_text'] = test_df['Question'].apply(preprocess_text)

# Visualize the data
plt.figure(figsize=(10, 6))
sns.countplot(x='label', data=train_df)
plt.title('Distribution of Math Topics in Training Data')
plt.xlabel('Topic')
plt.ylabel('Count')
plt.xticks(ticks=range(8), labels=[topic_mapping[i] for i in range(8)], rotation=45)
plt.tight_layout()
plt.savefig('topic_distribution.png')
plt.close()

# Calculate average word count per topic
train_df['word_count'] = train_df['Question'].apply(lambda x: len(str(x).split()))
plt.figure(figsize=(12, 6))
sns.boxplot(x='label', y='word_count', data=train_df)
plt.title('Word Count Distribution by Topic')
plt.xlabel('Topic')
plt.ylabel('Word Count')
plt.xticks(ticks=range(8), labels=[topic_mapping[i] for i in range(8)], rotation=45)
plt.tight_layout()
plt.savefig('word_count_by_topic.png')
plt.close()

# Feature Engineering
print("\nExtracting features...")

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.9
)

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['processed_text'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['processed_text'])

# Create additional features
def extract_math_features(text):
    features = {}
    
    # Check for specific mathematical keywords
    features['has_equation'] = int('equation' in text.lower() or '=' in text)
    features['has_inequality'] = int(any(term in text.lower() for term in ['inequality', '<', '>', '≤', '≥', '≠']))
    features['has_probability'] = int(any(term in text.lower() for term in ['probability', 'random', 'chance', 'likelihood', 'sample']))
    features['has_geometry'] = int(any(term in text.lower() for term in ['angle', 'triangle', 'circle', 'square', 'polygon', 'rectangle', 'line', 'point', 'vertex', 'edge', 'face', 'volume', 'area', 'perimeter']))
    features['has_calculus'] = int(any(term in text.lower() for term in ['integral', 'derivative', 'differentiate', 'integrate', 'limit', 'converge', 'diverge', 'maximum', 'minimum', 'extremum', 'inflection']))
    features['has_algebra'] = int(any(term in text.lower() for term in ['solve', 'equation', 'expression', 'factor', 'simplify', 'polynomial', 'roots', 'solution']))
    features['has_matrix'] = int(any(term in text.lower() for term in ['matrix', 'vector', 'determinant', 'inverse', 'eigenvalue', 'eigenvector', 'linear', 'transformation', 'basis']))
    features['has_number_theory'] = int(any(term in text.lower() for term in ['prime', 'divisor', 'factor', 'gcd', 'lcm', 'modulo', 'congruence', 'remainder', 'divisible']))
    features['has_set'] = int(any(term in text.lower() for term in ['set', 'subset', 'element', 'union', 'intersection', 'difference', 'complement']))
    features['has_combinatorics'] = int(any(term in text.lower() for term in ['combination', 'permutation', 'arrangement', 'choose', 'factorial', 'count', 'ways', 'possible']))
    
    # Check for mathematical symbols
    features['has_fraction'] = int('/' in text or '÷' in text)
    features['has_exponent'] = int('^' in text or any(c in text for c in ['²', '³']))
    features['has_trigonometry'] = int(any(term in text.lower() for term in ['sin', 'cos', 'tan', 'sec', 'cosec', 'cot', 'sine', 'cosine', 'tangent', 'angle', 'degree', 'radian']))
    
    # Text length features
    features['text_length'] = len(text)
    features['word_count'] = len(text.split())
    
    # Question type features
    features['is_find_question'] = int(text.lower().startswith('find'))
    features['is_calculate_question'] = int(text.lower().startswith('calculate'))
    features['is_determine_question'] = int(text.lower().startswith('determine'))
    features['is_prove_question'] = int(text.lower().startswith('prove'))
    features['is_show_question'] = int(text.lower().startswith('show'))
    
    return features

# Extract additional features
train_features = pd.DataFrame([extract_math_features(text) for text in train_df['Question']])
test_features = pd.DataFrame([extract_math_features(text) for text in test_df['Question']])

# Combine with TF-IDF features
from scipy.sparse import hstack
X_train_combined = hstack([X_train_tfidf, train_features.values])
X_test_combined = hstack([X_test_tfidf, test_features.values])

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X_train_combined, train_df['label'], 
    test_size=0.2, 
    random_state=42, 
    stratify=train_df['label']
)

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

# Model Selection and Training
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Define models to try
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0, solver='saga', n_jobs=-1, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, n_jobs=-1, random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, n_jobs=-1, random_state=42)
}

# Train and evaluate models
print("\nTraining and evaluating models:")
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred = model.predict(X_val)
    
    # Evaluate model
    accuracy = accuracy_score(y_val, y_pred)
    results[name] = accuracy
    
    print(f"{name} Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred, target_names=[topic_mapping[i] for i in range(8)]))
    
    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    cm = confusion_matrix(y_val, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=[topic_mapping[i] for i in range(8)],
                yticklabels=[topic_mapping[i] for i in range(8)])
    plt.title(f'Confusion Matrix - {name}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'confusion_matrix_{name.replace(" ", "_").lower()}.png')
    plt.close()

# Plot model comparison
plt.figure(figsize=(10, 6))
models_df = pd.DataFrame({'Model': list(results.keys()), 'Accuracy': list(results.values())})
sns.barplot(x='Model', y='Accuracy', data=models_df)
plt.title('Model Comparison')
plt.xlabel('Model')
plt.ylabel('Validation Accuracy')
plt.ylim(0.6, 1.0)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('model_comparison.png')
plt.close()

# Choose the best model based on validation performance
best_model_name = max(results, key=results.get)
print(f"\nBest model: {best_model_name} with accuracy: {results[best_model_name]:.4f}")

# Train the best model on all training data
print(f"\nTraining {best_model_name} on all training data...")
best_model = models[best_model_name]
best_model.fit(X_train_combined, train_df['label'])

# Make predictions on the test set
print("\nMaking predictions on the test set...")
test_predictions = best_model.predict(X_test_combined)

# Create submission file
sample_submission['label'] = test_predictions
sample_submission.to_csv('submission.csv', index=False)
print("Submission file created successfully.")

# Feature importance analysis (if applicable)
if best_model_name in ['Random Forest', 'XGBoost', 'LightGBM']:
    # Get feature importances
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist() + train_features.columns.tolist()
    
    if best_model_name == 'Random Forest':
        importances = best_model.feature_importances_
    else:
        importances = best_model.feature_importances_
    
    # Show top 30 features (or fewer if there aren't that many)
    top_n = min(30, len(feature_names))
    
    # Get indices of top features
    if len(importances) > top_n:
        indices = np.argsort(importances)[-top_n:]
    else:
        indices = np.argsort(importances)
    
    plt.figure(figsize=(12, 8))
    plt.title(f'Top {top_n} Feature Importances - {best_model_name}')
    plt.barh(range(len(indices)), importances[indices], align='center')
    try:
        plt.yticks(range(len(indices)), [feature_names[i] if i < len(feature_names) else f"Feature {i}" for i in indices])
    except IndexError:
        # If there's an issue with feature names, use generic labels
        plt.yticks(range(len(indices)), [f"Feature {i}" for i in indices])
    plt.xlabel('Relative Importance')
    plt.tight_layout()
    plt.savefig(f'feature_importance_{best_model_name.replace(" ", "_").lower()}.png')
    plt.close()
    
    print(f"\nTop {top_n} important features saved to feature_importance_{best_model_name.replace(' ', '_').lower()}.png")

# Enhanced approach with text embeddings and model stacking
print("\nEnhancing model with text embeddings and stacking...")

# Use transformer-based model for embeddings
try:
    from sentence_transformers import SentenceTransformer
    
    # Load a lightweight sentence transformer model
    print("Loading SentenceTransformer model...")
    embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    
    # Create text embeddings
    print("Creating text embeddings...")
    train_embeddings = embedder.encode(train_df['Question'].tolist(), show_progress_bar=True)
    test_embeddings = embedder.encode(test_df['Question'].tolist(), show_progress_bar=True)
    
    # Split embeddings for training and validation
    train_emb, val_emb = train_test_split(
        train_embeddings, test_size=0.2, random_state=42, 
        stratify=train_df['label']
    )
    
    # Create a stacked model
    from sklearn.ensemble import StackingClassifier
    
    # Define base models
    base_models = [
        ('lr', LogisticRegression(max_iter=1000, C=1.0, solver='saga', random_state=42)),
        ('rf', RandomForestClassifier(n_estimators=100, max_depth=15, random_state=42)),
        ('xgb', XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42))
    ]
    
    # Define meta-learner
    meta_learner = LGBMClassifier(n_estimators=100, random_state=42)
    
    # Create stacking model
    stacking_model = StackingClassifier(
        estimators=base_models,
        final_estimator=meta_learner,
        cv=5,
        n_jobs=-1
    )
    
    # Train stacking model on embeddings
    print("Training stacking model on embeddings...")
    stacking_model.fit(train_emb, train_df.iloc[:len(train_emb)]['label'])
    
    # Evaluate on validation set
    val_pred = stacking_model.predict(val_emb)
    val_accuracy = accuracy_score(train_df.iloc[len(train_emb):]['label'].values, val_pred)
    print(f"Stacking model validation accuracy: {val_accuracy:.4f}")
    
    # Make predictions on test set
    test_pred_stack = stacking_model.predict(test_embeddings)
    
    # Create submission file for stacking model
    stack_submission = sample_submission.copy()
    stack_submission['label'] = test_pred_stack
    stack_submission.to_csv('stack_submission.csv', index=False)
    print("Stacking model submission file created successfully.")
    
    # Compare with previous best model
    if val_accuracy > results[best_model_name]:
        print(f"Stacking model outperforms {best_model_name}. Using stacking model predictions.")
        final_submission = stack_submission
        final_submission.to_csv('final_submission.csv', index=False)
    else:
        print(f"{best_model_name} still performs better. Using {best_model_name} predictions.")
        final_submission = sample_submission
        final_submission.to_csv('final_submission.csv', index=False)
    
except ImportError:
    print("SentenceTransformer package not available. Using the best model from previous evaluation.")
    final_submission = sample_submission
    final_submission.to_csv('final_submission.csv', index=False)

print("\nFinal submission file created successfully.")

# Create an ensemble of the best models
print("\nCreating an ensemble of the best models...")

# Define the best models for ensemble
best_models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0, solver='saga', n_jobs=-1, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, n_jobs=-1, random_state=42),
    'LightGBM': LGBMClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, n_jobs=-1, random_state=42)
}

# Train models and collect predictions
ensemble_predictions = np.zeros((X_test_combined.shape[0], len(best_models)))

for i, (name, model) in enumerate(best_models.items()):
    print(f"Training {name} for ensemble...")
    model.fit(X_train_combined, train_df['label'])
    ensemble_predictions[:, i] = model.predict(X_test_combined)

# Use majority voting for ensemble predictions
from scipy.stats import mode
final_ensemble_predictions = mode(ensemble_predictions, axis=1)[0].flatten().astype(int)

# Create ensemble submission file
ensemble_submission = sample_submission.copy()
ensemble_submission['label'] = final_ensemble_predictions
ensemble_submission.to_csv('ensemble_submission.csv', index=False)
print("Ensemble submission file created successfully.")

print("\nAll processes completed successfully!")

Train shape: (10189, 2)
Test shape: (3044, 2)

Sample training data:
                                            Question  label
0  A solitaire game is played as follows.  Six di...      3
1  2. The school table tennis championship was he...      5
2  Given that $x, y,$ and $z$ are real numbers th...      0
3  $25 \cdot 22$ Given three distinct points $P\l...      1
4  I am thinking of a five-digit number composed ...      5

Label distribution in training data:
label
0    2618
1    2439
2    1039
3     368
4    1712
5    1827
6     100
7      86
Name: count, dtype: int64

Preprocessing text data...

Extracting features...
Training data shape: (8151, 10020)
Validation data shape: (2038, 10020)

Training and evaluating models:

Training Logistic Regression...
Logistic Regression Accuracy: 0.4465

Classification Report:
                                 precision    recall  f1-score   support

                        Algebra       0.52      0.50      0.51       524
      Geometry and Trig

2025-05-03 17:22:38.568404: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746292958.835489      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746292958.905734      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading SentenceTransformer model...


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating text embeddings...


Batches:   0%|          | 0/319 [00:00<?, ?it/s]

Batches:   0%|          | 0/96 [00:00<?, ?it/s]

Training stacking model on embeddings...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6120
[LightGBM] [Info] Number of data points in the train set: 8151, number of used features: 24
[LightGBM] [Info] Start training from score -1.350979
[LightGBM] [Info] Start training from score -1.429799
[LightGBM] [Info] Start training from score -2.295373
[LightGBM] [Info] Start training from score -3.329142
[LightGBM] [Info] Start training from score -1.783330
[LightGBM] [Info] Start training from score -1.717651
[LightGBM] [Info] Start training from score -4.701831
[LightGBM] [Info] Start training from score -4.771789
Stacking model validation accuracy: 0.2282
Stacking model submission file created successfully.
LightGBM still performs better. Using LightGBM predictions.

Final submission file created successfully.

Creating an ensemble of the best models...
Training Logistic Regression for en