In [2]:
import os
import re
import warnings

import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Install a blanket "ignore" filter so no Python warning is shown by later cells.
# NOTE(review): this also hides any warnings emitted by the libraries used
# below — confirm that silencing everything is intended.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


In [3]:
# Load the raw problem set (path is relative to the notebook's working directory).
df = pd.read_csv('../data/problems.csv')

# Replace missing values with '' in the free-text columns only. Filling the
# whole frame would also turn a missing `problem_class` into a bogus '' class
# and a missing `problem_score` into a string, silently corrupting the targets.
TEXT_COLUMNS = ['title', 'description', 'input_description', 'output_description']
df[TEXT_COLUMNS] = df[TEXT_COLUMNS].fillna('')

print(f"Loaded {len(df)} problems")
print("Class distribution:")
print(df['problem_class'].value_counts())

Loaded 4112 problems
Class distribution:
problem_class
hard      1941
medium    1405
easy       766
Name: count, dtype: int64


In [4]:
def extract_features(row):
    """Build the hand-crafted numeric feature dictionary for one problem.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series or a dict)
        Must provide the keys 'title', 'description', 'input_description'
        and 'output_description'. Values that are None or NaN are treated as
        empty strings, so the function does not depend on the caller having
        cleaned the row beforehand.

    Returns
    -------
    dict
        Feature name -> number. The same keys are produced, in the same
        insertion order, for every row.

    Notes
    -----
    Keyword features use plain substring tests on the lower-cased text
    (``kw in text`` / ``text.count(kw)``).
    NOTE(review): short keywords therefore also fire inside longer words
    ('or' in 'for', 'bit' in 'arbitrary', 'no' in 'node'). Left unchanged
    because it defines the current feature values; revisit together with any
    code that consumes these features.
    """
    def _as_text(value):
        """Return `value` as a string, mapping None / NaN to ''."""
        if value is None or (isinstance(value, float) and value != value):
            return ''
        return str(value)

    description = _as_text(row['description'])
    input_description = _as_text(row['input_description'])
    output_description = _as_text(row['output_description'])

    # Body text excludes the title; the title is analysed separately below.
    text = (description + ' ' + input_description + ' ' + output_description).lower()
    title_text = _as_text(row['title']).lower()
    words = text.split()

    features = {}

    # ---- Section sizes -------------------------------------------------
    features['total_length'] = len(text)
    features['word_count'] = len(words)
    features['desc_length'] = len(description)
    features['input_desc_length'] = len(input_description)
    features['output_desc_length'] = len(output_description)
    features['title_length'] = len(title_text)

    # `text` always contains the two joining spaces, so total_len is >= 2 in
    # practice; the guard is kept for safety.
    total_len = features['total_length']
    features['desc_ratio'] = features['desc_length'] / total_len if total_len > 0 else 0
    features['input_ratio'] = features['input_desc_length'] / total_len if total_len > 0 else 0
    features['output_ratio'] = features['output_desc_length'] / total_len if total_len > 0 else 0

    # ---- Vocabulary statistics -----------------------------------------
    unique_words = set(words)
    word_lengths = [len(w) for w in words]
    features['unique_words'] = len(unique_words)
    features['unique_ratio'] = len(unique_words) / len(words) if words else 0
    features['avg_word_length'] = np.mean(word_lengths) if words else 0
    features['max_word_length'] = max(word_lengths) if words else 0
    features['min_word_length'] = min(word_lengths) if words else 0

    # ---- Algorithm-topic keywords (substring matched, see Notes) -------
    algo_keywords = {
        'graph': ['graph', 'tree', 'node', 'edge', 'vertex', 'path', 'cycle', 'dag', 
                  'directed', 'undirected', 'adjacency', 'connected', 'component', 'spanning'],
        'dp': ['dynamic', 'dp', 'memoization', 'optimal substructure', 'overlapping', 
               'state', 'transition', 'recurrence', 'tabulation'],
        'greedy': ['greedy', 'greedily', 'locally optimal'],
        'binary_search': ['binary search', 'bisection', 'lower bound', 'upper bound', 'log n', 'logarithmic'],
        'sorting': ['sort', 'sorted', 'sorting', 'order', 'arrange', 'quicksort', 'mergesort'],
        'dfs_bfs': ['dfs', 'bfs', 'depth first', 'breadth first', 'traversal', 'explore', 'visit'],
        'shortest_path': ['shortest path', 'dijkstra', 'bellman', 'floyd', 'shortest distance', 'minimum distance'],
        'advanced_ds': ['segment tree', 'fenwick', 'trie', 'suffix array', 'union find', 
                        'disjoint set', 'sparse table', 'bit', 'binary indexed'],
        'flow': ['max flow', 'min cut', 'network flow', 'bipartite matching', 'ford fulkerson'],
        'string_algo': ['substring', 'palindrome', 'kmp', 'lcs', 'edit distance', 'pattern', 
                        'prefix', 'suffix', 'character', 'anagram', 'rabin karp'],
        'number_theory': ['modulo', 'prime', 'gcd', 'lcm', 'factorial', 'coprime', 'euler', 
                          'divisor', 'multiple', 'sieve', 'totient'],
        'combinatorics': ['permutation', 'combination', 'binomial', 'catalan', 'choose', 'arrangement'],
        'probability': ['probability', 'expected value', 'random', 'expectation', 'distribution'],
        'geometry': ['geometry', 'coordinate', 'polygon', 'convex hull', 'point', 
                     'line', 'angle', 'distance', 'euclidean', 'manhattan'],
        'matrix': ['matrix', 'matrices', 'grid', 'board', '2d array'],
        'two_pointer': ['two pointer', 'sliding window', 'subarray', 'contiguous', 'window'],
        'bit_manipulation': ['bit', 'xor', 'and', 'or', 'bitwise', 'binary representation', 'shift'],
        'backtracking': ['backtrack', 'generate all', 'all possible', 'enumerate', 'brute force', 'recursive'],
        'divide_conquer': ['divide and conquer', 'divide', 'merge', 'split'],
        'heap': ['heap', 'priority queue', 'heapify'],
        'stack_queue': ['stack', 'queue', 'deque', 'fifo', 'lifo'],
        'hashing': ['hash', 'hashmap', 'hashtable', 'dictionary', 'map'],
        'simulation': ['simulate', 'simulation', 'process', 'step by step']
    }

    for key, keywords in algo_keywords.items():
        # A keyword is present exactly when its occurrence count is non-zero,
        # so one scan per keyword yields both the flag and the count.
        occurrences = sum(text.count(kw) for kw in keywords)
        features[f'has_{key}'] = int(occurrences > 0)
        features[f'{key}_count'] = occurrences

    # ---- Title cue words -----------------------------------------------
    title_keywords = {
        'easy_title': ['simple', 'basic', 'easy', 'count', 'sum', 'find'],
        'medium_title': ['find', 'calculate', 'compute', 'determine'],
        'hard_title': ['maximum', 'minimum', 'optimal', 'complex', 'advanced']
    }

    for key, keywords in title_keywords.items():
        features[key] = int(any(kw in title_text for kw in keywords))

    # ---- Optimisation vocabulary ---------------------------------------
    features['has_optimization'] = int(any(w in text for w in 
        ['minimum', 'maximum', 'minimize', 'maximize', 'optimal', 'best', 
         'smallest', 'largest', 'least', 'most', 'shortest', 'longest']))

    features['count_minimum'] = text.count('minimum') + text.count('minimize')
    features['count_maximum'] = text.count('maximum') + text.count('maximize')
    features['count_optimal'] = text.count('optimal')
    features['count_shortest'] = text.count('shortest')
    features['count_longest'] = text.count('longest')

    # ---- Numeric literals found in the statement -----------------------
    constraint_keys = ['max_constraint', 'min_constraint', 'avg_constraint', 'median_constraint',
                      'std_constraint', 'log_max_constraint', 'log_min_constraint',
                      'num_constraints', 'constraint_range', 'tiny_constraint',
                      'very_small_constraint', 'small_constraint', 'medium_small_constraint',
                      'medium_constraint', 'medium_large_constraint', 'large_constraint',
                      'very_large_constraint', 'huge_constraint', 'estimated_complexity',
                      'constraint_variance', 'has_large_and_small', 'num_large_constraints',
                      'num_small_constraints']

    # Default every constraint feature to 0; this also fixes their key order.
    # They are overwritten below only when at least one usable number exists.
    for key in constraint_keys:
        features[key] = 0

    # Digit runs longer than 10 characters are ignored.
    nums = [int(n) for n in re.findall(r'\d+', text) if len(n) <= 10]
    if nums:
        max_num = max(nums)
        min_num = min(nums)
        features['max_constraint'] = min(max_num, 1e9)  # capped at 1e9
        features['min_constraint'] = min_num
        features['avg_constraint'] = np.mean(nums)
        features['median_constraint'] = np.median(nums)
        features['std_constraint'] = np.std(nums)
        features['log_max_constraint'] = np.log10(max_num + 1)
        features['log_min_constraint'] = np.log10(min_num + 1)
        features['num_constraints'] = len(nums)
        features['constraint_range'] = max_num - min_num

        # One-hot magnitude bucket of the largest number.
        features['tiny_constraint'] = int(max_num <= 10)
        features['very_small_constraint'] = int(10 < max_num <= 20)
        features['small_constraint'] = int(20 < max_num <= 100)
        features['medium_small_constraint'] = int(100 < max_num <= 1000)
        features['medium_constraint'] = int(1000 < max_num <= 10000)
        features['medium_large_constraint'] = int(10000 < max_num <= 100000)
        features['large_constraint'] = int(100000 < max_num <= 1000000)
        features['very_large_constraint'] = int(1000000 < max_num <= 10000000)
        features['huge_constraint'] = int(max_num > 10000000)

        # Coarse 1..5 ordinal derived from the largest number.
        if max_num <= 20:
            features['estimated_complexity'] = 1
        elif max_num <= 100:
            features['estimated_complexity'] = 2
        elif max_num <= 1000:
            features['estimated_complexity'] = 3
        elif max_num <= 100000:
            features['estimated_complexity'] = 4
        else:
            features['estimated_complexity'] = 5

        features['constraint_variance'] = np.var(nums)
        features['has_large_and_small'] = int(max_num > 10000 and min_num < 100)
        features['num_large_constraints'] = sum(1 for n in nums if n > 10000)
        features['num_small_constraints'] = sum(1 for n in nums if n < 100)

    # ---- Statement-structure counters ----------------------------------
    features['modulo_count'] = text.count('modulo') + text.count(' mod ') + text.count('10^9+7')
    features['has_queries'] = int('quer' in text)
    features['query_count'] = text.count('query') + text.count('queries')
    features['testcase_count'] = text.count('test case') + text.count('testcase')
    features['formula_count'] = text.count('$')
    features['code_count'] = text.count('```') + text.count('code')

    # Sentences are approximated by splitting on '.'.
    sentences = [s for s in text.split('.') if s.strip()]
    sentence_word_counts = [len(s.split()) for s in sentences]
    longest_sentence = max(sentence_word_counts) if sentences else 0
    features['num_sentences'] = len(sentences)
    features['avg_sentence_length'] = np.mean(sentence_word_counts) if sentences else 0
    features['max_sentence_length'] = longest_sentence
    # Same value as max_sentence_length; both keys are kept so the feature
    # set stays unchanged.
    features['max_words_per_sentence'] = longest_sentence
    features['has_long_sentence'] = int(longest_sentence > 30)

    # ---- Complexity hints ----------------------------------------------
    features['has_nested_loop'] = int('nested' in text or 'n^2' in text or 'n 2' in text or 'n squared' in text)
    features['has_linear'] = int('linear' in text or 'o n' in text or 'single pass' in text)
    # 'logarithm' contains 'log', so a single test covers both.
    features['has_log'] = int('log' in text)
    features['has_exponential'] = int('exponential' in text or '2^n' in text or 'n!' in text)

    # ---- Task type -----------------------------------------------------
    features['is_counting'] = int(any(w in text for w in ['how many', 'count', 'number of']))
    features['is_existence'] = int(any(w in text for w in ['is there', 'exists', 'possible', 'can you']))
    features['is_construction'] = int(any(w in text for w in ['construct', 'build', 'create', 'generate']))
    features['is_optimization'] = int(any(w in text for w in ['minimum', 'maximum', 'optimal', 'best']))

    features['is_interactive'] = int('interactive' in text or 'jury' in text)
    features['is_decision'] = int(('yes' in text and 'no' in text) or 
                                   'possible' in text or 'impossible' in text)

    features['has_multiple_queries'] = int(features['query_count'] > 1)
    features['has_updates'] = int('update' in text)
    features['has_range'] = int('range' in text)
    # The former extra clause ('t' in text or 'multiple' in text) was always
    # true once 'test case' matched, so the flag reduces to this single test.
    features['multi_test_cases'] = int('test case' in text)

    features['has_2d_constraint'] = int(bool(re.search(r'(\d+)\s*[xX×]\s*(\d+)', text)))
    # Despite the name this is a count, not a 0/1 flag.
    features['has_multiple_arrays'] = text.count('array') + text.count('list')
    features['has_matrix_ops'] = int('transpose' in text or 'rotate' in text or 'flip' in text)
    features['has_special_output'] = int(any(w in text for w in 
        ['modulo', 'lexicographically', 'any valid', 'any order']))

    features['is_dp_keywords'] = int(any(w in text for w in 
        ['optimal', 'maximum', 'minimum', 'subproblem', 'overlapping']))
    features['is_graph_keywords'] = int(any(w in text for w in 
        ['connected', 'path', 'distance', 'reachable', 'component']))
    features['is_greedy_keywords'] = int(any(w in text for w in 
        ['local', 'choice', 'always', 'never']))

    # ---- Title shape ---------------------------------------------------
    title_words = title_text.split()
    features['title_has_number'] = int(bool(re.search(r'\d', title_text)))
    features['title_complexity'] = len([w for w in title_words if len(w) > 6])
    features['title_has_hard_words'] = int(any(w in title_text for w in 
        ['maximum', 'minimum', 'optimal', 'shortest', 'longest']))
    features['title_word_count'] = len(title_words)

    features['has_pattern'] = int('pattern' in text or 'sequence' in text or 'repeat' in text)
    features['has_boundary'] = int('boundary' in text or 'edge case' in text or 'corner case' in text)
    features['has_formula'] = int('formula' in text or 'equation' in text)
    features['has_array'] = int('array' in text)
    features['has_sequence'] = int('sequence' in text)
    features['mentions_complexity'] = int(any(w in text for w in 
        ['complexity', 'time limit', 'memory limit', 'efficient']))

    # ---- Hand-weighted composite score ---------------------------------
    complexity_score = (
        features['has_dp'] * 5 +
        features['has_graph'] * 4 +
        features['has_advanced_ds'] * 6 +
        features['has_flow'] * 7 +
        features['has_shortest_path'] * 4 +
        features['has_number_theory'] * 3 +
        features['huge_constraint'] * 5 +
        features['has_string_algo'] * 3 +
        features['has_backtracking'] * 5 +
        features['has_bit_manipulation'] * 3 +
        features['has_geometry'] * 4 +
        features['has_divide_conquer'] * 3 +
        features['is_optimization'] * 2 +
        features['has_multiple_queries'] * 3 +
        features['is_interactive'] * 5 +
        (6 if features['estimated_complexity'] == 5 else 0)
    )
    features['algo_complexity_score'] = complexity_score

    # The +1 in each denominator avoids division by zero.
    features['length_complexity_ratio'] = features['total_length'] / (features['algo_complexity_score'] + 1)
    features['words_per_sentence'] = features['word_count'] / (features['num_sentences'] + 1)
    features['unique_per_length'] = features['unique_words'] / (features['total_length'] + 1)

    algo_density = (
        features['has_dp'] + features['has_graph'] + 
        features['has_greedy'] + features['has_binary_search']
    ) / (features['word_count'] / 100 + 1)
    features['algo_keyword_density'] = algo_density

    features['has_multiple_test'] = int(text.count('test') > 2)
    features['has_output_format'] = int('output format' in text or 'print' in text)
    features['has_input_format'] = int('input format' in text or 'read' in text)
    features['has_constraints_section'] = int('constraint' in text or 'limit' in text)

    return features

print("Extracting features...")
# Row-wise apply yields one feature dict per problem; the dicts are then
# expanded into a frame with one column per feature name.
feature_dicts = df.apply(extract_features, axis=1)
numerical_features_df = pd.DataFrame(feature_dicts.tolist())

print(f"Extracted {numerical_features_df.shape[1]} numerical features")

Extracting features...
Extracted 143 numerical features


In [5]:
# Concatenate the text fields (the title appears twice — presumably to give
# its tokens more weight; confirm), then normalise: lower-case, replace every
# character outside [a-z0-9 ] with a space, and collapse whitespace runs.
df["full_text"] = (
    (
        df["title"] + " " + df["title"] + " "
        + df["description"] + " "
        + df["input_description"] + " "
        + df["output_description"]
    )
    .str.lower()
    .str.replace(r"[^a-z0-9 ]", " ", regex=True)
    .str.split()
    .str.join(" ")
)

print("Text preprocessing complete")

Text preprocessing complete


In [6]:
# Model inputs (text + hand-crafted features) and the two targets.
X_text = df["full_text"]
X_numerical = numerical_features_df
y_class = df["problem_class"]
y_score = df["problem_score"]

# Integer-encoded copy of the class labels (used later by XGBoost).
label_encoder = LabelEncoder()
label_encoder.fit(y_class)
y_class_encoded = label_encoder.transform(y_class)

# A single call splits all five aligned arrays with the same row partition,
# stratified on the string class label.
(
    X_text_train, X_text_test,
    X_num_train, X_num_test,
    y_class_train, y_class_test,
    y_class_train_encoded, y_class_test_encoded,
    y_score_train, y_score_test,
) = train_test_split(
    X_text,
    X_numerical,
    y_class,
    y_class_encoded,
    y_score,
    test_size=0.2,
    random_state=42,
    stratify=y_class,
)

print(f"Train size: {len(X_text_train)}, Test size: {len(X_text_test)}")

Train size: 3289, Test size: 823


In [7]:
# Words added to scikit-learn's English stop list for the word-level vectorizer.
custom_stop_words = [
    'given', 'find', 'output', 'input', 'first', 'second', 
    'line', 'integer', 'number', 'single', 'example', 'contain',
    'follow', 'next', 'note', 'sample'
]

print("Creating TF-IDF vectorizers...")
# Word-level 1- to 4-grams.
tfidf_word = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 4),
    stop_words=sorted(set(ENGLISH_STOP_WORDS) | set(custom_stop_words)),
    max_features=15000,
    min_df=2,
    max_df=0.7,
    sublinear_tf=True,
    use_idf=True,
    norm='l2',
)

# Character-level 3- to 6-grams.
tfidf_char = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 6),
    max_features=3000,
    min_df=2,
    sublinear_tf=True,
)

print("Transforming text features...")
# Both vectorizers are fitted on the training split only and then applied to
# the test split.
X_text_train_word = tfidf_word.fit_transform(X_text_train)
X_text_test_word = tfidf_word.transform(X_text_test)

X_text_train_char = tfidf_char.fit_transform(X_text_train)
X_text_test_char = tfidf_char.transform(X_text_test)

X_text_train_tfidf = hstack([X_text_train_word, X_text_train_char])
X_text_test_tfidf = hstack([X_text_test_word, X_text_test_char])

print("Scaling numerical features...")
# The scaler is likewise fitted on the training split only.
scaler = StandardScaler()
scaler.fit(X_num_train)
X_num_train_scaled = scaler.transform(X_num_train)
X_num_test_scaled = scaler.transform(X_num_test)

# Standardised hand-crafted features are multiplied by a fixed factor of 5
# before being stacked next to the TF-IDF columns.
X_num_train_weighted = 5.0 * X_num_train_scaled
X_num_test_weighted = 5.0 * X_num_test_scaled

print("Combining all features...")
X_train_combined = hstack([X_text_train_tfidf, csr_matrix(X_num_train_weighted)])
X_test_combined = hstack([X_text_test_tfidf, csr_matrix(X_num_test_weighted)])

print(f"Final feature shape: {X_train_combined.shape}")
print("\n" + "=" * 80)
print("PART 1 COMPLETE - Ready for model training!")
print("=" * 80)

Creating TF-IDF vectorizers...
Transforming text features...
Scaling numerical features...
Combining all features...
Final feature shape: (3289, 18143)

PART 1 COMPLETE - Ready for model training!


In [8]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, RandomForestRegressor, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_absolute_error, mean_squared_error, f1_score
from xgboost import XGBClassifier  
import joblib
import time
import numpy as np
import pandas as pd

# Section banner: rule, heading, rule — one per line.
print("=" * 80, "STARTING MODEL TRAINING", "=" * 80, sep="\n")

STARTING MODEL TRAINING


In [9]:
print(f"\nX_train_combined type: {type(X_train_combined)}")
print(f"X_train_combined shape: {X_train_combined.shape}")

print("\nTraining classification models sequentially...")
print(f"Dataset size: {X_train_combined.shape[0]} training samples")
print("="*80)

# name -> (fitted estimator or None, test accuracy, test macro-F1)
models = {}
# name -> test-set predictions as string labels, so the final report can be
# produced for whichever model ends up selected.
test_predictions = {}

print("\n[1/5] Training Extra Trees...")
start = time.time()
et_clf = ExtraTreesClassifier(
    n_estimators=400,
    max_depth=25,
    min_samples_split=3,
    min_samples_leaf=2,
    max_features=0.3,
    class_weight='balanced_subsample',
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
    criterion='entropy'
)
et_clf.fit(X_train_combined, y_class_train)
et_pred = et_clf.predict(X_test_combined)
et_acc = accuracy_score(y_class_test, et_pred)
et_f1 = f1_score(y_class_test, et_pred, average='macro')
models['Extra Trees'] = (et_clf, et_acc, et_f1)
test_predictions['Extra Trees'] = et_pred
elapsed = time.time() - start
print(f"Extra Trees: Accuracy={et_acc:.4f}, F1={et_f1:.4f} (Time: {elapsed:.1f}s)")

print("\n[2/5] Training Random Forest...")
start = time.time()
rf_clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=25,
    min_samples_split=3,
    min_samples_leaf=2,
    max_features=0.3,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42,
    criterion='entropy',
    max_samples=0.85
)
rf_clf.fit(X_train_combined, y_class_train)
rf_pred = rf_clf.predict(X_test_combined)
rf_acc = accuracy_score(y_class_test, rf_pred)
rf_f1 = f1_score(y_class_test, rf_pred, average='macro')
models['Random Forest'] = (rf_clf, rf_acc, rf_f1)
test_predictions['Random Forest'] = rf_pred
elapsed = time.time() - start
print(f"Random Forest: Accuracy={rf_acc:.4f}, F1={rf_f1:.4f} (Time: {elapsed:.1f}s)")

print("\n[3/5] Training XGBoost...")
start = time.time()
# `scale_pos_weight` was dropped: it re-weights the positive class of a binary
# objective and has no meaning for this three-class problem.
xgb_clf = XGBClassifier(
    n_estimators=500,
    max_depth=12,
    learning_rate=0.05,
    subsample=0.85,
    colsample_bytree=0.85,
    colsample_bylevel=0.85,
    reg_alpha=0.5,
    reg_lambda=3.0,
    min_child_weight=2,
    gamma=0.2,
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss',
    tree_method='hist',
    verbosity=0
)
# XGBoost is trained on integer-encoded labels; predictions are decoded back
# to strings before scoring so all models are compared on the same labels.
xgb_clf.fit(X_train_combined, y_class_train_encoded, verbose=False)
xgb_pred_encoded = xgb_clf.predict(X_test_combined)
xgb_pred = label_encoder.inverse_transform(xgb_pred_encoded)
xgb_acc = accuracy_score(y_class_test, xgb_pred)
xgb_f1 = f1_score(y_class_test, xgb_pred, average='macro')
models['XGBoost'] = (xgb_clf, xgb_acc, xgb_f1)
test_predictions['XGBoost'] = xgb_pred
elapsed = time.time() - start
print(f"XGBoost: Accuracy={xgb_acc:.4f}, F1={xgb_f1:.4f} (Time: {elapsed:.1f}s)")

print("\n[4/5] Training Logistic Regression...")
start = time.time()
lr_clf = LogisticRegression(
    max_iter=2000,
    C=2.0,
    class_weight='balanced',
    solver='saga',
    penalty='l2',
    n_jobs=-1,
    random_state=42,
    tol=1e-3,
    verbose=0
)
lr_clf.fit(X_train_combined, y_class_train)
lr_pred = lr_clf.predict(X_test_combined)
lr_acc = accuracy_score(y_class_test, lr_pred)
lr_f1 = f1_score(y_class_test, lr_pred, average='macro')
models['Logistic Regression'] = (lr_clf, lr_acc, lr_f1)
test_predictions['Logistic Regression'] = lr_pred
elapsed = time.time() - start
print(f"Logistic Regression: Accuracy={lr_acc:.4f}, F1={lr_f1:.4f} (Time: {elapsed:.1f}s)")

print("\n[5/5] Creating Fast Ensemble...")
start = time.time()

# Soft vote: average the class probabilities of the three tree models
# (Logistic Regression is not part of the vote).
et_proba = et_clf.predict_proba(X_test_combined)
rf_proba = rf_clf.predict_proba(X_test_combined)
xgb_proba_encoded = xgb_clf.predict_proba(X_test_combined)

# Re-order XGBoost's probability columns (indexed by encoded label) to the
# column order of et_clf.classes_ before averaging.
label_mapping = {label_encoder.transform([cls])[0]: i for i, cls in enumerate(et_clf.classes_)}
xgb_proba = np.zeros_like(et_proba)
for enc_idx, orig_idx in label_mapping.items():
    xgb_proba[:, orig_idx] = xgb_proba_encoded[:, enc_idx]

ensemble_proba = (et_proba + rf_proba + xgb_proba) / 3
y_class_pred = et_clf.classes_[np.argmax(ensemble_proba, axis=1)]

ensemble_acc = accuracy_score(y_class_test, y_class_pred)
ensemble_f1 = f1_score(y_class_test, y_class_pred, average='macro')
elapsed = time.time() - start

print(f"Fast Ensemble: Accuracy={ensemble_acc:.4f}, F1={ensemble_f1:.4f} (Time: {elapsed:.1f}s)")
models['Fast Ensemble'] = (None, ensemble_acc, ensemble_f1)
test_predictions['Fast Ensemble'] = y_class_pred

# NOTE(review): the winner is picked by accuracy on the test split, so the
# accuracy reported for it is optimistically biased; a separate validation
# split or cross-validation would give an honest estimate.
best_model_name = max(models.items(), key=lambda x: x[1][1])[0]
best_clf, best_acc, best_f1 = models[best_model_name]

if best_model_name == 'Fast Ensemble':
    print("\n✓ Ensemble won! Saving XGBoost for deployment.")
    # The ensemble has no single estimator object to persist, so XGBoost is
    # deployed instead. Report XGBoost's own name and metrics from here on so
    # the summary (and anything saved later) describes the deployed model.
    best_model_name = 'XGBoost'
    best_clf, best_acc, best_f1 = models[best_model_name]

ensemble_clf = best_clf
# Predictions of the selected model; `y_class_pred` keeps the ensemble's.
best_pred = test_predictions[best_model_name]

print("\n" + "="*80)
print("CLASSIFICATION TRAINING COMPLETE")
print("="*80)
print(f"Best Model: {best_model_name}")
print(f"Accuracy: {best_acc:.4f} ({best_acc*100:.2f}%)")
print(f"Macro F1: {best_f1:.4f}")

print("\nAll Models Performance:")
for name, (_, acc, f1) in sorted(models.items(), key=lambda x: x[1][1], reverse=True):
    print(f"  {name:20s}: Accuracy={acc:.4f} ({acc*100:.2f}%), F1={f1:.4f}")

# The report and confusion matrix describe the selected model, matching the
# headline numbers printed above.
print("\n" + "="*80)
print("CLASSIFICATION REPORT")
print("="*80)
print(classification_report(y_class_test, best_pred))

print("\nCONFUSION MATRIX")
print("="*80)
labels = sorted(df['problem_class'].unique())
cm = pd.DataFrame(
    confusion_matrix(y_class_test, best_pred, labels=labels),
    index=[f"Actual_{c}" for c in labels],
    columns=[f"Pred_{c}" for c in labels]
)
print(cm)


X_train_combined type: <class 'scipy.sparse._csr.csr_matrix'>
X_train_combined shape: (3289, 18143)

Training classification models sequentially...
Dataset size: 3289 training samples

[1/4] Training Extra Trees...
Extra Trees: Accuracy=0.5395, F1=0.4465 (Time: 136.9s)

[2/4] Training Random Forest...
Random Forest: Accuracy=0.5249, F1=0.4212 (Time: 154.1s)

[3/4] Training XGBoost...
XGBoost: Accuracy=0.5395, F1=0.4662 (Time: 1132.9s)

[4/4] Training Logistic Regression...
Logistic Regression: Accuracy=0.4520, F1=0.4356 (Time: 54.0s)

[5/5] Creating Fast Ensemble...
Fast Ensemble: Accuracy=0.5334, F1=0.4380 (Time: 0.5s)

CLASSIFICATION TRAINING COMPLETE
Best Model: Extra Trees
Accuracy: 0.5395 (53.95%)
Macro F1: 0.4465

All Models Performance:
  Extra Trees         : Accuracy=0.5395 (53.95%), F1=0.4465
  XGBoost             : Accuracy=0.5395 (53.95%), F1=0.4662
  Fast Ensemble       : Accuracy=0.5334 (53.34%), F1=0.4380
  Random Forest       : Accuracy=0.5249 (52.49%), F1=0.4212
  Log

In [12]:
print("\n" + "="*80)
print("TRAINING SCORE REGRESSOR")
print("="*80)

# Random-forest regressor for the numeric score, trained on the same combined
# feature matrix as the classifiers.
regressor = RandomForestRegressor(
    n_estimators=500,
    max_depth=30,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42
)

print("Training regressor...")
regressor.fit(X_train_combined, y_score_train)
y_score_pred = regressor.predict(X_test_combined)

mae = mean_absolute_error(y_score_test, y_score_pred)
rmse = np.sqrt(mean_squared_error(y_score_test, y_score_pred))

# Pearson correlation is undefined when either series is constant (corrcoef
# would return NaN), so report 0.0 unless both targets and predictions vary.
if np.std(y_score_test) > 0 and np.std(y_score_pred) > 0:
    correlation = np.corrcoef(y_score_test, y_score_pred)[0, 1]
else:
    correlation = 0.0

print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"Correlation: {correlation:.4f}")


TRAINING SCORE REGRESSOR
Training regressor...
MAE: 1.7240
RMSE: 2.0434
Correlation: 0.4116


In [13]:
print("\n" + "="*80)
print("SAVING MODELS")
print("="*80)

# Create the output directory first; joblib.dump does not create missing
# parent directories and would otherwise fail on a fresh checkout.
MODEL_DIR = "../models"
os.makedirs(MODEL_DIR, exist_ok=True)

# Summary of the run, stored next to the models.
metadata = {
    "classifier_type": best_model_name,
    "classifier_accuracy": float(best_acc),
    "classifier_f1": float(best_f1),
    "regressor_mae": float(mae),
    "regressor_rmse": float(rmse),
    "numerical_features": list(X_numerical.columns),
    "n_text_features": X_text_train_tfidf.shape[1],
    "n_total_features": X_train_combined.shape[1],
    "all_models_performance": {name: {"accuracy": float(acc), "f1": float(f1)} 
                               for name, (_, acc, f1) in models.items()}
}

# File name -> object to persist. The same mapping drives both the dump loop
# and the printed file list, so the two cannot drift apart.
artifacts = {
    "text_vectorizer_word.pkl": tfidf_word,
    "text_vectorizer_char.pkl": tfidf_char,
    "numerical_scaler.pkl": scaler,
    "label_encoder.pkl": label_encoder,
    "difficulty_classifier.pkl": ensemble_clf,
    "score_regressor.pkl": regressor,
    "metadata.pkl": metadata,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, f"{MODEL_DIR}/{filename}")

print("All models saved successfully!")
print("\n" + "="*80)
print("TRAINING COMPLETE!")
print("="*80)
print(f"Final Classification Accuracy: {best_acc*100:.2f}%")
print(f"Final Classification F1: {best_f1:.4f}")
print(f"Regression MAE: {mae:.4f}")
print(f"\nModels saved in '{MODEL_DIR}/' directory")
print("Files created:")
for filename in artifacts:
    print(f"  - {filename}")


SAVING MODELS
All models saved successfully!

TRAINING COMPLETE!
Final Classification Accuracy: 53.95%
Final Classification F1: 0.4465
Regression MAE: 1.7240

Models saved in '../models/' directory
Files created:
  - text_vectorizer_word.pkl
  - text_vectorizer_char.pkl
  - numerical_scaler.pkl
  - label_encoder.pkl
  - difficulty_classifier.pkl
  - score_regressor.pkl
  - metadata.pkl
