In [5]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import StackingClassifier, VotingClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier


In [2]:
# Load data
data_train = np.load('data_train.npy', allow_pickle=True)
data_test = np.load('data_test.npy', allow_pickle=True)
vocab_map = np.load('vocab_map.npy', allow_pickle=True)

# Load labels_train from CSV and extract the 'label' column
labels_train_df = pd.read_csv('label_train.csv')
labels_train = labels_train_df['label'].values

# Convert training data to DataFrame and assign column names
df_train = pd.DataFrame(data_train, columns=vocab_map)
df_train['TARGETT'] = labels_train

# Separate features and target variable
X = df_train.drop(columns=['TARGETT'])
y = df_train['TARGETT']

# Apply TF-IDF Transformation to the Term Count Matrix
print("Applying TF-IDF Transformation...")
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X)
print("Transformation applied!")

Applying TF-IDF Transformation...
Transformation applied!


In [10]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Define F1 scorer
f1_scorer = make_scorer(f1_score, average='macro')

# Dimensionality reduction configurations for sparse data
dim_reduction_configs = {
    'SVD_300': TruncatedSVD(n_components=300, random_state=42),
    'SVD_1000': TruncatedSVD(n_components=1000, random_state=42),  # Memory efficiency
}

# Define sparse-friendly models with regularization
# - Logistic Regression with Elastic Net regularization
# - Linear SVM with L2 regularization
# - SGDClassifier with Elastic Net regularization
# Why? These performed the best in the previous Model Selection notebook
standard_models = {
    'LogisticRegression': make_pipeline(
        StandardScaler(with_mean=False),  # StandardScaler for sparse data
        LogisticRegression(solver='saga', max_iter=5000, random_state=42, penalty='elasticnet', l1_ratio=0.5, n_jobs=-1)
    ),
    'SGDClassifier': make_pipeline(
        StandardScaler(with_mean=False),
        SGDClassifier(loss='hinge', max_iter=5000, tol=1e-3, penalty='elasticnet', l1_ratio=0.5, random_state=42, n_jobs=-1)
    ),
    'LinearSVC': make_pipeline(
        StandardScaler(with_mean=False),
        LinearSVC(max_iter=5000, tol=1e-5, penalty='l2', dual=False, random_state=42)
    )
}

# Stacking model with sparse-friendly classifiers
# Why stacking? It combines the strengths of multiple models to improve performance
stacking_model = StackingClassifier(
    estimators=[
        ('logreg', make_pipeline(StandardScaler(with_mean=False), LogisticRegression(solver='saga', max_iter=5000, penalty='elasticnet', l1_ratio=0.5, random_state=42))),
        ('sgd', make_pipeline(StandardScaler(with_mean=False), SGDClassifier(loss='hinge', max_iter=5000, tol=1e-3, penalty='elasticnet', l1_ratio=0.5, random_state=42))),
        ('svc', make_pipeline(StandardScaler(with_mean=False), LinearSVC(max_iter=5000, tol=1e-5, penalty='l2', dual=False, random_state=42))),
        ('lgbm', LGBMClassifier(n_estimators=100, learning_rate=0.1, num_leaves=15, max_depth=6, random_state=42))
    ],
    final_estimator=LogisticRegression(), cv=5, n_jobs=-1
)

# Voting classifier, applying StandardScaler where necessary
# Why voting? It combines the predictions of multiple models to improve performances
voting_model = VotingClassifier(
    estimators=[
        ('logreg', make_pipeline(StandardScaler(with_mean=False), LogisticRegression(max_iter=1000, random_state=42))),
         ('sgd', make_pipeline(StandardScaler(with_mean=False), SGDClassifier(loss='hinge', max_iter=5000, tol=1e-3, penalty='elasticnet', l1_ratio=0.5, random_state=42))),
        ('svc', SVC(kernel="linear", C=1.0, probability=True, max_iter=20000, random_state=42)),  # StandardScaler not required here
        ('lgbm', LGBMClassifier(n_estimators=100, learning_rate=0.1, num_leaves=15, max_depth=6, random_state=42))
    ],
    voting='soft', n_jobs=-1
)

# Combine all models
models = {**standard_models, 'Stacking': stacking_model, 'Voting': voting_model}

# Results list to store performance
results = []

# Apply Chi-Squared feature selection first, then dimensionality reduction
for feat_name, feature_select in {'ChiSquare_1000': SelectKBest(chi2, k=1000), 'ChiSquare_1500': SelectKBest(chi2, k=1500)}.items():
    print(f"Applying Chi-Squared feature selection: {feat_name}")
    X_chi2 = feature_select.fit_transform(X_tfidf, y)

    for dim_name, dim_reduction in dim_reduction_configs.items():
        X_reduced = dim_reduction.fit_transform(X_chi2)

        for model_name, model in models.items():
            # Perform cross-validation directly on model pipeline
            cv_score = cross_val_score(model, X_reduced, y, scoring=f1_scorer,
                                       cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)).mean()
            results.append({
                'Dimensionality Reduction': dim_name,
                'Feature Selection': feat_name,
                'Model': model_name,
                'Best C': None,
                'F1 Score': cv_score
            })
            print(f"Completed: DimReduction={dim_name}, FeatSelection={feat_name}, Model={model_name}, F1 Score={cv_score:.4f}")

# Save results to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('model_evaluation_results.csv', index=False)
print("All evaluations completed. Results saved to model_evaluation_results.csv")

Applying Chi-Squared feature selection: ChiSquare_1000
Completed: DimReduction=SVD_300, FeatSelection=ChiSquare_1000, Model=LogisticRegression, F1 Score=0.6966
Completed: DimReduction=SVD_300, FeatSelection=ChiSquare_1000, Model=SGDClassifier, F1 Score=0.6781
Completed: DimReduction=SVD_300, FeatSelection=ChiSquare_1000, Model=LinearSVC, F1 Score=0.6932
[LightGBM] [Info] Number of positive: 1470, number of negative: 4559
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Number of positive: 1470, number of negative: 4559
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.058803 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM



[LightGBM] [Info] Number of positive: 1472, number of negative: 4559
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.025792 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 6031, number of used features: 300
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.244072 -> initscore=-1.130481
[LightGBM] [Info] Start training from score -1.130481
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.050319 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067873 seconds.



Completed: DimReduction=SVD_1000, FeatSelection=ChiSquare_1000, Model=LinearSVC, F1 Score=0.7044




[LightGBM] [Info] Number of positive: 1838, number of negative: 5699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.094169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 7537, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243864 -> initscore=-1.131613
[LightGBM] [Info] Start training from score -1.131613




[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Number of positive: 1470, number of negative: 4559
[LightGBM] [Info] Number of positive: 1470, number of negative: 4559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.198531 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 6030, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243947 -> initscore=-1.131161
[LightGBM] [Info] Start training from score -1.131161
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.255791 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.207063 secon



[LightGBM] [Info] Number of positive: 1838, number of negative: 5699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.107389 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 7537, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243864 -> initscore=-1.131613
[LightGBM] [Info] Start training from score -1.131613




[LightGBM] [Info] Number of positive: 1470, number of negative: 4559
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.231315 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.254088 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Number of data points in the train set: 6029, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243822 -> initscore=-1.131841
[LightGBM] [Info] Start training from score -1.131841
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of positive: 1470, number of negative: 4559
[LightGBM] [Info] Number of data points in the train set: 6030, number of used features: 1000
[Light



[LightGBM] [Info] Number of positive: 1839, number of negative: 5699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.139598 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 7538, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243964 -> initscore=-1.131069
[LightGBM] [Info] Start training from score -1.131069




[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Number of positive: 1472, number of negative: 4559
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.233577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 6030, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243947 -> initscore=-1.131161
[LightGBM] [Info] Start training from score -1.131161
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.278238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.267340 secon



[LightGBM] [Info] Number of positive: 1839, number of negative: 5699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.112499 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 7538, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243964 -> initscore=-1.131069
[LightGBM] [Info] Start training from score -1.131069




[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.226004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 6030, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243947 -> initscore=-1.131161
[LightGBM] [Info] Start training from score -1.131161
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Number of positive: 1472, number of negative: 4559
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.265457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.245744 seconds.
You can set `force_col_wise=true



[LightGBM] [Info] Number of positive: 1838, number of negative: 5700
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.139552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 7538, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243831 -> initscore=-1.131788
[LightGBM] [Info] Start training from score -1.131788




[LightGBM] [Info] Number of positive: 1470, number of negative: 4560
[LightGBM] [Info] Number of positive: 1470, number of negative: 4560
[LightGBM] [Info] Number of positive: 1471, number of negative: 4560
[LightGBM] [Info] Number of positive: 1470, number of negative: 4560
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.235147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 6030, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243781 -> initscore=-1.132060
[LightGBM] [Info] Start training from score -1.132060
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.233849 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.251691 seconds.
You can set `force_col_wise=true



Completed: DimReduction=SVD_1000, FeatSelection=ChiSquare_1000, Model=Stacking, F1 Score=0.7372
Completed: DimReduction=SVD_1000, FeatSelection=ChiSquare_1000, Model=Voting, F1 Score=0.7185
Applying Chi-Squared feature selection: ChiSquare_1500
Completed: DimReduction=SVD_300, FeatSelection=ChiSquare_1500, Model=LogisticRegression, F1 Score=0.6925
Completed: DimReduction=SVD_300, FeatSelection=ChiSquare_1500, Model=SGDClassifier, F1 Score=0.6580
Completed: DimReduction=SVD_300, FeatSelection=ChiSquare_1500, Model=LinearSVC, F1 Score=0.6902
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Number of positive: 1470, number of negative: 4559
[LightGBM] [Info] Number of positive: 1471, number of negative: 4559
[LightGBM] [Info] Number of positive: 1470, number of negative: 4559
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030923 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is

Completed: DimReduction=SVD_1000, FeatSelection=ChiSquare_1500, Model=Stacking, F1 Score=0.7438
Completed: DimReduction=SVD_1000, FeatSelection=ChiSquare_1500, Model=Voting, F1 Score=0.7260

In [11]:
# Load the best model configuration
best_model_config = results_df.loc[results_df['F1 Score'].idxmax()]
best_dim_reduction = best_model_config['Dimensionality Reduction']
best_feat_selection = best_model_config['Feature Selection']
best_model_name = best_model_config['Model']

# Apply the best model configuration to the full training data
X_best = feature_select.transform(tfidf_transformer.transform(X))
X_best = dim_reduction_configs[best_dim_reduction].transform(X_best)
best_model = models[best_model_name]
best_model.fit(X_best, y)

# Make predictions on the test data
X_test = feature_select.transform(tfidf_transformer.transform(data_test))
X_test = dim_reduction_configs[best_dim_reduction].transform(X_test)
test_predictions = best_model.predict(X_test)

[LightGBM] [Info] Number of positive: 1838, number of negative: 5699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.260472 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Number of data points in the train set: 7537, number of used features: 1000
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.243864 -> initscore=-1.131613
[LightGBM] [Info] Start training from score -1.131613
[LightGBM] [Info] Number of positive: 1838, number of negative: 5699
[LightGBM] [Info] Number of positive: 1839, number of negative: 5699
[LightGBM] [Info] Number of positive: 1839, number of negative: 5699
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.253975 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.266386 secon



Test predictions saved to test_predictions.csv


In [13]:
# Prepare the data for saving
i = np.arange(len(test_predictions))
output = np.column_stack((i, test_predictions))

# Save predictions to a CSV file
header = 'Id,Label'
np.savetxt('test_predictions.csv', output, delimiter=',', header=header, comments='', fmt='%d')
print("\nPredictions saved to test_predictions.csv")


Predictions saved to test_predictions.csv
