In [1]:
# Constants
TRAIN_DATA_PATH = '../data/train_data.csv'
TRAIN_LABELS_PATH = '../data/train_labels.csv'
TEST_DATA_PATH = '../data/test_data.csv'
EXPORT_PATH = '../data/processed/1_preprocessed_data.pkl'

RANDOM_STATE = 42
N_FEATURES_SELECT = 2000

In [2]:
# Load packages
import pandas as pd
import numpy as np
import pickle
import logging
from pathlib import Path

from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

pd.options.display.max_rows = 10000

_________

## Functions

In [3]:
def preprocess_features(X_train, X_test, n_features_select=2000):
    """
    Preprocess features: variance threshold, scaling, feature selection
    
    Parameters:
    * X_train (np.array): Training features
    * X_test (np.array): Test features
    * n_features_select (int): Number of features to select
    
    Returns: tuple of (X_train_processed, X_test_processed, feature_selector, scaler)
    """
    
    # Variance threshold
    variance_selector = VarianceThreshold(threshold=0.01)
    X_train_var = variance_selector.fit_transform(X_train)
    X_test_var = variance_selector.transform(X_test)
    
    # Scaling
    scaler = RobustScaler()
    X_train_scaled = scaler.fit_transform(X_train_var)
    X_test_scaled = scaler.transform(X_test_var)
    
    # Feature selection
    n_features_to_select = min(n_features_select, X_train_scaled.shape[1])
    feature_selector = SelectKBest(score_func=f_classif, k=n_features_to_select)
    X_train_selected = feature_selector.fit_transform(X_train_scaled, y_train)
    X_test_selected = feature_selector.transform(X_test_scaled)
    
    return X_train_selected, X_test_selected, feature_selector, scaler, variance_selector

In [4]:
def handle_class_imbalance(X_train, y_train, random_state=42):
    """
    Handle class imbalance using SMOTE
    
    Parameters:
    * X_train (np.array): Training features
    * y_train (np.array): Training labels
    * random_state (int): Random state for reproducibility
    
    Returns: tuple of (X_train_balanced, y_train_balanced, smote)
    """
    
    smote = SMOTE(random_state=random_state, k_neighbors=3)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)
    
    return X_train_balanced, y_train_balanced, smote

_________

## Read data

In [5]:
# Read data
train_data = pd.read_csv(TRAIN_DATA_PATH, header=None)
train_labels = pd.read_csv(TRAIN_LABELS_PATH, header=None)
test_data = pd.read_csv(TEST_DATA_PATH, header=None)

In [6]:
# Prepare data
X_train = train_data.values
y_train = train_labels.values.ravel()
X_test = test_data.values

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (3750, 10000)
y_train shape: (3750,)
X_test shape: (1250, 10000)


_________

## Feature preprocessing

In [7]:
# Preprocess features
X_train_processed, X_test_processed, feature_selector, scaler, variance_selector = \
    preprocess_features(X_train, X_test, N_FEATURES_SELECT)

print(f"Original features: {X_train.shape[1]}")
print(f"After variance threshold: {X_train.shape[1] - variance_selector.get_support().sum()}")
print(f"After feature selection: {X_train_processed.shape[1]}")
print(f"Feature reduction: {((X_train.shape[1] - X_train_processed.shape[1]) / X_train.shape[1] * 100):.1f}%")

Original features: 10000
After variance threshold: 0
After feature selection: 2000
Feature reduction: 80.0%


_________

## Class imbalance handling

In [8]:
# Analyze class distribution
unique_labels = np.unique(y_train)
class_counts = np.array([np.sum(y_train == -1), np.sum(y_train == 1)])
imbalance_ratio = class_counts[1] / class_counts[0]

print(f"Class -1 count: {class_counts[0]}")
print(f"Class +1 count: {class_counts[1]}")
print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")
print(f"Minority class percentage: {class_counts[0] / len(y_train) * 100:.1f}%")

Class -1 count: 373
Class +1 count: 3377
Imbalance ratio: 9.05:1
Minority class percentage: 9.9%


In [9]:
# Apply SMOTE
X_train_balanced, y_train_balanced, smote = handle_class_imbalance(X_train_processed, y_train, RANDOM_STATE)

print(f"Original training set: {X_train_processed.shape[0]} samples")
print(f"SMOTE-balanced training set: {X_train_balanced.shape[0]} samples")
print(f"Class distribution after SMOTE:")
print(f"  Class -1: {np.sum(y_train_balanced == -1)} samples")
print(f"  Class +1: {np.sum(y_train_balanced == 1)} samples")

Original training set: 3750 samples
SMOTE-balanced training set: 6754 samples
Class distribution after SMOTE:
  Class -1: 3377 samples
  Class +1: 3377 samples


In [13]:
# Label encoding for XGBoost
xgb_label_encoder = LabelEncoder()
y_train_transformed = xgb_label_encoder.fit_transform(y_train)
y_train_balanced_transformed = xgb_label_encoder.transform(y_train_balanced)

print(f"Original labels: {unique_labels}")
print(f"Transformed labels: {xgb_label_encoder.classes_}")
print(f"Sample of transformed y_train: {y_train_transformed[:10]}")
print(f"Unique values in transformed: {np.unique(y_train_transformed)}")
print(f"Mapping: {dict(zip([int(x) for x in xgb_label_encoder.classes_], range(len(xgb_label_encoder.classes_))))}")

Original labels: [-1  1]
Transformed labels: [-1  1]
Sample of transformed y_train: [1 1 1 1 1 0 1 1 1 1]
Unique values in transformed: [0 1]
Mapping: {-1: 0, 1: 1}


In [25]:
# Class weights
class_weights = compute_class_weight('balanced', classes=unique_labels, y=y_train)
class_weight_dict = dict(zip(unique_labels, class_weights))
scale_pos_weight = class_counts[1] / class_counts[0]

class_weight_dict = {int(k): float(v) for k, v in class_weight_dict.items()}
print(f"Class weights: {class_weight_dict}")
print(f"XGBoost scale_pos_weight: {scale_pos_weight:.2f}")

Class weights: {-1: 5.02680965147453, 1: 0.5552265324252295}
XGBoost scale_pos_weight: 9.05


_________

## Save preprocessed data

In [27]:
# Create processed data dictionary
processed_data = {
    'X_train_original': X_train_processed,
    'y_train_original': y_train,
    'X_train_balanced': X_train_balanced,
    'y_train_balanced': y_train_balanced,
    'y_train_transformed': y_train_transformed,
    'y_train_balanced_transformed': y_train_balanced_transformed,
    'X_test': X_test_processed,
    'feature_selector': feature_selector,
    'scaler': scaler,
    'variance_selector': variance_selector,
    'xgb_label_encoder': xgb_label_encoder,
    'class_weight_dict': class_weight_dict,
    'scale_pos_weight': scale_pos_weight,
    'smote': smote
}

# Save to pickle
import os

# Ensure the processed data directory exists
export_dir = os.path.dirname(EXPORT_PATH)
if not os.path.exists(export_dir):
    os.makedirs(export_dir, exist_ok=True)

with open(EXPORT_PATH, 'wb') as f:
    pickle.dump(processed_data, f)

print(f"Preprocessed data saved to {EXPORT_PATH}")

Preprocessed data saved to ../data/processed/1_preprocessed_data.pkl
