In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

def load_data():
    """Load the competition data"""
    train_df = pd.read_csv(r"fda_trainingset.csv")
    test_df = pd.read_csv(r"fda_testset.csv")
    return train_df, test_df

def enhanced_feature_engineering(df, is_train=True, numeric_cols=None, top_features=None):
    """Add advanced feature engineering"""
    if is_train:
        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
        if 'ID' in numeric_cols:
            numeric_cols.remove('ID')
        if 'Y' in numeric_cols:
            numeric_cols.remove('Y')

    # Interaction features
    for i, col1 in enumerate(numeric_cols[:10]):
        for col2 in numeric_cols[i+1:i+5]:
            df[f'{col1}_x_{col2}'] = df[col1] * df[col2]
            df[f'{col1}_div_{col2}'] = df[col1] / (df[col2] + 1e-6)

    # Statistical features
    df['row_sum'] = df[numeric_cols].sum(axis=1)
    df['row_mean'] = df[numeric_cols].mean(axis=1)
    df['row_std'] = df[numeric_cols].std(axis=1)

    # Polynomial features for top 5 numeric columns
    if len(numeric_cols) >= 5:
        poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
        imputer = SimpleImputer(strategy='median')
        df[numeric_cols[:5]] = imputer.fit_transform(df[numeric_cols[:5]])
        poly_features = poly.fit_transform(df[numeric_cols[:5]])
        poly_feature_names = poly.get_feature_names_out(numeric_cols[:5])
        poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df.index)
        df = pd.concat([df, poly_df], axis=1)

    # Feature importance-based interactions (if top_features provided)
    if top_features and len(top_features) >= 2:
        for i, col1 in enumerate(top_features[:2]):
            for col2 in top_features[i+1:i+3]:
                df[f'imp_{col1}_x_{col2}'] = df[col1] * df[col2]

    return df, numeric_cols

def preprocess_data(train_df, test_df):
    """Preprocess data including encoding and feature engineering"""
    X = train_df.drop(columns=['ID', 'Y'])
    y = train_df['Y']
    test_ids = test_df['ID']
    X_test = test_df.drop(columns=['ID'])

    # Convert to float32 to save memory
    X = X.astype(np.float32)
    X_test = X_test.astype(np.float32)

    # Encode categorical variables
    label_encoders = {}
    for column in X.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column].astype(str))
        X_test[column] = le.transform(X_test[column].astype(str))
        label_encoders[column] = le

    # Initial feature engineering to get feature importance
    temp_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), XGBClassifier(random_state=42))
    temp_pipeline.fit(X, y)
    feature_importance = pd.Series(temp_pipeline.named_steps['xgbclassifier'].feature_importances_, index=X.columns)
    top_features = feature_importance.nlargest(5).index.tolist()

    # Full feature engineering
    X, numeric_cols = enhanced_feature_engineering(X, is_train=True, top_features=top_features)
    X_test, _ = enhanced_feature_engineering(X_test, is_train=False, numeric_cols=numeric_cols, top_features=top_features)

    return X, y, X_test, test_ids, top_features

def build_ensemble_model():
    """Create a simplified ensemble of XGBoost and LightGBM"""
    xgb = XGBClassifier(
        n_estimators=350,
        learning_rate=0.02,
        max_depth=4,
        subsample=0.85,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
        scale_pos_weight=1.2  # Adjusted for class imbalance
    )
    lgbm = LGBMClassifier(
        n_estimators=350,
        learning_rate=0.02,
        max_depth=4,
        subsample=0.85,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1,
        scale_pos_weight=1.2
    )
    return VotingClassifier(
        estimators=[('xgb', xgb), ('lgbm', lgbm)],
        voting='soft',
        weights=[0.6, 0.4]  # Favor XGBoost slightly based on performance
    )

def optimize_threshold(y_true, y_prob):
    """Optimize classification threshold for maximum accuracy"""
    thresholds = np.arange(0.1, 0.9, 0.01)
    best_threshold = 0.5
    best_score = 0
    for threshold in thresholds:
        y_pred = (y_prob >= threshold).astype(int)
        score = accuracy_score(y_true, y_pred)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold

def main():
    # Load data
    train_df, test_df = load_data()

    # Preprocess data
    X, y, X_test, test_ids, top_features = preprocess_data(train_df, test_df)

    # Feature selection with RFE
    base_model = XGBClassifier(random_state=42)
    rfe = RFE(estimator=base_model, n_features_to_select=35)
    X = pd.DataFrame(rfe.fit_transform(X, y), columns=X.columns[rfe.support_], index=X.index)
    X_test = pd.DataFrame(rfe.transform(X_test), columns=X_test.columns[rfe.support_], index=X_test.index)

    # Split data for validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train ensemble model
    print("Training ensemble model...")
    model = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler(),
        build_ensemble_model()
    )
    model.fit(X_train, y_train)

    # Validate model
    val_probs = model.predict_proba(X_val)[:, 1]
    threshold = optimize_threshold(y_val, val_probs)
    val_preds = (val_probs >= threshold).astype(int)
    val_score = accuracy_score(y_val, val_preds)
    print(f"Validation Accuracy: {val_score:.6f}")

    # Cross-validation with StratifiedKFold
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    print(f"Cross-Validation Accuracy: {cv_scores.mean():.6f} (+/- {cv_scores.std() * 2:.6f})")

    # Generate predictions
    test_probs = model.predict_proba(X_test)[:, 1]
    test_preds = (test_probs >= threshold).astype(int)

    # Create submission
    submission = pd.DataFrame({'ID': test_ids, 'Y': test_probs})
    submission.to_csv('submission_fda_improved fda_31.csv', index=False)
    print("\nSubmission file 'submission_fda_improved fda_31.csv' has been created.")
    print("Submission head:")
    print(submission.head())

if __name__ == "__main__":
    main()

Training ensemble model...
Validation Accuracy: 0.997750
Cross-Validation Accuracy: 0.997470 (+/- 0.000159)

Submission file 'submission_fda_improved fda_31.csv' has been created.
Submission head:
       ID         Y
0  200001  0.000505
1  200002  0.001326
2  200003  0.000381
3  200004  0.000387
4  200005  0.000789


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

def load_data():
    """Load the competition data"""
    train_df = pd.read_csv(r"fda_trainingset.csv")
    test_df = pd.read_csv(r"fda_testset.csv")
    return train_df, test_df

 


First Loading the dataset and importing necessary libraries

In [None]:
def enhanced_feature_engineering(df, is_train=True, numeric_cols=None, top_features=None):
    """Add advanced feature engineering"""
    if is_train:
        numeric_cols = df.select_dtypes(include=np.number).columns.tolist()
        if 'ID' in numeric_cols:
            numeric_cols.remove('ID')
        if 'Y' in numeric_cols:
            numeric_cols.remove('Y')

    # Interaction features
    for i, col1 in enumerate(numeric_cols[:10]):
        for col2 in numeric_cols[i+1:i+5]:
            df[f'{col1}_x_{col2}'] = df[col1] * df[col2]
            df[f'{col1}_div_{col2}'] = df[col1] / (df[col2] + 1e-6)

    # Statistical features
    df['row_sum'] = df[numeric_cols].sum(axis=1)
    df['row_mean'] = df[numeric_cols].mean(axis=1)
    df['row_std'] = df[numeric_cols].std(axis=1)

    # Polynomial features for top 5 numeric columns
    if len(numeric_cols) >= 5:
        poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
        imputer = SimpleImputer(strategy='median')
        df[numeric_cols[:5]] = imputer.fit_transform(df[numeric_cols[:5]])
        poly_features = poly.fit_transform(df[numeric_cols[:5]])
        poly_feature_names = poly.get_feature_names_out(numeric_cols[:5])
        poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df.index)
        df = pd.concat([df, poly_df], axis=1)

    # Feature importance-based interactions (if top_features provided)
    if top_features and len(top_features) >= 2:
        for i, col1 in enumerate(top_features[:2]):
            for col2 in top_features[i+1:i+3]:
                df[f'imp_{col1}_x_{col2}'] = df[col1] * df[col2]

    return df, numeric_cols



Feature engineering:
. For the top 10 numeric columns, creates pairwise interaction features (multiplication and division) with the next 4 columns.
. For the top 5 numeric columns, imputes missing values (median strategy) and creates polynomial features (degree 2, including squares and interactions) using PolynomialFeatures.
. If top_features (from XGBoost importance) is provided, creates interaction features (multiplications) for the top 2 features paired with the next 2.

In [None]:
<!-- def preprocess_data(train_df, test_df):
    """Preprocess data including encoding and feature engineering"""
    X = train_df.drop(columns=['ID', 'Y'])
    y = train_df['Y']
    test_ids = test_df['ID']
    X_test = test_df.drop(columns=['ID'])

    # Convert to float32 to save memory
    X = X.astype(np.float32)
    X_test = X_test.astype(np.float32)

    # Encode categorical variables
    label_encoders = {}
    for column in X.select_dtypes(include=['object']).columns:
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column].astype(str))
        X_test[column] = le.transform(X_test[column].astype(str))
        label_encoders[column] = le

    # Initial feature engineering to get feature importance
    temp_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), XGBClassifier(random_state=42))
    temp_pipeline.fit(X, y)
    feature_importance = pd.Series(temp_pipeline.named_steps['xgbclassifier'].feature_importances_, index=X.columns)
    top_features = feature_importance.nlargest(5).index.tolist()

    # Full feature engineering
    X, numeric_cols = enhanced_feature_engineering(X, is_train=True, top_features=top_features)
    X_test, _ = enhanced_feature_engineering(X_test, is_train=False, numeric_cols=numeric_cols, top_features=top_features)

    return X, y, X_test, test_ids, top_features
 -->



Pre processing:
. Seperate fetures and target drops ID and Y from train_df to create X (features) and extracts y (target).
. Converts X and X_test to float32 to reduce memory usage.
. Encode Categorical Variables identifies categorical columns (type object) and applies LabelEncoder to convert them to numeric values.
. Uses a temporary pipeline (imputation, scaling, XGBoost) to fit the training data and extract feature importances and identifies the top 5 features (top_features) for use in feature engineering.
. Applies enhanced_feature_engineering to both X (training) and X_test (test), passing top_features for additional interactions.

In [None]:
<!-- def build_ensemble_model():
    """Create a simplified ensemble of XGBoost and LightGBM"""
    xgb = XGBClassifier(
        n_estimators=350,
        learning_rate=0.02,
        max_depth=4,
        subsample=0.85,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='logloss',
        scale_pos_weight=1.2  # Adjusted for class imbalance
    )
    lgbm = LGBMClassifier(
        n_estimators=350,
        learning_rate=0.02,
        max_depth=4,
        subsample=0.85,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1,
        scale_pos_weight=1.2
    )
    return VotingClassifier(
        estimators=[('xgb', xgb), ('lgbm', lgbm)],
        voting='soft',
        weights=[0.6, 0.4]  # Favor XGBoost slightly based on performance
    ) -->



. Builds an ensemble model combining XGBoost and LightGBM using a soft voting strategy.
. Parameters:

XGBoost:
n_estimators=350: Number of boosting rounds
learning_rate=0.02: Step size for updates.
max_depth=4: Maximum tree depth to control overfitting.
subsample=0.85: Fraction of samples used per boosting round.
colsample_bytree=0.8: Fraction of features used per tree.
random_state=42: For reproducibility.
eval_metric='logloss': Evaluation metric for optimization.
scale_pos_weight=1.2: Adjusts for class imbalance (likely a slight imbalance in the dataset).

LightGBM Model (lgbm):
LGBMClassifier with similar parameters to XGBoost, plus:
verbose=-1: Suppresses LightGBM logs.

VotingClassifier:
Combines xgb and lgbm using voting='soft' (averages predicted probabilities).
weights=[0.6, 0.4]: Gives more weight to XGBoost based on expected performance.

In [None]:
<!-- def optimize_threshold(y_true, y_prob):
    """Optimize classification threshold for maximum accuracy"""
    thresholds = np.arange(0.1, 0.9, 0.01)
    best_threshold = 0.5
    best_score = 0
    for threshold in thresholds:
        y_pred = (y_prob >= threshold).astype(int)
        score = accuracy_score(y_true, y_pred)
        if score > best_score:
            best_score = score
            best_threshold = threshold
    return best_threshold -->


 Optimizes the classification threshold to maximize accuracy.
. Tests thresholds from 0.1 to 0.9 (step size 0.01).
. For each threshold, converts probabilities (y_prob) to binary predictions (y_pred) and computes accuracy against true labels (y_true).
. Keeps the threshold with the highest accuracy.
. Returns the best threshold.

In [None]:
<!-- def main():
    # Load data
    train_df, test_df = load_data()

    # Preprocess data
    X, y, X_test, test_ids, top_features = preprocess_data(train_df, test_df)

    # Feature selection with RFE
    base_model = XGBClassifier(random_state=42)
    rfe = RFE(estimator=base_model, n_features_to_select=35)
    X = pd.DataFrame(rfe.fit_transform(X, y), columns=X.columns[rfe.support_], index=X.index)
    X_test = pd.DataFrame(rfe.transform(X_test), columns=X_test.columns[rfe.support_], index=X_test.index)

    # Split data for validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train ensemble model
    print("Training ensemble model...")
    model = make_pipeline(
        SimpleImputer(strategy='median'),
        StandardScaler(),
        build_ensemble_model()
    )
    model.fit(X_train, y_train)

    # Validate model
    val_probs = model.predict_proba(X_val)[:, 1]
    threshold = optimize_threshold(y_val, val_probs)
    val_preds = (val_probs >= threshold).astype(int)
    val_score = accuracy_score(y_val, val_preds)
    print(f"Validation Accuracy: {val_score:.6f}")

    # Cross-validation with StratifiedKFold
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
    print(f"Cross-Validation Accuracy: {cv_scores.mean():.6f} (+/- {cv_scores.std() * 2:.6f})")

    # Generate predictions
    test_probs = model.predict_proba(X_test)[:, 1]
    test_preds = (test_probs >= threshold).astype(int)

    # Create submission
    submission = pd.DataFrame({'ID': test_ids, 'Y': test_probs})
    submission.to_csv('submission_fda_improved fda_31.csv', index=False)
    print("\nSubmission file 'submission_fda_improved fda_31.csv' has been created.")
    print("Submission head:")
    print(submission.head()) -->


The main function orchestrates the entire pipeline from data loading to submission file creation.
. load the training and test datasets.
. Preprocess Data to encode, optimize memory, and engineer features for both training and test sets.
. Uses RFE with an XGBoost base model to select the top 35 features.
. Updates X and X_test to include only the selected features.
. Splits the training data into training (X_train, y_train) and validation (X_val, y_val) sets (80-20 split, random_state=42).
. Creates a pipeline with imputation (median), scaling, and the ensemble model from build_ensemble_model() and fits the model on the training data.
. Optimizes the threshold using optimize_threshold() and converts probabilities to binary predictions.
. Performs 5-fold cross-validation with StratifiedKFold to ensure balanced folds and reports the mean cross-validation accuracy (0.997470) 
. Predicts probabilities on the test set and applies the optimized threshold to get binary predictions.
. Creates submission

In [None]:
<!-- if __name__ == "__main__":
    main() -->



. Calls the main() function to execute the entire pipeline.