In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px

# Read the three input tables in one pass: training features, training
# labels and test features. Paths are relative to the working directory.
train_features, train_labels, test_features = (
    pd.read_csv(csv_path)
    for csv_path in ("./train_NN.csv", "./train_labels_NN.csv", "test_NN.csv")
)

In [None]:
# Candidate LightGBM hyperparameter values, keyed by LGBMClassifier
# argument name. Each value is the list of settings to try for that argument.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` > 0, and the 'rf' boosting type needs bagging or feature
# sub-sampling enabled -- confirm before feeding this grid to a search.
param_grid = dict(
    boosting_type=['gbdt', 'dart', 'goss', 'rf'],  # boosting algorithm
    num_leaves=[31, 50, 100],                      # leaf cap per tree
    max_depth=[-1, 10, 20, 30],                    # tree depth limit
    learning_rate=[0.01, 0.05, 0.1, 0.2],          # shrinkage applied per round
    n_estimators=[50, 100, 200],                   # boosting rounds
    subsample_for_bin=[200000, 300000],            # rows sampled to build histogram bins
    colsample_bytree=[0.8, 1.0],                   # feature fraction per tree
    reg_alpha=[0.0, 0.1, 0.5],                     # L1 penalty
    reg_lambda=[0.0, 0.1, 0.5],                    # L2 penalty
    min_split_gain=[0.0, 0.1, 0.5],                # minimum gain required to split
    min_child_samples=[10, 20, 30],                # minimum rows per leaf
    subsample=[0.8, 1.0],                          # row fraction per round
)

In [None]:
# starting hyperparameters
# boosting_type= 'gbdt',        # Boosting type: 'gbdt' (Gradient Boosting Decision Tree)
# num_leaves= 31,               # Maximum number of leaves in one tree
# max_depth= -1,                # Maximum depth of tree nodes (set to -1 for unlimited depth)
# learning_rate= 0.1,           # Step size shrinkage to prevent overfitting
# n_estimators= 100,            # Number of boosting rounds (trees to build)
# subsample_for_bin= 2000,      # Number of samples for constructing bins
# objective= 'binary',          # Objective function: 'binary' for binary classification
# metric= 'binary_logloss',     # Evaluation metric: 'binary_logloss' for binary classification
# colsample_bytree= 1.0,        # Fraction of features to be used for each boosting round
# reg_alpha= 0.0,               # L1 regularization term on weights
# reg_lambda= 0.0,              # L2 regularization term on weights
# min_split_gain= 0.0,          # Minimum loss reduction required to make a further partition
# min_child_samples= 20,        # Minimum number of data needed in a child (leaf)
# subsample= 1.0,               # Fraction of samples used for training (set to 1.0 for no subsampling)
# random_state= 42

In [140]:
from lightgbm import LGBMClassifier

# LightGBM binary classifier with hand-picked hyperparameters. The same
# (unfitted) `lgbm` object is reused as a base learner by the ensemble cells.
lgbm = LGBMClassifier(
    boosting_type= 'gbdt',        # Boosting type: 'gbdt' (Gradient Boosting Decision Tree)
    num_leaves= 80,               # Maximum number of leaves in one tree
    max_depth= 50,                # Maximum depth of tree nodes (set to -1 for unlimited depth)
    learning_rate= 0.1,           # Step size shrinkage to prevent overfitting
    n_estimators= 700,            # Number of boosting rounds (trees to build)
    subsample_for_bin= 2000,      # Number of samples for constructing bins
    objective= 'binary',          # Objective function: 'binary' for binary classification
    metric= 'binary_logloss',     # Evaluation metric: 'binary_logloss' for binary classification
    colsample_bytree= 0.8,        # Fraction of features to be used for each boosting round
    reg_alpha= 0.3,               # L1 regularization term on weights
    reg_lambda= 0.3,              # L2 regularization term on weights
    min_split_gain= 0.0,          # Minimum loss reduction required to make a further partition
    min_child_samples= 20,        # Minimum number of data needed in a child (leaf)
    subsample= 1.0,               # Fraction of samples used for training (set to 1.0 for no subsampling)
    random_state= 42              # Seed for random number generation
)

# train_labels is a single-column DataFrame; scikit-learn style estimators
# expect a 1-D target and otherwise emit a DataConversionWarning
# ("y = column_or_1d(y, warn=True)"), so flatten it explicitly.
lgbm_model = lgbm.fit(train_features, train_labels.to_numpy().ravel())

# Predicted class labels for the test set, as a one-column frame of ints.
lgbm_pred = pd.DataFrame(lgbm_model.predict(test_features))
lgbm_pred[0] = lgbm_pred[0].astype(int)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 88304, number of negative: 88304
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5893
[LightGBM] [Info] Number of data points in the train set: 176608, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [145]:
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier, VotingClassifier, RandomForestClassifier, BaggingClassifier

# XGBoost base learner; hyperparameters are collected in one mapping so they
# can be read (and tweaked) in a single place.
xgb_params = dict(
    n_estimators=800,
    max_depth=13,
    learning_rate=0.08,
    gamma=0.5,
    reg_lambda=10,
    min_child_weight=7,
    scale_pos_weight=1,
    random_state=42,
)
xgboost = XGBClassifier(**xgb_params)

# Level-0 models whose predictions become the inputs of the meta-learner.
base_learners = [
    ('xgboost', xgboost),
    ('lgbm', lgbm),
]

# Level-1 model trained on the base learners' predictions.
meta_learner = RandomForestClassifier(n_estimators=100, random_state=42)

# Stacked ensemble: XGBoost + LightGBM feeding a random forest.
stacking_classifier = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    # 'auto' lets scikit-learn choose which prediction method of each base
    # learner is used to build the meta-learner's inputs.
    stack_method='auto',
)

In [146]:
# Train the stacking classifier.
# train_labels is a single-column DataFrame; flatten it to the 1-D target
# scikit-learn expects so the fit no longer emits DataConversionWarning.
stacking_classifier.fit(train_features, train_labels.to_numpy().ravel())

# Predict class labels for the test set (returns a NumPy array).
y_pred_stacking = stacking_classifier.predict(test_features)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


[LightGBM] [Info] Number of positive: 88304, number of negative: 88304
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.038534 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5893
[LightGBM] [Info] Number of data points in the train set: 176608, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 70643, number of negative: 70643
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008278 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5930
[LightGBM] [Info] Number of data points in the train set: 141286, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 70643, number of negative: 70643
[LightGBM] [Info] Auto-choosing col-wise multi-th

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [149]:
# Wrap the stacked predictions in a DataFrame of integer labels and show
# the first rows.
y_pred_stacking = pd.DataFrame(y_pred_stacking).astype(int)
y_pred_stacking.head()

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0


In [153]:
from sklearn.ensemble import BaggingClassifier

# Bagging over a soft-voting ensemble of the XGBoost and LightGBM models.
bagging_classifier = BaggingClassifier(
    # Passed positionally on purpose: this first parameter is called
    # `base_estimator` in scikit-learn < 1.2 and `estimator` from 1.2 on
    # (the old keyword was removed in 1.4), so a keyword would tie the
    # notebook to one side of that rename.
    VotingClassifier(estimators=[
        ('xgboost', xgboost),
        ('lgbm', lgbm),
        # ('catboost', catboost),
    ], voting='soft'),
    # Number of bagged copies of the voting ensemble; every copy refits both
    # boosted models. NOTE(review): the recorded run of the fit cell ends in
    # KeyboardInterrupt -- consider a much smaller value.
    n_estimators=800,
    random_state=42
)

In [154]:
# Train the bagging classifier.
# train_labels is a single-column DataFrame; flatten it to the 1-D target
# scikit-learn expects so the fit no longer emits DataConversionWarning.
bagging_classifier.fit(train_features, train_labels.to_numpy().ravel())

# Predict class labels for the test set.
y_pred_bagging = bagging_classifier.predict(test_features)

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Number of positive: 88304, number of negative: 88304
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.016808 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5907
[LightGBM] [Info] Number of data points in the train set: 176608, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501755 -> initscore=0.007021
[LightGBM] [Info] Start training from score 0.007021
[LightGBM] [Info] Number of positive: 88304, number of negative: 88304
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037604 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5898
[LightGBM] [Info] Number of data points in the train set: 176608, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498805 -> initscore=-0.004779
[LightGBM] [Info] Start training from score -0.004779
[LightGBM] [

KeyboardInterrupt: 

In [150]:
# Build the submission table: start from the sample submission file and set
# its "Task" column to the bagging predictions, then preview the result.
fin_sub = pd.read_csv("./sample_submissions.csv").assign(Task=y_pred_bagging)
fin_sub.head()

Unnamed: 0,ID,Task
0,100721,1
1,30234,0
2,28624,0
3,31173,0
4,573,0


In [151]:
# Persist the submission table as UTF-8 CSV without the index column.
# NOTE(review): the file is named after LightGBM, but the "Task" column is
# filled from y_pred_bagging -- confirm the intended file name.
fin_sub.to_csv("./lgbm.csv", index=False, encoding="utf-8")

In [None]:
# Split the labelled training frame into the target ("Task") and the features.
train_labels1 = training_data['Task']
train_features1 = training_data.drop(columns=['Task'])

# Drop the "ID" column from the test frame so only feature columns remain.
test_features1 = test_data.drop(columns=['ID'])

# Positional indices of the object-dtype (categorical) columns. These must be
# computed BEFORE the string cast below, which turns every column into object.
categorical_columns = train_features1.select_dtypes(include=['object']).columns
categorical_indices = [train_features1.columns.get_loc(col) for col in categorical_columns]
print(categorical_indices)

# Cast train and test features to strings the same way, so the model sees
# identically typed data at fit and predict time. The original assigned the
# cast test frame to a misspelled name (`test_features11`), which left
# `test_features1` -- the frame actually used for prediction -- uncast.
train_features1 = train_features1.astype(str)
test_features1 = test_features1.astype(str)
test_features11 = test_features1  # kept so any code using the old misspelled name still works
train_labels1 = train_labels1.astype(int)

In [None]:
from catboost import CatBoostClassifier

# CatBoost binary classifier trained on the string-cast feature frame.
# `bagging_temperature` was removed: the CatBoost docs describe it as a
# Bayesian-bootstrap option, while `subsample` is only accepted for the
# non-Bayesian bootstrap types (Poisson, Bernoulli, MVS), so the two cannot
# be passed together. `subsample` is kept because `bootstrap_type='Bayesian'`
# is not enabled here.
catboost_model = CatBoostClassifier(
    iterations=500,
    depth=10,
    learning_rate=0.1,
    loss_function='Logloss',
    random_seed=42,
    verbose=100,
    # Add more parameters here
    # border_count=254,           # The number of splits for numerical features
    l2_leaf_reg=3,              # L2 regularization coefficient for leaf values
    random_strength=1.0,        # The randomness intensity for scoring splits
    boosting_type='Ordered',    # 'Ordered' for ordered boosting, or 'Plain' for plain boosting
    # bootstrap_type='Bayesian',  # e.g. 'Bayesian' or 'MVS' (Minimal Variance Sampling)
    leaf_estimation_method='Newton',  # 'Newton' or 'Gradient' for leaf value estimation
    subsample=1,              # Fraction of the training data used for training each tree
    max_bin=255,                # The maximum number of bins for numeric features
    cat_features=categorical_indices      # Indices of categorical features (if applicable)
)

In [None]:
# Fit CatBoost on the prepared training features and integer labels.
catboost_model.fit(X=train_features1, y=train_labels1)

# Predict labels for the prepared test features.
cat_pred = catboost_model.predict(data=test_features1)