In [1]:
# Banking Dataset Classification

# Import necessary libraries
import pandas as pd  # Importing pandas library for data manipulation and analysis
import numpy as np  # Importing numpy library for numerical operations
from sklearn.model_selection import train_test_split, RandomizedSearchCV  # Importing functions for data splitting and hyperparameter tuning
from sklearn.preprocessing import RobustScaler, OneHotEncoder  # Importing preprocessing tools for scaling and one-hot encoding
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier  # Importing ensemble classifiers
from lightgbm import LGBMClassifier  # Importing LightGBM classifier
from sklearn.impute import SimpleImputer  # Importing SimpleImputer for handling missing data
from sklearn.compose import ColumnTransformer  # Importing ColumnTransformer for column-wise transformations
from sklearn.pipeline import Pipeline  # Importing Pipeline for creating a processing pipeline
from sklearn.metrics import accuracy_score  # Importing accuracy_score for model evaluation
from imblearn.over_sampling import BorderlineSMOTE  # Importing BorderlineSMOTE for handling class imbalance
import os  # Importing operating system utilities for file handling

# Load the training and test data
train_data = pd.read_csv('Train-set.csv')  # Load the training data from a CSV file
test_data = pd.read_csv('Test-set.csv')    # Load the test data from a CSV file

# Separate the 'Target' column from the train data
y_train = train_data['Target']  # Storing the target labels in 'y_train'
train_data.drop('Target', axis=1, inplace=True)  # Removing the target column from the training data

# Combine train and test data for preprocessing
all_data = pd.concat([train_data, test_data], axis=0)  # Combining train and test data for combined preprocessing

# Feature Engineering: Extract day of the week and create a weekend indicator
try:
    # Convert 'day' column to datetime
    all_data['day'] = pd.to_datetime(all_data['day'])
    # Extract day of the week (0-6) and create 'day_of_week' feature
    all_data['day_of_week'] = all_data['day'].dt.dayofweek
    # Create binary indicator for the weekend (Saturday and Sunday)
    all_data['is_weekend'] = all_data['day_of_week'].isin([5, 6]).astype(int)
    # Drop the original 'day' column
    all_data.drop('day', axis=1, inplace=True)
except (ValueError, OverflowError, pd._libs.tslibs.np_datetime.OutOfBoundsDatetime):
    # Handle errors due to invalid date formats
    all_data['day'] = pd.to_datetime(all_data['day'], errors='coerce')
    all_data['day_of_week'] = all_data['day'].dt.dayofweek
    all_data['is_weekend'] = all_data['day_of_week'].isin([5, 6]).astype(int)
    all_data.drop('day', axis=1, inplace=True)

# Separate numeric and categorical columns
numeric_cols = all_data.select_dtypes(include=[np.number]).columns  # Identifying numeric columns
categorical_cols = all_data.select_dtypes(include=[object]).columns  # Identifying categorical columns

# Create transformers for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ('scaler', RobustScaler())  # Scale features using robust scaling
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Impute missing values with the most frequent value
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Encode categorical variables using one-hot encoding
])

# Preprocess the data using the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),  # Apply numeric transformer to numeric columns
    ('cat', categorical_transformer, categorical_cols)  # Apply categorical transformer to categorical columns
])

X_all_preprocessed = preprocessor.fit_transform(all_data)  # Apply preprocessing to all data

# Handle class imbalance using BorderlineSMOTE
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)  # Initialize BorderlineSMOTE for oversampling
X_train_resampled, y_train_resampled = smote.fit_resample(X_all_preprocessed[:train_data.shape[0]], y_train)
# Apply BorderlineSMOTE to balance classes in the training data

# Create and train optimized models
optimized_rf_model = RandomForestClassifier(n_estimators=150, max_depth=9, random_state=42)
# Initialize RandomForestClassifier with optimized hyperparameters
optimized_gb_model = GradientBoostingClassifier(n_estimators=160, learning_rate=0.05, max_depth=7, random_state=42)
# Initialize GradientBoostingClassifier with optimized hyperparameters
optimized_lgbm_model = LGBMClassifier(n_estimators=180, learning_rate=0.1, max_depth=5, random_state=42)
# Initialize LGBMClassifier with optimized hyperparameters

optimized_rf_model.fit(X_train_resampled, y_train_resampled)  # Train RandomForestClassifier on resampled data
optimized_gb_model.fit(X_train_resampled, y_train_resampled)  # Train GradientBoostingClassifier on resampled data
optimized_lgbm_model.fit(X_train_resampled, y_train_resampled)  # Train LGBMClassifier on resampled data

# Get predictions using optimized models
test_predictions_rf = optimized_rf_model.predict_proba(X_all_preprocessed[train_data.shape[0]:])[:, 1]
# Predict probabilities for class 1 using RandomForestClassifier
test_predictions_gb = optimized_gb_model.predict_proba(X_all_preprocessed[train_data.shape[0]:])[:, 1]
# Predict probabilities for class 1 using GradientBoostingClassifier
test_predictions_lgbm = optimized_lgbm_model.predict_proba(X_all_preprocessed[train_data.shape[0]:])[:, 1]
# Predict probabilities for class 1 using LGBMClassifier

# Combine the predictions using weighted averaging
ensemble_predictions = (0.4 * test_predictions_rf) + (0.4 * test_predictions_gb) + (0.2 * test_predictions_lgbm)
# Weighted average of predictions from all three models
threshold = 0.5  # Set the threshold for converting probabilities to binary predictions
binary_predictions = (ensemble_predictions >= threshold).astype(int)  # Convert probabilities to binary predictions

# Get the 'id' values from the test_data DataFrame
submission_ids = test_data['id']

# Create binary predictions based on a threshold (e.g., 0.5)
threshold = 0.5
binary_predictions = (ensemble_predictions >= threshold).astype(int)

# Create the submission DataFrame with 'id' and binary 'Target' values
submission_df = pd.DataFrame({'id': submission_ids, 'Target': binary_predictions})

# Save the submission file to CSV
submission_df.to_csv('submission_binary.csv', index=False)


  all_data['day'] = pd.to_datetime(all_data['day'])
  all_data['day'] = pd.to_datetime(all_data['day'], errors='coerce')


[LightGBM] [Info] Number of positive: 48433, number of negative: 48433
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16946
[LightGBM] [Info] Number of data points in the train set: 96866, number of used features: 69
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [6]:
# Define the column names for the preprocessed data
numeric_cols = all_data.select_dtypes(include=[np.number]).columns
categorical_cols = all_data.select_dtypes(include=[object]).columns
encoded_cols = preprocessor.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out(categorical_cols)
all_columns = list(numeric_cols) + list(encoded_cols)

# Convert the preprocessed data array to a DataFrame with proper column names
preprocessed_data = pd.DataFrame(X_all_preprocessed, columns=all_columns)

# Save the preprocessed and feature-engineered data to a CSV file
preprocessed_data.to_csv('preprocessed_data.csv', index=False)

ValueError: Shape of passed values is (78161, 1), indices imply (78161, 71)