In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.feature_selection import SelectFromModel
import os

In [None]:
# Load the data
train_data = pd.read_csv('Train-set.csv')
test_data = pd.read_csv('Test-set.csv')

# Separate the 'Target' column from the train data
y_train = train_data['Target']
train_data.drop('Target', axis=1, inplace=True)

# Combine train and test data for preprocessing
all_data = pd.concat([train_data, test_data], axis=0)


In [None]:
# Feature Engineering: Extract day of the week and create a weekend indicator
try:
    all_data['day'] = pd.to_datetime(all_data['day'])
    all_data['day_of_week'] = all_data['day'].dt.dayofweek
    all_data['is_weekend'] = all_data['day_of_week'].isin([5, 6]).astype(int)
    all_data.drop('day', axis=1, inplace=True)
except (ValueError, OverflowError, pd._libs.tslibs.np_datetime.OutOfBoundsDatetime):
    # Handle errors due to invalid date formats
    all_data['day'] = pd.to_datetime(all_data['day'], errors='coerce')
    all_data['day_of_week'] = all_data['day'].dt.dayofweek
    all_data['is_weekend'] = all_data['day_of_week'].isin([5, 6]).astype(int)
    all_data.drop('day', axis=1, inplace=True)

In [None]:
# Separate numeric and categorical columns
numeric_cols = all_data.select_dtypes(include=[np.number]).columns
categorical_cols = all_data.select_dtypes(include=[object]).columns

# Create transformers for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocess the data using the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

X_all_preprocessed = preprocessor.fit_transform(all_data)


In [None]:
# Handle class imbalance using BorderlineSMOTE
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_all_preprocessed[:train_data.shape[0]], y_train)



In [None]:
# Create and train optimized models
optimized_rf_model = RandomForestClassifier(n_estimators=150, max_depth=9, random_state=42)
optimized_gb_model = GradientBoostingClassifier(n_estimators=160, learning_rate=0.05, max_depth=7, random_state=42)
optimized_lgbm_model = LGBMClassifier(n_estimators=180, learning_rate=0.1, max_depth=5, random_state=42)


In [None]:
# Feature selection using SelectFromModel
feature_selector = SelectFromModel(optimized_rf_model)
X_train_resampled_selected = feature_selector.fit_transform(X_train_resampled, y_train_resampled)


In [None]:
# Get selected feature indices
selected_feature_indices = feature_selector.get_support(indices=True)


In [None]:
# Get selected feature names from the preprocessed data
selected_feature_names = [all_data.columns[i] for i in selected_feature_indices]

# Print the selected feature names
print("Selected Feature Names:")
print(selected_feature_names)

In [None]:
# Apply feature selection to test data as well
X_test_selected = X_all_preprocessed[train_data.shape[0]:, selected_feature_indices]


In [None]:
# Train optimized models on selected features
optimized_rf_model.fit(X_train_resampled_selected, y_train_resampled)
optimized_gb_model.fit(X_train_resampled_selected, y_train_resampled)
optimized_lgbm_model.fit(X_train_resampled_selected, y_train_resampled)


In [None]:
# Get predictions using optimized models on selected features
test_predictions_rf = optimized_rf_model.predict_proba(X_test_selected)[:, 1]
test_predictions_gb = optimized_gb_model.predict_proba(X_test_selected)[:, 1]
test_predictions_lgbm = optimized_lgbm_model.predict_proba(X_test_selected)[:, 1]


In [None]:
# Combine the predictions using weighted averaging
ensemble_predictions = (0.4 * test_predictions_rf) + (0.4 * test_predictions_gb) + (0.2 * test_predictions_lgbm)
threshold = 0.5
binary_predictions = (ensemble_predictions >= threshold).astype(int)

In [None]:
# Get the 'id' values from the test_data DataFrame
submission_ids = test_data['id']

# Create binary predictions based on a threshold (e.g., 0.5)
threshold = 0.5
binary_predictions = (ensemble_predictions >= threshold).astype(int)

# Create the submission DataFrame with 'id' and binary 'Target' values
submission_df = pd.DataFrame({'id': submission_ids, 'Target': binary_predictions})

# Save the submission file to CSV
submission_df.to_csv('submission_binary_updated.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.feature_selection import SelectFromModel

# Load the data
train_data = pd.read_csv('Train-set.csv')
test_data = pd.read_csv('Test-set.csv')

# Separate the 'Target' column from the train data
y_train = train_data['Target']
train_data.drop('Target', axis=1, inplace=True)

# Combine train and test data for preprocessing
all_data = pd.concat([train_data, test_data], axis=0)

# Separate numeric and categorical columns
numeric_cols = all_data.select_dtypes(include=[np.number]).columns
categorical_cols = all_data.select_dtypes(include=[object]).columns

# Create transformers for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocess the data using the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

X_all_preprocessed = preprocessor.fit_transform(all_data)

# Handle class imbalance using BorderlineSMOTE
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_all_preprocessed[:train_data.shape[0]], y_train)

# Initialize and train a feature selection model
selector = SelectFromModel(RandomForestClassifier(n_estimators=150, max_depth=9, random_state=42))
selector.fit(X_train_resampled, y_train_resampled)

# Get selected feature indices
selected_feature_indices = selector.get_support(indices=True)

# Get selected feature names
selected_numeric_feature_names = np.array(numeric_cols)[selected_feature_indices[selected_feature_indices < len(numeric_cols)]]

# Get the OneHotEncoder used for categorical columns
categorical_encoder = preprocessor.named_transformers_['cat']['encoder']

# Get selected categorical feature names
selected_categorical_feature_names = categorical_encoder.get_feature_names(input_features=categorical_cols)
selected_categorical_feature_names = [name for i, name in enumerate(selected_categorical_feature_names) if i + len(numeric_cols) in selected_feature_indices]

# Concatenate selected feature names
selected_feature_names = np.concatenate((selected_numeric_feature_names, selected_categorical_feature_names))

# Print the selected feature names
print("Selected Feature Names:")
print(selected_feature_names)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.feature_selection import SelectFromModel

# Load the data
train_data = pd.read_csv('Train-set.csv')
test_data = pd.read_csv('Test-set.csv')

# Separate the 'Target' column from the train data
y_train = train_data['Target']
train_data.drop('Target', axis=1, inplace=True)

# Combine train and test data for preprocessing
all_data = pd.concat([train_data, test_data], axis=0)

# Separate numeric and categorical columns
numeric_cols = all_data.select_dtypes(include=[np.number]).columns
categorical_cols = all_data.select_dtypes(include=[object]).columns

# Create transformers for numeric and categorical columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocess the data using the column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_cols),
    ('cat', categorical_transformer, categorical_cols)
])

X_all_preprocessed = preprocessor.fit_transform(all_data)

# Handle class imbalance using BorderlineSMOTE
smote = BorderlineSMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_all_preprocessed[:train_data.shape[0]], y_train)

# Initialize and train a feature selection model
selector = SelectFromModel(RandomForestClassifier(n_estimators=150, max_depth=9, random_state=42))
selector.fit(X_train_resampled, y_train_resampled)

# Get selected feature indices
selected_feature_indices = selector.get_support(indices=True)

# Get selected feature names
selected_numeric_feature_names = np.array(numeric_cols)[selected_feature_indices[selected_feature_indices < len(numeric_cols)]]

# Retrieve the OneHotEncoder used for categorical columns
categorical_encoder = preprocessor.named_transformers_['cat']['encoder']




In [3]:
# Retrieve the OneHotEncoder used for categorical columns
categorical_encoder = preprocessor.named_transformers_['cat']['encoder']

In [4]:
# Transform and get the column names after one-hot encoding
onehot_columns = categorical_encoder.get_feature_names_out(input_features=categorical_cols)


In [5]:
# Get selected categorical feature names
selected_categorical_feature_names = []
for i, col_idx in enumerate(selected_feature_indices):
    if col_idx >= len(numeric_cols):
        selected_categorical_feature_names.append(onehot_columns[col_idx - len(numeric_cols)])

In [6]:
# Concatenate selected feature names
selected_feature_names = np.concatenate((selected_numeric_feature_names, selected_categorical_feature_names))


In [7]:
# Print the selected feature names
print("Selected Feature Names:")
print(selected_feature_names)

Selected Feature Names:
['Unnamed: 0' 'age' 'duration' 'campaign' 'pdays' 'previous'
 'job_blue-collar' 'marital_single' 'default_no' 'housing_no'
 'housing_yes' 'loan_no' 'loan_yes' 'contact_cellular' 'contact_telephone'
 'contact_unknown' 'day_may' 'month_may' 'poutcome_success']
