https://www.kaggle.com/competitions/amex-default-prediction/data

In [1]:
import logging
import warnings
import sys
import json
import gc
import joblib  # Import joblib for model saving
from io import StringIO
from datetime import datetime

In [2]:
import os
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
# from imblearn.under_sampling import RandomUnderSampler

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
def calculate_correlation_with_target(features_df, target_series, variance_threshold=0.0):
    """
    Calculate the correlation of numeric columns in a features DataFrame with a target Series
    and perform Variance Threshold feature selection.
    
    Parameters:
    features_df (pd.DataFrame): The features DataFrame.
    target_series (pd.Series): The target Series.
    variance_threshold (float): Variance threshold for feature selection. Features with variance
        below this threshold will be removed. Default is 0.0 (no threshold).

    Returns:
    pd.Series: A Series containing the correlation coefficients sorted by absolute values.
    """
    # Select only numeric columns from the features DataFrame
    numeric_features = features_df.select_dtypes(include=['number'])
    
    # Calculate the correlation and sort the result by absolute values in descending order
    correlation_series = numeric_features.corrwith(target_series)
    absolute_correlation_series = correlation_series.abs()
    
    # Apply Variance Threshold to filter features
    if variance_threshold > 0.0:
        selector = VarianceThreshold(threshold=variance_threshold)
        numeric_features = selector.fit_transform(numeric_features)
        # Update correlation series to match the selected features
        correlation_series = pd.Series(selector.inverse_transform(correlation_series.values.reshape(1, -1))[0], index=numeric_features.columns)
    
    # Sort the DataFrame by absolute values
    correlation_series = correlation_series.sort_values(ascending=False)
    
    return correlation_series

In [5]:
# Define the directory path
data_dir = os.path.join('C:\\', 'Users', 'KonuTech', 'zoomcamp-capstone-01', 'data')
data_dir

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\data'

In [6]:
train_data_parquet_file = 'train_data_downsampled.parquet'

In [7]:
# Load the training data
train_data = pd.read_parquet(os.path.join(data_dir, train_data_parquet_file))

In [8]:
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2755738 entries, 541332 to 2614482
Data columns (total 191 columns):
 #    Column       Dtype  
---   ------       -----  
 0    customer_ID  object 
 1    S_2          object 
 2    P_2          float64
 3    D_39         float64
 4    B_1          float64
 5    B_2          float64
 6    R_1          float64
 7    S_3          float64
 8    D_41         float64
 9    B_3          float64
 10   D_42         float64
 11   D_43         float64
 12   D_44         float64
 13   B_4          float64
 14   D_45         float64
 15   B_5          float64
 16   R_2          float64
 17   D_46         float64
 18   D_47         float64
 19   D_48         float64
 20   D_49         float64
 21   B_6          float64
 22   B_7          float64
 23   B_8          float64
 24   D_50         float64
 25   D_51         float64
 26   B_9          float64
 27   R_3          float64
 28   D_52         float64
 29   P_3          float64
 30   B_10         flo

### splitting

In [9]:
# Split the dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_data.iloc[:, :-1], train_data["target"], test_size=0.2, random_state=42)

In [10]:
# Explicitly call the garbage collector to free up memory
gc.collect()

36

In [11]:
X_train.shape

(2204590, 190)

In [12]:
X_val.shape

(551148, 190)

In [13]:
y_train.shape

(2204590,)

In [14]:
y_val.shape

(551148,)

In [15]:
correlation_result = calculate_correlation_with_target(X_train, y_train)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [16]:
top_ten_correlations = list(correlation_result[:15].index)
top_ten_correlations

['D_48',
 'D_55',
 'B_9',
 'D_58',
 'D_75',
 'D_44',
 'B_7',
 'B_23',
 'B_16',
 'B_3',
 'D_74',
 'B_38',
 'B_20',
 'B_4',
 'B_19']

In [17]:
print(f"Step 1: Excluding 'customer_ID', 'S_2' and creating a DictVectorizer")

# Exclude 'customer_ID' column and create a DictVectorizer
dict_vectorizer = DictVectorizer(sparse=False)

X_train_dict = X_train[top_ten_correlations].to_dict(orient='records')
X_val_dict = X_val[top_ten_correlations].to_dict(orient='records')

X_train_encoded = dict_vectorizer.fit_transform(X_train_dict)
X_val_encoded = dict_vectorizer.transform(X_val_dict)

Step 1: Excluding 'customer_ID', 'S_2' and creating a DictVectorizer


In [18]:
len(dict_vectorizer.get_feature_names_out())

15

In [19]:
dict_vectorizer.get_feature_names_out()

array(['B_16', 'B_19', 'B_20', 'B_23', 'B_3', 'B_38', 'B_4', 'B_7', 'B_9',
       'D_44', 'D_48', 'D_55', 'D_58', 'D_74', 'D_75'], dtype=object)

In [20]:
X_train_encoded[0]

array([8.11689337e-03, 4.33485879e-03, 8.42621567e-04, 8.82114270e-03,
       1.34561409e-02, 1.00000000e+00, 1.01981802e-03, 8.47173325e-03,
       4.46140447e-03, 5.29108439e-03, 5.10770290e-02, 1.64400025e-01,
       7.33613367e-04, 2.76507176e-04, 2.07962957e-03])

In [21]:
gc.collect()

18

In [22]:
classifiers = [
    ('LogisticRegression', LogisticRegression(n_jobs=-1), {
        'classifier__C': [0.1, 1.0],
        'classifier__penalty': ['l1', 'l2'],
    }),
    # ('RandomForest', RandomForestClassifier(n_jobs=-1), {
    #     'classifier__n_estimators': [200],  # [100, 200],
    #     'classifier__max_depth': [6],  # [None, 6],
    #     'classifier__min_samples_split': [5],  # [2, 5], Optimized parameter
    #     'classifier__min_samples_leaf': [2]  # [1, 2] Optimized parameter
    # }),
    ('XGBoost', xgb.XGBClassifier(n_jobs=-1), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5],
        'classifier__min_child_weight': [1, 2]  # Optimized parameter
    })
]

In [24]:
import logging
import json
import gc
import joblib
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from datetime import datetime

# Set the log file name and log level
log_file = 'training_log.log'
log_level = logging.INFO  # You can change the log level as needed (e.g., DEBUG, INFO, WARNING, ERROR)

# Generate a timestamp
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Configure logging
logging.basicConfig(filename=log_file, level=log_level, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

results = []

for i, (name, classifier, params) in enumerate(classifiers, start=1):
    logging.info(f"Step {i}: Training {name} classifier")

    # Create an RFE model and a pipeline
    rfe = RFE(estimator=classifier, n_features_to_select=10)  # select a number of features to be left

    # Add a step to impute missing values with the median
    imputer = SimpleImputer(strategy='median')  # You can choose a different strategy

    pipeline = Pipeline([
        ('imputer', imputer),  # Add the imputation step
        ('feature_selection', rfe),
        ('classifier', classifier)
    ])

    logging.info(f"Step {i + 1}: Performing hyperparameter tuning")

    # Perform hyperparameter tuning
    grid = GridSearchCV(pipeline, param_grid=params, cv=3, n_jobs=-1, verbose=3)
    grid.fit(X_train_encoded, y_train)

    # Access the feature names used in the best_estimator_
    selected_feature_indices = grid.best_estimator_.named_steps['feature_selection'].get_support(indices=True)
    logging.info(f"Selected Indices of Features for {name}: {selected_feature_indices}")

    # Get selected features labels
    # feature_names = X_train_encoded.columns
    # Get the feature names from the DictVectorizer
    feature_names = dict_vectorizer.get_feature_names_out()

    # Get the selected feature names
    selected_features = [feature_names[i] for i in selected_feature_indices]

    logging.info(f"Selected Features for {name}: {selected_features}")

    # Log the output of grid.best_estimator_
    logging.info(f"Best Estimator for {name}: {grid.best_estimator_}")

    # Save the best model as a .bin file with the timestamp
    best_model = grid.best_estimator_
    model_filename = f"{name}_{timestamp}.bin"  # Include the timestamp in the filename
    joblib.dump(best_model, model_filename)

    logging.info(f"Step {i + 2}: Saving the best model for {name} as {model_filename}")

    # Explicit garbage collection
    gc.collect()

    logging.info(f"Step {i + 3}: Evaluating the best model on the validation set using F1 score")

    # Evaluate the best model on the validation set using F1 score
    y_pred = grid.predict(X_val_encoded)
    f1 = f1_score(y_val, y_pred)

    results.append({'name': name, 'f1': f1, 'best_params': grid.best_params_})

# Create the JSON filename with the timestamp
json_file_name = f'grid_search_results_{timestamp}.json'

# Save the results to the JSON file
with open(json_file_name, 'w') as file:
    json.dump(results, file, indent=4)

# Print the logs to the terminal
log_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
log_handler = logging.StreamHandler()
log_handler.setFormatter(log_formatter)
root_logger = logging.getLogger()
root_logger.addHandler(log_handler)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


6 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\konutech\zoomcamp-capstone-01\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\konutech\zoomcamp-capstone-01\venv\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\users\konutech\zoomcamp-capstone-01\venv\lib\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\users\konutech\zoomcamp-capstone-01\venv\lib\site-packages

Fitting 3 folds for each of 8 candidates, totalling 24 fits
