https://www.kaggle.com/competitions/amex-default-prediction/data

In [1]:
import logging
import warnings
import sys
import json
import gc
import joblib  # Import joblib for model saving
from io import StringIO

In [2]:
import os
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
# from imblearn.under_sampling import RandomUnderSampler

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
def calculate_correlation_with_target(features_df, target_series, variance_threshold=0.0):
    """
    Calculate the correlation of numeric columns in a features DataFrame with a target Series
    and perform Variance Threshold feature selection.
    
    Parameters:
    features_df (pd.DataFrame): The features DataFrame.
    target_series (pd.Series): The target Series.
    variance_threshold (float): Variance threshold for feature selection. Features with variance
        below this threshold will be removed. Default is 0.0 (no threshold).

    Returns:
    pd.Series: A Series containing the correlation coefficients sorted by absolute values.
    """
    # Select only numeric columns from the features DataFrame
    numeric_features = features_df.select_dtypes(include=['number'])
    
    # Calculate the correlation and sort the result by absolute values in descending order
    correlation_series = numeric_features.corrwith(target_series)
    absolute_correlation_series = correlation_series.abs()
    
    # Apply Variance Threshold to filter features
    if variance_threshold > 0.0:
        selector = VarianceThreshold(threshold=variance_threshold)
        numeric_features = selector.fit_transform(numeric_features)
        # Update correlation series to match the selected features
        correlation_series = pd.Series(selector.inverse_transform(correlation_series.values.reshape(1, -1))[0], index=numeric_features.columns)
    
    # Sort the DataFrame by absolute values
    correlation_series = correlation_series.sort_values(ascending=False)
    
    return correlation_series

In [5]:
# Define the directory path
data_dir = os.path.join('C:\\', 'Users', 'KonuTech', 'zoomcamp-capstone-01', 'data')
data_dir

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\data'

In [6]:
train_data_parquet_file = 'train_data_downsampled.parquet'

In [7]:
# Load the training data
train_data = pd.read_parquet(os.path.join(data_dir, train_data_parquet_file))

In [8]:
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2755738 entries, 541332 to 2614482
Data columns (total 191 columns):
 #    Column       Dtype  
---   ------       -----  
 0    customer_ID  object 
 1    S_2          object 
 2    P_2          float64
 3    D_39         float64
 4    B_1          float64
 5    B_2          float64
 6    R_1          float64
 7    S_3          float64
 8    D_41         float64
 9    B_3          float64
 10   D_42         float64
 11   D_43         float64
 12   D_44         float64
 13   B_4          float64
 14   D_45         float64
 15   B_5          float64
 16   R_2          float64
 17   D_46         float64
 18   D_47         float64
 19   D_48         float64
 20   D_49         float64
 21   B_6          float64
 22   B_7          float64
 23   B_8          float64
 24   D_50         float64
 25   D_51         float64
 26   B_9          float64
 27   R_3          float64
 28   D_52         float64
 29   P_3          float64
 30   B_10         flo

### downsampling

In [9]:
# Assuming 'target' is your target column name
target_column = 'target'

# Assuming you have a DataFrame 'train_data' with the dataset

# Separate the dataset into two DataFrames based on the target value
zeros = train_data[train_data[target_column] == 0]
ones = train_data[train_data[target_column] == 1]

# Determine the size of the minority class (the one with fewer samples)
minority_class_size = min(len(zeros), len(ones))

# Sample an equal number of samples from each class
zeros_downsampled = zeros.sample(minority_class_size, random_state=42)
ones_downsampled = ones.sample(minority_class_size, random_state=42)

# Combine the downsampled DataFrames
downsampled_data = pd.concat([zeros_downsampled, ones_downsampled])

# Shuffle the downsampled dataset (optional but recommended)
downsampled_data = downsampled_data.sample(frac=1, random_state=42)

# Your downsampled dataset is in the 'downsampled_data' DataFrame

In [10]:
dfs = ["train_data", "zeros", "ones", "zeros_downsampled", "ones_downsampled"]

In [11]:
for el in dfs:
    if el in locals():
        del el

### splitting

In [12]:
# Split the dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(downsampled_data.iloc[:, :-1], downsampled_data["target"], test_size=0.2, random_state=42)

In [13]:
dfs = ["downsampled_data"]

In [14]:
for el in dfs:
    if el in locals():
        del el

In [15]:
# Explicitly call the garbage collector to free up memory
gc.collect()

108

In [16]:
X_train.shape

(2204590, 190)

In [17]:
X_val.shape

(551148, 190)

In [18]:
y_train.shape

(2204590,)

In [19]:
y_val.shape

(551148,)

In [20]:
correlation_result = calculate_correlation_with_target(X_train, y_train)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [21]:
top_ten_correlations = list(correlation_result[:10].index)
top_ten_correlations

['D_48', 'D_55', 'B_9', 'D_58', 'D_75', 'D_44', 'B_7', 'B_23', 'B_16', 'B_3']

In [22]:
print(f"Step 1: Excluding 'customer_ID', 'S_2' and creating a DictVectorizer")

# Exclude 'customer_ID' column and create a DictVectorizer
dict_vectorizer = DictVectorizer(sparse=False)

X_train_dict = X_train[top_ten_correlations].to_dict(orient='records')
X_val_dict = X_val[top_ten_correlations].to_dict(orient='records')

X_train_encoded = dict_vectorizer.fit_transform(X_train_dict)
X_val_encoded = dict_vectorizer.transform(X_val_dict)

Step 1: Excluding 'customer_ID', 'S_2' and creating a DictVectorizer


In [23]:
len(dict_vectorizer.get_feature_names_out())

10

In [24]:
dict_vectorizer.get_feature_names_out()

array(['B_16', 'B_23', 'B_3', 'B_7', 'B_9', 'D_44', 'D_48', 'D_55',
       'D_58', 'D_75'], dtype=object)

In [25]:
X_train_encoded[0]

array([0.83530331, 0.14915705, 0.22641162, 0.18482181, 0.05730877,
       0.00539621, 0.08654039, 0.08061728, 0.43390864, 0.33806032])

In [26]:
gc.collect()

72

In [27]:
classifiers = [
    ('LogisticRegression', LogisticRegression(n_jobs=-1), {
        'classifier__C': [0.1, 1.0],
        'classifier__penalty': ['l1', 'l2'],
    }),
    ('RandomForest', RandomForestClassifier(n_jobs=-1), {
        'classifier__n_estimators': [200],  # [100, 200],
        'classifier__max_depth': [6],  # [None, 6],
        'classifier__min_samples_split': [5],  # [2, 5], Optimized parameter
        'classifier__min_samples_leaf': [2]  # [1, 2] Optimized parameter
    }),
    ('XGBoost', xgb.XGBClassifier(n_jobs=-1), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5],
        'classifier__min_child_weight': [1, 2]  # Optimized parameter
    })
]

In [28]:


# Set the log file name and log level
log_file = 'training_log.log'
log_level = logging.INFO  # You can change the log level as needed (e.g., DEBUG, INFO, WARNING, ERROR)

# Configure logging
logging.basicConfig(filename=log_file, level=log_level, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Create a StringIO object to capture stdout
stdout_capture = StringIO()
sys.stdout = stdout_capture

results = []

for i, (name, classifier, params) in enumerate(classifiers, start=1):
    logging.info(f"Step {i + 1}: Training {name} classifier")

    # Create an RFE model and a pipeline
    rfe = RFE(estimator=classifier, n_features_to_select=7)  # select a number of features to be left

    # Add a step to impute missing values with the median
    imputer = SimpleImputer(strategy='median')  # You can choose a different strategy

    pipeline = Pipeline([
        ('imputer', imputer),  # Add the imputation step
        ('feature_selection', rfe),
        ('classifier', classifier)
    ])

    logging.info(f"Step {i + 2}: Performing hyperparameter tuning")

    # Perform hyperparameter tuning
    grid = GridSearchCV(pipeline, param_grid=params, cv=3, n_jobs=-1, verbose=3)
    grid.fit(X_train_encoded, y_train)

    # Log captured output (info) to the log file
    captured_output = stdout_capture.getvalue()
    for line in captured_output.splitlines():
        logging.info(line)

    # Clear the captured output
    stdout_capture.truncate(0)
    stdout_capture.seek(0)

    # Save the best model as a .bin file
    best_model = grid.best_estimator_
    model_filename = f"{name}_best_model.bin"
    joblib.dump(best_model, model_filename)

    logging.info(f"Step {i + 3}: Saving the best model for {name} as {model_filename}")

    # Explicit garbage collection
    gc.collect()

    logging.info(f"Step {i + 4}: Evaluating the best model on the validation set using F1 score")

    # Evaluate the best model on the validation set using F1 score
    y_pred = grid.predict(X_val_encoded)
    f1 = f1_score(y_val, y_pred)

    results.append({'name': name, 'f1': f1, 'best_params': grid.best_params_})

    # Explicit garbage collection
    gc.collect()

# Save the results to a JSON file
with open('grid_search_results.json', 'w') as file:
    json.dump(results, file, indent=4)

# Restore sys.stdout to its original value
sys.stdout = sys.__stdout__

# Print the logs to the terminal
log_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
log_handler = logging.StreamHandler()
log_handler.setFormatter(log_formatter)
root_logger = logging.getLogger()
root_logger.addHandler(log_handler)

6 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\users\konutech\zoomcamp-capstone-01\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\users\konutech\zoomcamp-capstone-01\venv\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\users\konutech\zoomcamp-capstone-01\venv\lib\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\users\konutech\zoomcamp-capstone-01\venv\lib\site-packages

KeyboardInterrupt: 