https://www.kaggle.com/competitions/amex-default-prediction/data

In [18]:
import logging
import warnings
import sys
import json
import gc
import joblib  # Import joblib for model saving
from io import StringIO
from datetime import datetime
import shutil
import pickle

In [2]:
import os
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
# from imblearn.under_sampling import RandomUnderSampler

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
def calculate_correlation_with_target(features_df, target_series, variance_threshold=0.0):
    """
    Calculate the correlation of numeric columns in a features DataFrame with a target Series
    and perform Variance Threshold feature selection.
    
    Parameters:
    features_df (pd.DataFrame): The features DataFrame.
    target_series (pd.Series): The target Series.
    variance_threshold (float): Variance threshold for feature selection. Features with variance
        below this threshold will be removed. Default is 0.0 (no threshold).

    Returns:
    pd.Series: A Series containing the correlation coefficients sorted by absolute values.
    """
    # Select only numeric columns from the features DataFrame
    numeric_features = features_df.select_dtypes(include=['number'])
    
    # Calculate the correlation and sort the result by absolute values in descending order
    correlation_series = numeric_features.corrwith(target_series)
    absolute_correlation_series = correlation_series.abs()
    
    # Apply Variance Threshold to filter features
    if variance_threshold > 0.0:
        selector = VarianceThreshold(threshold=variance_threshold)
        numeric_features = selector.fit_transform(numeric_features)
        # Update correlation series to match the selected features
        correlation_series = pd.Series(selector.inverse_transform(correlation_series.values.reshape(1, -1))[0], index=numeric_features.columns)
    
    # Sort the DataFrame by absolute values
    correlation_series = correlation_series.sort_values(ascending=False)
    
    return correlation_series

In [5]:
# Define the directory path
data_dir = os.path.join('C:\\', 'Users', 'KonuTech', 'zoomcamp-capstone-01', 'data')
data_dir

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\data'

In [6]:
train_data_parquet_file = 'train_data_downsampled.parquet'

In [7]:
# Load the training data
train_data = pd.read_parquet(os.path.join(data_dir, train_data_parquet_file))

In [8]:
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2755738 entries, 541332 to 2614482
Data columns (total 191 columns):
 #    Column       Dtype  
---   ------       -----  
 0    customer_ID  object 
 1    S_2          object 
 2    P_2          float64
 3    D_39         float64
 4    B_1          float64
 5    B_2          float64
 6    R_1          float64
 7    S_3          float64
 8    D_41         float64
 9    B_3          float64
 10   D_42         float64
 11   D_43         float64
 12   D_44         float64
 13   B_4          float64
 14   D_45         float64
 15   B_5          float64
 16   R_2          float64
 17   D_46         float64
 18   D_47         float64
 19   D_48         float64
 20   D_49         float64
 21   B_6          float64
 22   B_7          float64
 23   B_8          float64
 24   D_50         float64
 25   D_51         float64
 26   B_9          float64
 27   R_3          float64
 28   D_52         float64
 29   P_3          float64
 30   B_10         flo

In [9]:
categorical_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
categorical_features

['B_30',
 'B_38',
 'D_114',
 'D_116',
 'D_117',
 'D_120',
 'D_126',
 'D_63',
 'D_64',
 'D_66',
 'D_68']

In [10]:
distinct_value_counts = {}  # A dictionary to store value counts for each feature

for feature in categorical_features:
    value_counts = train_data[feature].value_counts(dropna=False)  # Count the occurrences of each distinct value
    distinct_value_counts[feature] = value_counts

print(distinct_value_counts)

{'B_30': B_30
0.0    2140115
1.0     573964
2.0      40548
NaN       1111
Name: count, dtype: int64, 'B_38': B_38
2.0    730880
3.0    691938
1.0    489769
5.0    313851
4.0    235760
7.0    172285
6.0    120144
NaN      1111
Name: count, dtype: int64, 'D_114': D_114
1.0    1498634
0.0    1147059
NaN     110045
Name: count, dtype: int64, 'D_116': D_116
0.0    2640786
NaN     110045
1.0       4907
Name: count, dtype: int64, 'D_117': D_117
-1.0    748665
 3.0    597479
 4.0    510116
 2.0    373085
 5.0    193720
 6.0    145173
 NaN    110045
 1.0     77455
Name: count, dtype: int64, 'D_120': D_120
0.0    2241969
1.0     403724
NaN     110045
Name: count, dtype: int64, 'D_126': D_126
 1.0    2077553
 0.0     480052
-1.0     122996
 NaN      75137
Name: count, dtype: int64, 'D_63': D_63
CO    2090167
CR     414108
CL     230676
XZ      11811
XM       5215
XL       3761
Name: count, dtype: int64, 'D_64': D_64
O       1296527
U        854126
R        456102
None     131159
-1        17824
N

In [11]:
result = {}  # A dictionary to store the percentage of "1" in "target" for each feature

for feature in categorical_features:
    # Calculate the percentage of "1" in "target" for the current feature
    percentages = train_data.groupby(feature)['target'].mean() * 100  # Multiply by 100 to get percentages
    
    result[feature] = percentages

# Print or use the result dictionary as needed
print(result)

{'B_30': B_30
0.0    40.368999
1.0    83.560816
2.0    82.862287
Name: target, dtype: float64, 'B_38': B_38
1.0    32.038165
2.0    16.918236
3.0    59.615312
4.0    87.528843
5.0    79.294315
6.0    82.934645
7.0    75.003628
Name: target, dtype: float64, 'D_114': D_114
0.0    61.480273
1.0    39.722774
Name: target, dtype: float64, 'D_116': D_116
0.0    49.093338
1.0    82.820461
Name: target, dtype: float64, 'D_117': D_117
-1.0    53.148471
 1.0    70.906978
 2.0    60.907568
 3.0    52.626117
 4.0    38.772750
 5.0    32.027669
 6.0    31.818589
Name: target, dtype: float64, 'D_120': D_120
0.0    44.874528
1.0    72.931260
Name: target, dtype: float64, 'D_126': D_126
-1.0    44.446974
 0.0    57.535225
 1.0    47.764221
Name: target, dtype: float64, 'D_63': D_63
CL    55.283168
CO    51.800024
CR    38.153573
XL    58.787557
XM    48.053691
XZ    41.681483
Name: target, dtype: float64, 'D_64': D_64
-1    46.072711
O     38.030832
R     58.065740
U     61.218368
Name: target, dtype:

### Impute median

In [12]:
# Specify columns to exclude from imputation
exclude_columns = ['customer_ID', 'S_2', 'target', 'D_63', 'D_64']

In [13]:
# Create a SimpleImputer to impute with the median
imputer = SimpleImputer(strategy='median')

In [14]:
# Separate the DataFrame into the columns to impute and those to exclude
columns_to_impute = train_data.columns.difference(exclude_columns)
columns_to_impute

Index(['B_1', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17',
       'B_18',
       ...
       'S_24', 'S_25', 'S_26', 'S_27', 'S_3', 'S_5', 'S_6', 'S_7', 'S_8',
       'S_9'],
      dtype='object', length=186)

In [15]:
# Fit the imputer to the data and transform the specified columns
train_data[columns_to_impute] = imputer.fit_transform(train_data[columns_to_impute])

In [17]:
distinct_value_counts = {}  # A dictionary to store value counts for each feature

for feature in categorical_features:
    value_counts = train_data[feature].value_counts(dropna=False)  # Count the occurrences of each distinct value
    distinct_value_counts[feature] = value_counts

print(distinct_value_counts)

{'B_30': B_30
0.0    2141226
1.0     573964
2.0      40548
Name: count, dtype: int64, 'B_38': B_38
2.0    730880
3.0    693049
1.0    489769
5.0    313851
4.0    235760
7.0    172285
6.0    120144
Name: count, dtype: int64, 'D_114': D_114
1.0    1608679
0.0    1147059
Name: count, dtype: int64, 'D_116': D_116
0.0    2750831
1.0       4907
Name: count, dtype: int64, 'D_117': D_117
-1.0    748665
 3.0    707524
 4.0    510116
 2.0    373085
 5.0    193720
 6.0    145173
 1.0     77455
Name: count, dtype: int64, 'D_120': D_120
0.0    2352014
1.0     403724
Name: count, dtype: int64, 'D_126': D_126
 1.0    2152690
 0.0     480052
-1.0     122996
Name: count, dtype: int64, 'D_63': D_63
CO    2090167
CR     414108
CL     230676
XZ      11811
XM       5215
XL       3761
Name: count, dtype: int64, 'D_64': D_64
O       1296527
U        854126
R        456102
None     131159
-1        17824
Name: count, dtype: int64, 'D_66': D_66
1.0    2751995
0.0       3743
Name: count, dtype: int64, 'D_68': D

In [19]:
# Save the fitted DictVectorizer to a file using pickle
with open('imputer.pkl', 'wb') as file:
    pickle.dump(imputer, file)

In [20]:
# Convert the specified columns to strings
train_data[categorical_features] = train_data[categorical_features].astype(str)

In [21]:
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 2755738 entries, 541332 to 2614482
Data columns (total 191 columns):
 #    Column       Dtype  
---   ------       -----  
 0    customer_ID  object 
 1    S_2          object 
 2    P_2          float64
 3    D_39         float64
 4    B_1          float64
 5    B_2          float64
 6    R_1          float64
 7    S_3          float64
 8    D_41         float64
 9    B_3          float64
 10   D_42         float64
 11   D_43         float64
 12   D_44         float64
 13   B_4          float64
 14   D_45         float64
 15   B_5          float64
 16   R_2          float64
 17   D_46         float64
 18   D_47         float64
 19   D_48         float64
 20   D_49         float64
 21   B_6          float64
 22   B_7          float64
 23   B_8          float64
 24   D_50         float64
 25   D_51         float64
 26   B_9          float64
 27   R_3          float64
 28   D_52         float64
 29   P_3          float64
 30   B_10         flo

### splitting

In [22]:
# Split the dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_data.iloc[:, :-1], train_data["target"], test_size=0.2, random_state=42)

In [23]:
# Explicitly call the garbage collector to free up memory
gc.collect()

497

In [24]:
X_train.shape

(2204590, 190)

In [25]:
X_val.shape

(551148, 190)

In [26]:
y_train.shape

(2204590,)

In [27]:
y_val.shape

(551148,)

In [28]:
correlation_result = calculate_correlation_with_target(X_train, y_train)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [29]:
top_ten_correlations = list(correlation_result[:10].index)
top_ten_correlations

['D_48', 'D_55', 'B_9', 'D_58', 'D_75', 'B_7', 'B_23', 'B_16', 'D_44', 'B_3']

In [30]:
features = top_ten_correlations + categorical_features
features

['D_48',
 'D_55',
 'B_9',
 'D_58',
 'D_75',
 'B_7',
 'B_23',
 'B_16',
 'D_44',
 'B_3',
 'B_30',
 'B_38',
 'D_114',
 'D_116',
 'D_117',
 'D_120',
 'D_126',
 'D_63',
 'D_64',
 'D_66',
 'D_68']

### Fitting DictVectorizer

In [31]:
print(f"Step 1: Excluding 'customer_ID', 'S_2' and creating a DictVectorizer")

# Exclude 'customer_ID' column and create a DictVectorizer
dict_vectorizer = DictVectorizer(sparse=False)

X_train_dict = X_train[features].to_dict(orient='records')
X_val_dict = X_val[features].to_dict(orient='records')

X_train_encoded = dict_vectorizer.fit_transform(X_train_dict)
X_val_encoded = dict_vectorizer.transform(X_val_dict)

Step 1: Excluding 'customer_ID', 'S_2' and creating a DictVectorizer


In [32]:
len(dict_vectorizer.get_feature_names_out())

56

In [33]:
dict_vectorizer.get_feature_names_out()

array(['B_16', 'B_23', 'B_3', 'B_30=0.0', 'B_30=1.0', 'B_30=2.0',
       'B_38=1.0', 'B_38=2.0', 'B_38=3.0', 'B_38=4.0', 'B_38=5.0',
       'B_38=6.0', 'B_38=7.0', 'B_7', 'B_9', 'D_114=0.0', 'D_114=1.0',
       'D_116=0.0', 'D_116=1.0', 'D_117=-1.0', 'D_117=1.0', 'D_117=2.0',
       'D_117=3.0', 'D_117=4.0', 'D_117=5.0', 'D_117=6.0', 'D_120=0.0',
       'D_120=1.0', 'D_126=-1.0', 'D_126=0.0', 'D_126=1.0', 'D_44',
       'D_48', 'D_55', 'D_58', 'D_63=CL', 'D_63=CO', 'D_63=CR', 'D_63=XL',
       'D_63=XM', 'D_63=XZ', 'D_64=-1', 'D_64=None', 'D_64=O', 'D_64=R',
       'D_64=U', 'D_66=0.0', 'D_66=1.0', 'D_68=0.0', 'D_68=1.0',
       'D_68=2.0', 'D_68=3.0', 'D_68=4.0', 'D_68=5.0', 'D_68=6.0', 'D_75'],
      dtype=object)

In [34]:
X_train_encoded[0]

array([8.11689337e-03, 8.82114270e-03, 1.34561409e-02, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 8.47173325e-03, 4.46140447e-03, 0.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 5.29108439e-03,
       5.10770290e-02, 1.64400025e-01, 7.33613367e-04, 0.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.07962957e-03])

### Pickling fitted DictVectorizer()

In [35]:
# Save the fitted DictVectorizer to a file using pickle
with open('dict_vectorizer.pkl', 'wb') as file:
    pickle.dump(dict_vectorizer, file)

In [36]:
gc.collect()

90

### Training

In [37]:
classifiers = [
    ('LogisticRegression', LogisticRegression(n_jobs=-1), {
        # 'classifier__max_iter': [1000],
        'classifier__solver': ['saga'],
        'classifier__C': [0.1, 1.0],
        'classifier__penalty': ['l1', 'l2'],
    }),
    ('XGBoost', xgb.XGBClassifier(eval_metric='auc', colsample_bytree=0.8, n_jobs=-1), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [5, 10],
        'classifier__min_child_weight': [1, 2]  # Optimized parameter
    }),
    # ('RandomForest', RandomForestClassifier(n_jobs=-1), {
    #     'classifier__n_estimators': [200],  # [100, 200],
    #     'classifier__max_depth': [6],  # [None, 6],
    #     'classifier__min_samples_split': [5],  # [2, 5],
    #     'classifier__min_samples_leaf': [2]  # [1, 2]
    # })
]

In [None]:
import logging
import json
import gc
import joblib
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler  # Import the MinMaxScaler
from datetime import datetime

# Set the log file name and log level
log_file = 'training_log.log'
log_level = logging.INFO  # You can change the log level as needed (e.g., DEBUG, INFO, WARNING, ERROR)

# Generate a timestamp
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Configure logging
logging.basicConfig(filename=log_file, level=log_level, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

results = []

# Get the total number of logical processors
total_processors = os.cpu_count()

# Calculate the number of processors to use (80% of the available)
desired_processors = int(0.7 * total_processors)

for i, (name, classifier, params) in enumerate(classifiers, start=1):
    logging.info(f"Step {i}: Training {name} classifier")

    # Create an RFE model and a pipeline
    rfe = RFE(estimator=classifier, n_features_to_select=10)  # specify a number of features to be selected by RFE

    # Add a step to impute missing values with the median
    # imputer = SimpleImputer(strategy='median')  # You can choose a different strategy

    # Add Min-Max scaling to the pipeline
    scaler = MinMaxScaler()

    pipeline = Pipeline([
        # ('imputer', imputer),  # Add the imputation step
        ('scaler', scaler),  # Add the scaling step
        ('feature_selection', rfe),
        ('classifier', classifier)
    ])

    logging.info(f"Step {i + 1}: Performing hyperparameter tuning")

    # Perform hyperparameter tuning
    grid = GridSearchCV(pipeline, param_grid=params, cv=3, n_jobs=desired_processors, verbose=3)  # n_jobs=-1
    grid.fit(X_train_encoded, y_train)

    # Access the feature names used in the best_estimator_
    selected_feature_indices = grid.best_estimator_.named_steps['feature_selection'].get_support(indices=True)
    logging.info(f"Selected Indices of Features for {name}: {selected_feature_indices}")

    # Get selected features labels
    # feature_names = X_train_encoded.columns
    # Get the feature names from the DictVectorizer
    feature_names = dict_vectorizer.get_feature_names_out()

    # Get the selected feature names
    selected_features = [feature_names[i] for i in selected_feature_indices]

    logging.info(f"Selected Features for {name}: {selected_features}")

    # Log the output of grid.best_estimator_
    logging.info(f"Best Estimator for {name}: {grid.best_estimator_}")

    # Save the best model as a .bin file with the timestamp
    best_model = grid.best_estimator_
    model_filename = f"{name}_{timestamp}.bin"  # Include the timestamp in the filename
    joblib.dump(best_model, model_filename)

    logging.info(f"Step {i + 2}: Saving the best model for {name} as {model_filename}")

    # Explicit garbage collection
    gc.collect()

    logging.info(f"Step {i + 3}: Evaluating the best model on the validation set using Gini coefficient")

    # Evaluate the best model on the validation set using Gini coefficient
    y_prob = grid.predict_proba(X_val_encoded)  # Use predict_proba to get probabilities
    gini = 2 * roc_auc_score(y_val, y_prob[:, 1]) - 1  # Calculate the Gini coefficient

    results.append({'name': name, 'gini': gini, 'best_params': grid.best_params_})

# Create the JSON filename with the timestamp
json_file_name = f'grid_search_results_{timestamp}.json'

# Save the results to the JSON file
with open(json_file_name, 'w') as file:
    json.dump(results, file, indent=4)

# Print the logs to the terminal
log_formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
log_handler = logging.StreamHandler()
log_handler.setFormatter(log_formatter)
root_logger = logging.getLogger()
root_logger.addHandler(log_handler)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [None]:
results

In [None]:
# Get the current working directory
current_dir = os.getcwd()
current_dir

In [None]:
destination_dir = os.path.join('C:\\', 'Users', 'KonuTech', 'zoomcamp-capstone-01', 'models')
destination_dir

In [None]:
# Specify the file extension to filter (e.g., ".bin")
file_extension = ".bin"

In [None]:
# List files in the current working directory
source_files = os.listdir(current_dir)
source_files

In [None]:
# Move files with the specified extension
for file in source_files:
    if file.endswith(file_extension):
        # Construct the source and destination paths
        source_path = os.path.join(current_dir, file)
        
        # Check if the file exists before moving
        if os.path.exists(source_path):
            destination_path = os.path.join(destination_dir, file)
            shutil.move(source_path, destination_path)
        else:
            print(f"File not found: {file}")

In [None]:
# Verify the move operation
destination_files = os.listdir(destination_dir)
print(f'Moved files with extension {file_extension} to destination directory: {destination_files}')

In [None]:
os.listdir(destination_dir)