https://www.kaggle.com/competitions/amex-default-prediction/data

In [1]:
import gc

In [2]:
import os
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import VarianceThreshold, RFE
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
# from imblearn.under_sampling import RandomUnderSampler

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
def calculate_correlation_with_target(features_df, target_series, variance_threshold=0.0):
    """
    Calculate the correlation of numeric columns in a features DataFrame with a target Series
    and perform Variance Threshold feature selection.
    
    Parameters:
    features_df (pd.DataFrame): The features DataFrame.
    target_series (pd.Series): The target Series.
    variance_threshold (float): Variance threshold for feature selection. Features with variance
        below this threshold will be removed. Default is 0.0 (no threshold).

    Returns:
    pd.Series: A Series containing the correlation coefficients sorted by absolute values.
    """
    # Select only numeric columns from the features DataFrame
    numeric_features = features_df.select_dtypes(include=['number'])
    
    # Calculate the correlation and sort the result by absolute values in descending order
    correlation_series = numeric_features.corrwith(target_series)
    absolute_correlation_series = correlation_series.abs()
    
    # Apply Variance Threshold to filter features
    if variance_threshold > 0.0:
        selector = VarianceThreshold(threshold=variance_threshold)
        numeric_features = selector.fit_transform(numeric_features)
        # Update correlation series to match the selected features
        correlation_series = pd.Series(selector.inverse_transform(correlation_series.values.reshape(1, -1))[0], index=numeric_features.columns)
    
    # Sort the DataFrame by absolute values
    correlation_series = correlation_series.sort_values(ascending=False)
    
    return correlation_series

In [5]:
# Define the directory path
data_dir = os.path.join('C:\\', 'Users', 'KonuTech', 'zoomcamp-capstone-01', 'data')
data_dir

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\data'

In [6]:
train_data_parquet_file = 'train_data.parquet'

In [7]:
# Load the training data
train_data = pd.read_parquet(os.path.join(data_dir, train_data_parquet_file))

In [8]:
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Data columns (total 191 columns):
 #    Column       Dtype  
---   ------       -----  
 0    customer_ID  object 
 1    S_2          object 
 2    P_2          float64
 3    D_39         float64
 4    B_1          float64
 5    B_2          float64
 6    R_1          float64
 7    S_3          float64
 8    D_41         float64
 9    B_3          float64
 10   D_42         float64
 11   D_43         float64
 12   D_44         float64
 13   B_4          float64
 14   D_45         float64
 15   B_5          float64
 16   R_2          float64
 17   D_46         float64
 18   D_47         float64
 19   D_48         float64
 20   D_49         float64
 21   B_6          float64
 22   B_7          float64
 23   B_8          float64
 24   D_50         float64
 25   D_51         float64
 26   B_9          float64
 27   R_3          float64
 28   D_52         float64
 29   P_3          float64
 30   B_10         flo

In [9]:
# Split the dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_data.iloc[:, :-1], train_data["target"], test_size=0.2, random_state=42)

In [10]:
# Delete the DataFrame variable to free up memory
del train_data

In [11]:
# Explicitly call the garbage collector to free up memory
gc.collect()

18

In [12]:
X_train.shape

(4425160, 190)

In [13]:
X_val.shape

(1106291, 190)

In [14]:
y_train.shape

(4425160,)

In [15]:
y_val.shape

(1106291,)

In [16]:
correlation_result = calculate_correlation_with_target(X_train, y_train)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [17]:
correlation_result

D_48    0.549648
B_9     0.475682
D_44    0.470569
D_75    0.457650
D_55    0.457010
          ...   
B_33   -0.453378
B_2    -0.483353
B_18   -0.487650
P_2    -0.610930
D_87         NaN
Length: 186, dtype: float64

In [29]:
top_ten_correlations = list(correlation_result[:10].index)
top_ten_correlations

['D_48', 'B_9', 'D_44', 'D_75', 'D_55', 'D_58', 'B_7', 'B_3', 'B_23', 'D_74']

In [30]:
print(f"Step 1: Excluding 'customer_ID', 'S_2' and creating a DictVectorizer")

# Exclude 'customer_ID' column and create a DictVectorizer
dict_vectorizer = DictVectorizer(sparse=False)

# X_train_dict = X_train.drop(columns=['customer_ID', "S_2"]).to_dict(orient='records')
# X_val_dict = X_val.drop(columns=['customer_ID', "S_2"]).to_dict(orient='records')

X_train_dict = X_train[top_ten_correlations].to_dict(orient='records')
X_val_dict = X_train[top_ten_correlations].to_dict(orient='records')

# X_train_dict = X_train.drop(columns=['customer_ID', "S_2"]).fillna(0).to_dict(orient='records')
# X_val_dict = X_train.drop(columns=['customer_ID', "S_2"]).fillna(0).to_dict(orient='records')

X_train_encoded = dict_vectorizer.fit_transform(X_train_dict)
X_val_encoded = dict_vectorizer.transform(X_val_dict)

Step 1: Excluding 'customer_ID', 'S_2' and creating a DictVectorizer


In [31]:
len(dict_vectorizer.get_feature_names_out())

10

In [32]:
dict_vectorizer.get_feature_names_out()

array(['B_23', 'B_3', 'B_7', 'B_9', 'D_44', 'D_48', 'D_55', 'D_58',
       'D_74', 'D_75'], dtype=object)

In [33]:
X_train_encoded[0]

array([0.05327476, 0.01466625, 0.07976112, 0.00281534, 0.00244407,
       0.07434476, 0.0707013 , 0.25564853, 0.15175034, 0.13384999])

In [34]:
gc.collect()

297

In [35]:
# Define a list of classifiers to try
classifiers = [
    ('DecisionTree', DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 10]
    }),
    ('RandomForest', RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10]
    }),
    ('SVM', SVC(), {
        'classifier__C': [0.1, 1.0],
        'classifier__kernel': ['linear', 'rbf']
    }),
    ('LogisticRegression', LogisticRegression(), {
        'classifier__C': [0.1, 1.0],
        'classifier__penalty': ['l1', 'l2']
    }),
    ('GradientBoosting', GradientBoostingClassifier(), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5]
    }),
    ('NaiveBayes', GaussianNB(), {}),
    ('XGBoost', xgb.XGBClassifier(), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5]
    }),
]

In [None]:
results = []

for i, (name, classifier, params) in enumerate(classifiers, start=1):
    print(f"Step {i + 1}: Training {name} classifier")

    # Create an RFE model and a pipeline
    rfe = RFE(estimator=classifier, n_features_to_select=5)

    # Add a step to impute missing values with the median
    imputer = SimpleImputer(strategy='median')  # You can choose a different strategy

    pipeline = Pipeline([
        ('imputer', imputer),  # Add the imputation step
        ('feature_selection', rfe),
        ('classifier', classifier)
    ])

    print(f"Step {i + 2}: Performing hyperparameter tuning")

    # Perform hyperparameter tuning
    grid = GridSearchCV(pipeline, param_grid=params, cv=5, n_jobs=-1)
    grid.fit(X_train_encoded, y_train)

    # Explicit garbage collection
    gc.collect()

    print(f"Step {i + 3}: Evaluating the best model on the validation set using F1 score")

    # Evaluate the best model on the validation set using F1 score
    y_pred = grid.predict(X_val_encoded)
    f1 = f1_score(y_val, y_pred)

    results.append((name, f1, grid.best_params_))

    # Explicit garbage collection
    gc.collect()

# Print the results
for name, f1, best_params in results:
    print(f'{name}: F1 Score={f1:.2f}, Best Params={best_params}')

Step 2: Training DecisionTree classifier
Step 3: Performing hyperparameter tuning


In [None]:
# results = []

# for i, (name, classifier, params) in enumerate(classifiers, start=1):
#     print(f"Step {i + 1}: Training {name} classifier")

#     # Create an RFE model and a pipeline
#     rfe = RFE(estimator=classifier, n_features_to_select=5)

#     # Add a step to impute missing values with the median
#     imputer = SimpleImputer(strategy='median')  # You can choose a different strategy

#     pipeline = Pipeline([
#         ('imputer', imputer),  # Add the imputation step
#         ('feature_selection', rfe),
#         ('classifier', classifier)
#     ])

#     print(f"Step {i + 2}: Performing hyperparameter tuning")

#     # Perform hyperparameter tuning
#     grid = GridSearchCV(pipeline, param_grid=params, cv=5, n_jobs=-1)
#     grid.fit(X_train_encoded, y_train)

#     print(f"Step {i + 3}: Evaluating the best model on the validation set using F1 score")

#     # Evaluate the best model on the validation set using F1 score
#     y_pred = grid.predict(X_val_encoded)
#     f1 = f1_score(y_val, y_pred)

#     results.append((name, f1, grid.best_params_))

# # Print the results
# for name, f1, best_params in results:
#     print(f'{name}: F1 Score={f1:.2f}, Best Params={best_params}')
