https://www.kaggle.com/competitions/amex-default-prediction/data

In [1]:
import os
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.impute import SimpleImputer
# from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Define the directory path
data_dir = os.path.join('C:\\', 'Users', 'KonuTech', 'zoomcamp-capstone-01', 'data')
data_dir

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\data'

In [3]:
train_data_parquet_file = 'train_data.parquet'

In [4]:
train_labels_parquet_file = 'train_labels.parquet'

In [5]:
# Load the training labels
train_labels = pd.read_parquet(os.path.join(data_dir, train_labels_parquet_file))

In [6]:
train_labels.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB


In [7]:
# Load the training data
train_data = pd.read_parquet(os.path.join(data_dir, train_data_parquet_file))

In [8]:
train_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 458913 entries, 0 to 5531438
Data columns (total 190 columns):
 #    Column       Dtype  
---   ------       -----  
 0    customer_ID  object 
 1    S_2          object 
 2    P_2          float64
 3    D_39         float64
 4    B_1          float64
 5    B_2          float64
 6    R_1          float64
 7    S_3          float64
 8    D_41         float64
 9    B_3          float64
 10   D_42         float64
 11   D_43         float64
 12   D_44         float64
 13   B_4          float64
 14   D_45         float64
 15   B_5          float64
 16   R_2          float64
 17   D_46         float64
 18   D_47         float64
 19   D_48         float64
 20   D_49         float64
 21   B_6          float64
 22   B_7          float64
 23   B_8          float64
 24   D_50         float64
 25   D_51         float64
 26   B_9          float64
 27   R_3          float64
 28   D_52         float64
 29   P_3          float64
 30   B_10         float64
 

In [30]:
# Define the directory path
data_dir = os.path.join('C:\\', 'Users', 'KonuTech', 'zoomcamp-capstone-01', 'data')
data_dir

'C:\\Users\\KonuTech\\zoomcamp-capstone-01\\data'

In [33]:
# Step 1: Read the CSV file in chunks
csv_file = 'train_data.csv'

In [34]:
# Create a TextFileReader, which is iterable with chunks of 10,000 rows.
train_data = pd.read_csv(os.path.join(data_dir, csv_file))

In [36]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 7.8+ GB


In [9]:
selected_rows = [0, 2, 5, 7]

In [35]:
# Split the dataset into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)
# X_train, X_val, y_train, y_val = train_test_split(train_data.iloc[selected_rows], train_labels.iloc[selected_rows], test_size=0.2, random_state=42)

ValueError: Found input variables with inconsistent numbers of samples: [5531451, 458913]

In [22]:
X_train[["R_1"]]

Unnamed: 0,R_1
3397488,0.008139
4055870,0.505918
4004713,0.002230
973646,0.007517
4131026,0.008254
...,...
3124558,0.009464
4409503,0.008132
1591348,0.000575
1771384,0.007154


In [23]:
X_train.shape

(367130, 190)

In [24]:
print(f"Step 1: Excluding 'customer_ID', 'S_2' and creating a DictVectorizer")

# Exclude 'customer_ID' column and create a DictVectorizer
dict_vectorizer = DictVectorizer(sparse=False)

# X_train_dict = X_train.drop(columns=['customer_ID', "S_2"]).to_dict(orient='records')
# X_val_dict = X_val.drop(columns=['customer_ID', "S_2"]).to_dict(orient='records')

X_train_dict = X_train[["R_1"]].fillna(0).to_dict(orient='records')
X_val_dict = X_train[["R_1"]].fillna(0).to_dict(orient='records')

# X_train_dict = X_train.drop(columns=['customer_ID', "S_2"]).fillna(0).to_dict(orient='records')
# X_val_dict = X_train.drop(columns=['customer_ID', "S_2"]).fillna(0).to_dict(orient='records')

X_train_encoded = dict_vectorizer.fit_transform(X_train_dict)
X_val_encoded = dict_vectorizer.transform(X_val_dict)

Step 1: Excluding 'customer_ID', 'S_2' and creating a DictVectorizer


In [25]:
len(dict_vectorizer.get_feature_names_out())

1

In [26]:
dict_vectorizer.get_feature_names_out()

array(['R_1'], dtype=object)

In [27]:
X_train_encoded[0]

array([0.00813852])

In [28]:
# Define a list of classifiers to try
classifiers = [
    ('DecisionTree', DecisionTreeClassifier(), {
        'classifier__max_depth': [None, 10]
    }),
    ('RandomForest', RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10]
    }),
    ('SVM', SVC(), {
        'classifier__C': [0.1, 1.0],
        'classifier__kernel': ['linear', 'rbf']
    }),
    ('LogisticRegression', LogisticRegression(), {
        'classifier__C': [0.1, 1.0],
        'classifier__penalty': ['l1', 'l2']
    }),
    ('GradientBoosting', GradientBoostingClassifier(), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5]
    }),
    ('NaiveBayes', GaussianNB(), {}),
    ('XGBoost', xgb.XGBClassifier(), {
        'classifier__n_estimators': [50, 100],
        'classifier__max_depth': [3, 5]
    }),
]

In [29]:
results = []

for i, (name, classifier, params) in enumerate(classifiers, start=1):
    print(f"Step {i + 1}: Training {name} classifier")

    # Create an RFE model and a pipeline
    rfe = RFE(estimator=classifier, n_features_to_select=5)

    # Add a step to impute missing values with the median
    imputer = SimpleImputer(strategy='median')  # You can choose a different strategy

    pipeline = Pipeline([
        ('imputer', imputer),  # Add the imputation step
        ('feature_selection', rfe),
        ('classifier', classifier)
    ])

    print(f"Step {i + 2}: Performing hyperparameter tuning")

    # Perform hyperparameter tuning
    grid = GridSearchCV(pipeline, param_grid=params, cv=5, n_jobs=-1)
    grid.fit(X_train_encoded, y_train)

    print(f"Step {i + 3}: Evaluating the best model on the validation set using F1 score")

    # Evaluate the best model on the validation set using F1 score
    y_pred = grid.predict(X_val_encoded)
    f1 = f1_score(y_val, y_pred)

    results.append((name, f1, grid.best_params_))

# Print the results
for name, f1, best_params in results:
    print(f'{name}: F1 Score={f1:.2f}, Best Params={best_params}')


Step 2: Training DecisionTree classifier
Step 3: Performing hyperparameter tuning


TypeError: '<' not supported between instances of 'int' and 'str'