In [1]:
!pip install gdown



In [2]:
import gdown

# Google Drive file ID
file_id = "1_RGUQe397BeGEmCY9BEHX081lAleMnmw"
# Construct the download URL
url = f"https://drive.google.com/uc?id={file_id}"
# Output file path
output = "bank-full.csv"

# Download the file
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1_RGUQe397BeGEmCY9BEHX081lAleMnmw
To: C:\Users\T-bao\bank-full.csv
100%|█████████████████████████████████████████████████████████████████████████████| 4.61M/4.61M [00:00<00:00, 5.37MB/s]


'bank-full.csv'

In [3]:
import pandas as pd

# Load the CSV file with semicolon as the delimiter
df = pd.read_csv("bank-full.csv", delimiter=';')

# Display the first few rows
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
columns = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'
]

df = df[columns]

In [5]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)  # 0.25 * 0.8 = 0.2


In [6]:
# Check unique values in 'y' before mapping
print(df_train['y'].unique())
unique_values = df_train['y'].unique()
if set(unique_values) == {'yes', 'no'}:
    # Perform mapping
    df_train['y'] = df_train['y'].map({'yes': 1, 'no': 0})
else:
    print('y is already mapped to numeric values.')


['no' 'yes']


In [7]:
from sklearn.metrics import roc_auc_score

In [8]:
numerical_vars = ['balance', 'day', 'duration', 'previous']
for var in numerical_vars:
    auc = roc_auc_score(df_train['y'], df_train[var])
    # If AUC < 0.5, invert the variable
    if auc < 0.5:
        auc = roc_auc_score(df_train['y'], -df_train[var])
    print(f'AUC for {var}: {auc:.3f}')

AUC for balance: 0.589
AUC for day: 0.526
AUC for duration: 0.815
AUC for previous: 0.599


In [10]:
# Check if the original 'y' column has any NaN values
print(f"Total rows in the original dataset: {len(df)}")
print(f"Total NaN values in 'y': {df['y'].isna().sum()}")


Total rows in the original dataset: 45211
Total NaN values in 'y': 0


In [9]:
from sklearn.feature_extraction import DictVectorizer
import numpy as np

# Step 1: Ensure the 'y' column is correctly mapped to 1/0 and check for missing values
df_train['y'] = df_train['y'].map({'yes': 1, 'no': 0})
df_val['y'] = df_val['y'].map({'yes': 1, 'no': 0})

# Step 2: Check for any NaN values in 'y' and drop rows with missing target values
if df_train['y'].isna().sum() > 0:
    print(f"Dropping {df_train['y'].isna().sum()} rows where 'y' is NaN in the training set")
    df_train = df_train.dropna(subset=['y'])

if df_val['y'].isna().sum() > 0:
    print(f"Dropping {df_val['y'].isna().sum()} rows where 'y' is NaN in the validation set")
    df_val = df_val.dropna(subset=['y'])

# Step 3: Extract the target variable 'y' after cleaning
y_train = df_train['y'].values
y_val = df_val['y'].values

# Step 4: Remove the target variable from the feature set
df_train = df_train.drop(columns=['y'])
df_val = df_val.drop(columns=['y'])

# Step 5: Convert to dictionaries for DictVectorizer
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

# Step 6: One-hot encoding with DictVectorizer
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)


Dropping 27126 rows where 'y' is NaN in the training set


ValueError: Sample sequence X is empty.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import numpy as np

# Step 1: Check for NaN values in y and handle missing values in X
# Ensure 'y_train' does not contain NaN values by dropping rows with missing targets
nan_count = np.isnan(y_train).sum()

if nan_count > 0:
    print(f"Dropping {nan_count} rows where the target 'y' is NaN")
    # Drop rows where 'y' is NaN
    X_train_cleaned = X_train[~np.isnan(y_train)]
    y_train_cleaned = y_train[~np.isnan(y_train)]
else:
    X_train_cleaned = X_train
    y_train_cleaned = y_train

# Step 2: If there are missing values in X_train, I might use imputation
imputer = SimpleImputer(strategy='mean')  # or strategy='median',

In [None]:
# Check the total number of rows in the training set before dropping NaN values
total_rows = len(df_train)
print(f"Total rows in the training dataset: {total_rows}")
print(f"Rows with NaN values in 'y': {nan_count}")
