In [1]:
!pip install gdown



In [2]:
import gdown

# Google Drive file ID
file_id = "1_RGUQe397BeGEmCY9BEHX081lAleMnmw"
# Construct the download URL
url = f"https://drive.google.com/uc?id={file_id}"
# Output file path
output = "bank-full.csv"

# Download the file
gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1_RGUQe397BeGEmCY9BEHX081lAleMnmw
To: C:\Users\T-bao\bank-full.csv
100%|█████████████████████████████████████████████████████████████████████████████| 4.61M/4.61M [00:00<00:00, 5.36MB/s]


'bank-full.csv'

In [3]:
import pandas as pd

# Load the CSV file with semicolon as the delimiter
df = pd.read_csv("bank-full.csv", delimiter=';')

# Display the first few rows
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
columns = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y'
]

df = df[columns]

In [5]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)  # 0.25 * 0.8 = 0.2


In [6]:
# Check unique values in 'y' before mapping
print(df_train['y'].unique())
unique_values = df_train['y'].unique()
if set(unique_values) == {'yes', 'no'}:
    # Perform mapping
    df_train['y'] = df_train['y'].map({'yes': 1, 'no': 0})
else:
    print('y is already mapped to numeric values.')


['no' 'yes']


In [7]:
from sklearn.metrics import roc_auc_score

In [8]:
numerical_vars = ['balance', 'day', 'duration', 'previous']
for var in numerical_vars:
    auc = roc_auc_score(df_train['y'], df_train[var])
    # If AUC < 0.5, invert the variable
    if auc < 0.5:
        auc = roc_auc_score(df_train['y'], -df_train[var])
    print(f'AUC for {var}: {auc:.3f}')

AUC for balance: 0.589
AUC for day: 0.526
AUC for duration: 0.815
AUC for previous: 0.599


In [9]:
from sklearn.feature_extraction import DictVectorizer

# Prepare the data
df_train['y'] = df_train['y'].map({'yes': 1, 'no': 0})
df_val['y'] = df_val['y'].map({'yes': 1, 'no': 0})

y_train = df_train['y'].values
y_val = df_val['y'].values

# Remove the target variable
del df_train['y']
del df_val['y']

# Convert to dictionaries
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

# One-hot encoding
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [10]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000)
model.fit(X_train, y_train)

ValueError: Input y contains NaN.

In [11]:
# Check unique values in 'y' column
print(df_train['y'].unique())

KeyError: 'y'

In [13]:
# Check columns in the original DataFrame
print('Columns in df:', df.columns.tolist())
print('Columns after stripping whitespace:', df_train.columns.tolist())


Columns in df: ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
Columns after stripping whitespace: ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome']


In [14]:
# Print column names with their representation to see hidden characters
for idx, col in enumerate(df.columns):
    print(f"Column {idx}: '{repr(col)}'")


Column 0: ''age''
Column 1: ''job''
Column 2: ''marital''
Column 3: ''education''
Column 4: ''balance''
Column 5: ''housing''
Column 6: ''contact''
Column 7: ''day''
Column 8: ''month''
Column 9: ''duration''
Column 10: ''campaign''
Column 11: ''pdays''
Column 12: ''previous''
Column 13: ''poutcome''
Column 14: ''y''


In [15]:
# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Identify any columns with empty names
empty_columns = [col for col in df.columns if col == '']

print(f"Columns with empty names: {empty_columns}")


Columns with empty names: []
