In [232]:
import pandas as pd
import numpy as np
import seaborn as sns

In [233]:
df = pd.read_csv('bank-full.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [234]:
keep = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign',
        'pdays', 'previous', 'poutcome', 'y']
df = df[keep]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [235]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [236]:
# Question 1. What is the most frequent observation (mode) for the column 'education'?
df['education'].mode()

0    secondary
Name: education, dtype: object

In [237]:
# Question 2. Create the correlation matrix for numerical features.
numerical = df.select_dtypes(include=[np.number])
correlation = numerical.corr()
correlation

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [238]:
# Target encoding
df['y'] = df['y'].apply(lambda x: 1 if x == 'yes' else 0)
df['y'].value_counts()

y
0    39922
1     5289
Name: count, dtype: int64

In [239]:
# Split the data

from sklearn.model_selection import train_test_split

seed = 42

y = df['y']
X = df.drop('y', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

In [240]:
# Question 3. Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training dataset only. Round the result to 2 decimal places.
from sklearn.metrics import mutual_info_score

categorical = X_train.select_dtypes(include=[object]).columns

mi_score_dict = {}

for var in categorical:
    mi_score = mutual_info_score(X_train[var], y_train)
    mi_score_dict[var] = np.round(mi_score, 2)

max(mi_score_dict, key=mi_score_dict.get)

'month'

In [241]:
# Question 4. 
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [242]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
categorical = categorical.tolist()
numerical = numerical.columns.tolist()

X_train_dict = X_train[categorical + numerical].to_dict(orient='records')
X_train_transformed = dv.fit_transform(X_train_dict)

X_val_dict = X_val[categorical + numerical].to_dict(orient='records')
X_val_transformed = dv.transform(X_val_dict)

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_transformed, y_train)

y_pred = model.predict_proba(X_val_transformed)[:, 1]
avg_decision = (y_pred >= 1)

accuracy = accuracy_score(y_val, avg_decision)

rounded_accuracy = round(accuracy, 1)
rounded_accuracy

0.9

In [243]:
# Question 5. Find least useful feature using the feature elimination technique.

features = ['age', 'balance', 'marital', 'previous']
scores = []

total_model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

total_model.fit(X_train[features], y_train)
y_pred = total_model.predict_proba(X_val_transformed[features])[:, 1]
total_score = accuracy_score(y_val, y_pred >= 0.5)
scores.append(total_score)

for feature in features:
    features.remove(feature)
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_transformed[features], y_train)
    y_pred = model.predict_proba(X_val_transformed[features])[:, 1]
    score = accuracy_score(y_val, y_pred >= 0.5)
    scores.append(score)
    features.append(feature)

min_score = min(scores)
print(f'Least useful feature: {features[scores.index(min_score)]}')

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices