In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

#!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip -O bank-marketing.zip
#!unzip bank-marketing.zip
#!unzip bank.zip

Archive:  bank.zip
  inflating: bank-full.csv           
  inflating: bank-names.txt          
  inflating: bank.csv                


In [None]:
# Load the full bank-marketing dataset; the CSV uses ';' as its field separator.
df = pd.read_csv('bank-full.csv', delimiter=';')

In [None]:
# Preview the first five rows, transposed so every column is shown as a row.
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
default,no,no,no,no,no
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
loan,no,no,yes,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


In [None]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
df.dtypes

Unnamed: 0,0
age,int64
job,object
marital,object
education,object
default,object
balance,int64
housing,object
loan,object
contact,object
day,int64


In [None]:
# Columns kept for the analysis: the model features plus the target `y`.
selected_columns = [
    'age',
    'job',
    'marital',
    'education',
    'balance',
    'housing',
    'contact',
    'day',
    'month',
    'duration',
    'campaign',
    'pdays',
    'previous',
    'poutcome',
    'y',
]

# Narrow the frame to those columns and preview the result (transposed).
df = df[selected_columns]
df.head(5).transpose()

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5
month,may,may,may,may,may
duration,261,151,76,92,198


In [None]:
# Re-check the column dtypes after the column selection above.
df.dtypes

Unnamed: 0,0
age,int64
job,object
marital,object
education,object
balance,int64
housing,object
contact,object
day,int64
month,object
duration,int64


In [None]:
# Question 1: most frequent value of the `education` column.

# mode() returns every most-frequent value in sorted order; take the first one.
education_mode = df['education'].mode().iloc[0]
print(f"Самое частое значение в столбце education: {education_mode}")

Самое частое значение в столбце education: secondary


In [None]:
# Question 2: correlation matrix of the numerical features.

numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
correlation_matrix = df[numerical_columns].corr()

print("Корреляционная матрица:")
print(correlation_matrix)

# Find the most strongly correlated pair of *distinct* features.
# The diagonal is excluded by position (strict lower triangle) instead of by
# comparing floats with 1.0: an equality test would also discard a genuinely
# perfectly correlated pair and could keep a diagonal entry that is not
# exactly 1.0. Using one triangle also lists every unordered pair only once.
off_diagonal = np.tril(np.ones(correlation_matrix.shape, dtype=bool), k=-1)
corr_values = correlation_matrix.where(off_diagonal).unstack().dropna()

# Rank by absolute value so strong negative correlations are considered too,
# but report the signed coefficient.
max_corr_pair = corr_values.abs().idxmax()
max_corr_value = corr_values.loc[max_corr_pair]

print(f"\nНаибольшая корреляция: {max_corr_pair} = {max_corr_value:.3f}")

Корреляционная матрица:
               age   balance       day  duration  campaign     pdays  previous
age       1.000000  0.097783 -0.009120 -0.004648  0.004760 -0.023758  0.001288
balance   0.097783  1.000000  0.004503  0.021560 -0.014578  0.003435  0.016674
day      -0.009120  0.004503  1.000000 -0.030206  0.162490 -0.093044 -0.051710
duration -0.004648  0.021560 -0.030206  1.000000 -0.084570 -0.001565  0.001203
campaign  0.004760 -0.014578  0.162490 -0.084570  1.000000 -0.088628 -0.032855
pdays    -0.023758  0.003435 -0.093044 -0.001565 -0.088628  1.000000  0.454820
previous  0.001288  0.016674 -0.051710  0.001203 -0.032855  0.454820  1.000000

Наибольшая корреляция: ('pdays', 'previous') = 0.455


In [None]:
# Encode the target as 0/1 ('yes' -> 1, anything else -> 0).
# The dtype guard makes the cell safe to re-run: without it, a second run would
# compare the already-encoded integers with 'yes' and turn the whole column
# into zeros. `assign` builds a new frame instead of writing into the frame
# produced by the earlier column selection.
if not pd.api.types.is_integer_dtype(df['y']):
    df = df.assign(y=(df['y'] == 'yes').astype(int))

In [None]:
# Train / validation / test split.

from sklearn.model_selection import train_test_split

# Target vector and feature matrix (every column except the target).
y = df['y']
X = df.drop(columns='y')

# Step 1: hold out 20% of the rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: take 25% of the remaining 80% (i.e. 20% of all rows) for validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=42,
)

print(f"Размер тренировочного набора: {X_train.shape}")
print(f"Размер валидационного набора: {X_val.shape}")
print(f"Размер тестового набора: {X_test.shape}")

Размер тренировочного набора: (27126, 14)
Размер валидационного набора: (9042, 14)
Размер тестового набора: (9043, 14)


In [None]:
# Question 3: mutual information between categorical features and the target.

from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mutual_info_score

# Only the features named in question 3.
categorical_for_mi = ['contact', 'education', 'housing', 'poutcome']

# Score every categorical column directly against the target on the training
# split. Both variables are discrete, so mutual_info_score (contingency-table
# based) is the appropriate estimator. One-hot encoding the columns and summing
# per-dummy scores from mutual_info_classif does not give the mutual
# information of the original variable, and with a dense matrix and default
# settings that function treats the dummies as continuous features.
mi_by_original_feature = {
    feature: mutual_info_score(X_train[feature], y_train)
    for feature in categorical_for_mi
}

# Round to 2 decimals for display only.
mi_by_original_rounded = {k: round(v, 2) for k, v in mi_by_original_feature.items()}
print("Взаимная информация (mutual_info_score):")
for feature, score in sorted(mi_by_original_rounded.items(), key=lambda x: x[1], reverse=True):
    print(f"{feature}: {score}")

# Pick the winner from the unrounded scores so that features which tie after
# rounding are still ordered correctly.
max_mi_feature = max(mi_by_original_feature, key=mi_by_original_feature.get)
print(f"\nПризнак с наибольшей взаимной информацией: {max_mi_feature}")

Взаимная информация (mutual_info_classif):
poutcome: 0.05
contact: 0.03
housing: 0.03
education: 0.01

Признак с наибольшей взаимной информацией: poutcome


In [None]:
from sklearn.feature_extraction import DictVectorizer

# Convert each split into a list of {column: value} records for DictVectorizer.
train_dict, val_dict, test_dict = (
    split.to_dict(orient='records') for split in (X_train, X_val, X_test)
)

# Fit the vectorizer on the training records only, then apply the same fitted
# mapping to the validation and test records.
dv = DictVectorizer(sparse=False)
X_train_encoded = dv.fit_transform(train_dict)
X_val_encoded, X_test_encoded = (
    dv.transform(records) for records in (val_dict, test_dict)
)

print(f"Размерность после one-hot encoding: {X_train_encoded.shape}")

Размерность после one-hot encoding: (27126, 47)


In [None]:
# Question 4: train a logistic regression and score it on the validation set.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train_encoded, y_train)

# Hard class predictions for the validation rows and their accuracy.
y_val_pred = model.predict(X_val_encoded)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Точность на валидационном наборе: {val_accuracy:.1f}")

Точность на валидационном наборе: 0.9


In [None]:
# Question 5: leave-one-feature-out comparison against the full model.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Reference model trained on every encoded column.
model_full = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model_full.fit(X_train_encoded, y_train)
y_val_pred_full = model_full.predict(X_val_encoded)
base_accuracy = accuracy_score(y_val, y_val_pred_full)

features_to_eliminate = ['age', 'balance', 'marital', 'previous']
accuracy_differences = {}

feature_names = dv.get_feature_names_out()

for feature in features_to_eliminate:
    # A numeric feature is encoded as a single column named "<feature>", a
    # categorical one as several columns named "<feature>=<value>". Matching
    # both forms with one rule avoids a hard-coded list of numeric features,
    # which would silently match nothing for any numeric feature not listed.
    one_hot_prefix = feature + '='
    drop_mask = np.array(
        [name == feature or name.startswith(one_hot_prefix) for name in feature_names],
        dtype=bool,
    )
    if not drop_mask.any():
        # Nothing would be removed, so the "reduced" model would equal the full
        # one and report a misleading difference of 0.0.
        raise ValueError(f"No encoded columns found for feature {feature!r}")

    # Boolean column selector: True for the columns that stay in the model.
    mask = ~drop_mask

    X_train_reduced = X_train_encoded[:, mask]
    X_val_reduced = X_val_encoded[:, mask]

    model_reduced = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_reduced.fit(X_train_reduced, y_train)

    y_val_pred_reduced = model_reduced.predict(X_val_reduced)
    reduced_accuracy = accuracy_score(y_val, y_val_pred_reduced)

    # Positive value: accuracy dropped when the feature was removed.
    difference = base_accuracy - reduced_accuracy
    accuracy_differences[feature] = difference

for feature in features_to_eliminate:
    print(f"{feature}: {accuracy_differences[feature]}")

# Feature whose removal changes the accuracy the least (by absolute value);
# on ties the feature listed first in features_to_eliminate wins.
min_diff_feature = min(accuracy_differences, key=lambda x: abs(accuracy_differences[x]))
print(f"min: {min_diff_feature}")

age: -0.00044238000442387015
balance: -0.0001105950011059953
marital: 0.0
previous: 0.0
min: marital


In [None]:
# Question 6: choose the regularization strength C on the validation set.

C_values = [0.01, 0.1, 1, 10]
# None (instead of 0) guarantees the first candidate is always accepted, so
# best_C can never be left unset.
best_accuracy = None
best_C = None

# Candidates are visited in ascending order and replaced only on a strict
# improvement, so among equally accurate models the smallest C (strongest
# regularization) is kept.
for C in sorted(C_values):
    model_reg = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_reg.fit(X_train_encoded, y_train)

    y_val_pred_reg = model_reg.predict(X_val_encoded)

    # Compare at the same precision that is reported (3 decimals); otherwise a
    # difference of a single validation row decides the winner while the
    # printed table shows a tie.
    val_accuracy_reg = round(accuracy_score(y_val, y_val_pred_reg), 3)

    print(f"C = {C}: точность = {val_accuracy_reg:.3f}")

    if best_accuracy is None or val_accuracy_reg > best_accuracy:
        best_accuracy = val_accuracy_reg
        best_C = C

print(f"\nЛучшее значение C: {best_C} с точностью {best_accuracy:.3f}")

C = 0.01: точность = 0.898
C = 0.1: точность = 0.901
C = 1: точность = 0.901
C = 10: точность = 0.901

Лучшее значение C: 1 с точностью 0.901


In [None]:
# Final model: refit on train + validation with the selected C, then evaluate
# on the held-out test split.

# Stack the encoded training and validation rows (and their targets) together.
X_final_train = np.concatenate([X_train_encoded, X_val_encoded], axis=0)
y_final_train = np.concatenate([y_train, y_val])

final_model = LogisticRegression(
    solver='liblinear',
    C=best_C,
    max_iter=1000,
    random_state=42,
).fit(X_final_train, y_final_train)

# Accuracy of the final model's predictions on the test rows.
y_test_pred = final_model.predict(X_test_encoded)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Финальная точность на тестовом наборе: {test_accuracy:.3f}")

Финальная точность на тестовом наборе: 0.899
