In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from  sklearn.ensemble import IsolationForest
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing  import LabelEncoder, OrdinalEncoder
from sklearn import linear_model 
from sklearn import tree 
from sklearn import ensemble 
from sklearn import metrics 
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split 
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [27]:
df = pd.read_csv('bank_fin\\bank_fin.csv', sep = ';')

# Подготавка столбца **balance**

In [28]:
df['balance'] = df['balance'].astype(str) \
    .str.replace(' ', '') \
    .str.replace(',', '.') \
    .str.replace('$', '') \
    .astype(float)

df['balance'] = pd.to_numeric(df['balance'], errors='coerce')
median_balance = df['balance'].median()
df['balance'] = df['balance'].fillna(median_balance)

# Заменаем пропуски в **job** и **education** на моду

In [29]:
job_mode = df.loc[df['job'] != 'unknown', 'job'].mode()[0]
df['job'] = df['job'].replace('unknown', job_mode)
education_mode = df.loc[df['education'] != 'unknown', 'education'].mode()[0]
df['education'] = df['education'].replace('unknown', education_mode)

# Удаление выбросов

In [30]:
Q1 = df['balance'].quantile(0.25)
Q3 = df['balance'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df = df[(df['balance'] >= lower_bound) & (df['balance'] <= upper_bound)]

# Создание новой переменной **age_group**

In [31]:
def age_group(age):
    if age < 30:
        return '<30'
    elif 30 <= age < 40:
        return '30-40'
    elif 40 <= age < 50:
        return '40-50'
    elif 50 <= age < 60:
        return '50-60'
    else:
        return '60+'
df['age_group'] = df['age'].apply(age_group)

# Кодирование категорий

In [32]:
le_edu = LabelEncoder()
df['education'] = le_edu.fit_transform(df['education'])
le_age = LabelEncoder()
df['age_group'] = le_age.fit_transform(df['age_group'])

# Кодирование категорий

In [33]:
df['deposit'] = df['deposit'].map({'yes': 1, 'no': 0})
df['default'] = df['default'].map({'yes': 1, 'no': 0})
df['housing'] = df['housing'].map({'yes': 1, 'no': 0})
df['loan'] = df['loan'].map({'yes': 1, 'no': 0})

# Создание **dummy** переменных

In [34]:
nominal_cols = ['job', 'marital', 'contact', 'month', 'poutcome']
df = pd.concat([df, pd.get_dummies(df[nominal_cols], prefix=nominal_cols, drop_first=False)], axis=1)

# Удаление номинальных переменных

In [35]:
df.drop(columns=nominal_cols, inplace=True)
df = df.drop('age', axis=1)

# Создаем **test** и **train**

In [36]:
X = df.drop(['deposit'], axis=1)
y = df['deposit']
cols = ['balance'] + [col for col in X.columns if col != 'balance']
X = X[cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state = 42, test_size = 0.33)

In [37]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, f_classif

# === 1. Модель на всех числовых признаках ===
X_train_all = X_train.select_dtypes(include='number')
X_test_all = X_test.select_dtypes(include='number')

scaler_all = MinMaxScaler()
X_train_all_scaled = scaler_all.fit_transform(X_train_all)
X_test_all_scaled = scaler_all.transform(X_test_all)

logreg_all = LogisticRegression(solver='sag', random_state=42, max_iter=1000)
logreg_all.fit(X_train_all_scaled, y_train)
y_pred_all = logreg_all.predict(X_test_all_scaled)
acc_all = accuracy_score(y_test, y_pred_all)

print("🔹 Accuracy (все признаки):", round(acc_all, 2))


# === 2. Модель только на 15 лучших признаках ===
# Отбор признаков
selector = SelectKBest(score_func=f_classif, k=15)
selector.fit(X_train, y_train)

best_features = X_train.columns[selector.get_support(indices=True)]
X_train_best = X_train[best_features]
X_test_best = X_test[best_features]

# Масштабируем
scaler_best = MinMaxScaler()
X_train_best_scaled = scaler_best.fit_transform(X_train_best)
X_test_best_scaled = scaler_best.transform(X_test_best)

logreg_best = LogisticRegression(solver='sag', random_state=42, max_iter=1000)
logreg_best.fit(X_train_best_scaled, y_train)
y_pred_best = logreg_best.predict(X_test_best_scaled)
acc_best = accuracy_score(y_test, y_pred_best)

print("⭐ Accuracy (топ-15 признаков):", round(acc_best, 2))


🔹 Accuracy (все признаки): 0.77
⭐ Accuracy (топ-15 признаков): 0.8


In [None]:
# selector = SelectKBest(score_func=f_classif, k=15)
# selector.fit_transform(X_train, y_train)

# best_indices = selector.get_support(indices=True)
# best_features = X_train.columns[best_indices]

# print('Топ-15 лучших признаков:')
# for f in best_features:
#     print(f)

Топ-15 лучших признаков:
balance
housing
duration
campaign
pdays
previous
age_group
contact_cellular
contact_unknown
month_mar
month_may
month_oct
month_sep
poutcome_success
poutcome_unknown


In [None]:
# from sklearn.preprocessing import MinMaxScaler
# X_train_selected = X_train[best_features]
# X_test_selected = X_test[best_features]

# # Создаём scaler и обучаем только на обучающих данных!
# scaler = MinMaxScaler()
# X_train_scaled = scaler.fit_transform(X_train_selected)
# X_test_scaled = scaler.transform(X_test_selected)

# # Если нужен DataFrame c теми же именами столбцов:
# import pandas as pd
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=best_features, index=X_train.index)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=best_features, index=X_test.index)

# print(X_train_scaled.head())
# print(X_test_scaled.head())

# first_pred_mean = round(X_test_scaled.iloc[:, 0].mean(), 2)
# print(f'Среднее значение первого предиктора в тестовой выборке: {first_pred_mean}')

       balance  housing  duration  campaign     pdays  previous  age_group  \
7287  0.426374      1.0  0.131735   0.00000  0.000000  0.000000        0.0   
4766  0.346531      0.0  0.075793   0.00000  0.240936  0.034483        0.0   
5712  0.335242      1.0  0.102346   0.02381  0.419883  0.086207        0.0   
2530  0.407886      0.0  0.034803   0.00000  0.120468  0.086207        0.0   
9294  0.571499      0.0  0.025264   0.00000  0.138012  0.137931        0.0   

      contact_cellular  contact_unknown  month_mar  month_may  month_oct  \
7287               1.0              0.0        0.0        0.0        0.0   
4766               1.0              0.0        0.0        0.0        0.0   
5712               0.0              0.0        0.0        1.0        0.0   
2530               1.0              0.0        0.0        0.0        0.0   
9294               1.0              0.0        0.0        0.0        0.0   

      month_sep  poutcome_success  poutcome_unknown  
7287        0.0     

In [None]:
# # Логистическая регрессия с указанными параметрами
# logreg = LogisticRegression(solver='sag', random_state=42, max_iter=1000)
# logreg.fit(X_train_scaled, y_train)

# # Предсказания
# y_pred = logreg.predict(X_test_scaled)

# # Оценка качества — accuracy
# accuracy = accuracy_score(y_test, y_pred)
# #print(classification_report(y_test, y_pred))

# accuracy = accuracy_score(y_test, y_pred)
# print(f'{accuracy:.2f}')

0.80
