## Моя моделька

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_score

"""Для датафреймов"""
# Display every column when a DataFrame is rendered (no column truncation).
pd.set_option('display.max_columns', None)

In [3]:
# Load the semicolon-separated file 'bank-full.csv' into a DataFrame.
df = pd.read_csv('bank-full.csv', sep=';')

In [4]:
# Print column names, dtypes and non-null counts of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
# (rows, columns) of the loaded frame.
df.shape

(45211, 17)

In [6]:
# Binary target: 1 where `y` equals 'yes', 0 for every other value.
df['y_count'] = df['y'].eq('yes').astype('int64')

In [7]:
# Number of positive targets and their share among all rows.
a = df['y_count'].sum()
deposit_share = a / len(df)
print(f'Количество депозитов: {a}')
print(f'Процент людей с депозитом: {deposit_share:.1%}')

Количество депозитов: 5289
Процент людей с депозитом: 11.7%


In [8]:
# For each categorical column print its distinct values and the percentage of
# rows equal to 'unknown'. The labels are kept exactly as originally printed
# (only the first one carries a trailing colon).
for label, col in (
    ('poutcome:', 'poutcome'),
    ('job', 'job'),
    ('marital', 'marital'),
    ('education', 'education'),
    ('contact', 'contact'),
):
    unknown_pct = round(len(df[df[col] == 'unknown']) / len(df) * 100, 2)
    print(label, df[col].unique(), 'Доля:', unknown_pct, '%')

poutcome: ['unknown' 'failure' 'other' 'success'] Доля: 81.75 %
job ['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student'] Доля: 0.64 %
marital ['married' 'single' 'divorced'] Доля: 0.0 %
education ['tertiary' 'secondary' 'unknown' 'primary'] Доля: 4.11 %
contact ['unknown' 'cellular' 'telephone'] Доля: 28.8 %


In [9]:
"""1-yes, 0-no"""
# Encode the yes/no flag columns as integers: 'yes' -> 1, anything else -> 0.
# The dtype guard makes the cell idempotent: a column that is already numeric
# is left untouched. Without it, re-running the cell would compare the integers
# with 'yes' and silently reset every value to 0.
for flag_col in ('default', 'housing', 'loan'):
    if not pd.api.types.is_numeric_dtype(df[flag_col]):
        df[flag_col] = (df[flag_col] == 'yes').astype('int64')

In [10]:
"""Проверка корреляции для признака 'poutcome' """
# Counts of each `poutcome` value split by the target `y`.
pd.crosstab(index=df['poutcome'], columns=df['y'])

y,no,yes
poutcome,Unnamed: 1_level_1,Unnamed: 2_level_1
failure,4283,618
other,1533,307
success,533,978
unknown,33573,3386


In [11]:
def choose(x):
    """Encode a `poutcome` value as a signed score.

    Returns 1 for 'success', 0 for 'unknown' and -1 for any other value.
    """
    if x == 'success':
        return 1
    return 0 if x == 'unknown' else -1

# Numeric encoding of `poutcome`, produced element-wise by `choose`.
df['poutcome_count'] = df['poutcome'].map(choose)

In [12]:
# Frequency of each encoded `poutcome_count` value.
print(df['poutcome_count'].value_counts())

poutcome_count
 0    36959
-1     6741
 1     1511
Name: count, dtype: int64


In [13]:
# Counts of each `education` value split by the target `y`.
pd.crosstab(index=df['education'], columns=df['y'])

y,no,yes
education,Unnamed: 1_level_1,Unnamed: 2_level_1
primary,6260,591
secondary,20752,2450
tertiary,11305,1996
unknown,1605,252


In [14]:
# Counts of each `contact` value split by the target `y`.
pd.crosstab(index=df['contact'], columns=df['y'])

y,no,yes
contact,Unnamed: 1_level_1,Unnamed: 2_level_1
cellular,24916,4369
telephone,2516,390
unknown,12490,530


In [15]:
"""Сделаем для education кодировку"""
def education_cod(x):
    """Encode an education level as an ordinal integer.

    'primary' -> 1, 'secondary' -> 2, 'tertiary' -> 3; any other value
    (including 'unknown') -> 0.
    """
    levels = {'primary': 1, 'secondary': 2, 'tertiary': 3}
    return levels.get(x, 0)

# Replace the education labels with their ordinal codes. The column is
# overwritten in place, so the dtype guard skips the conversion when it is
# already numeric; otherwise re-running the cell would map every code to 0.
if not pd.api.types.is_numeric_dtype(df['education']):
    df['education'] = df['education'].apply(education_cod)

In [16]:
"""Решил пока что исключить признак 'contact', из-за слабой дифференцированности переменной y """

"Решил пока что исключить признак 'contact', из-за слабой дифференцированности переменной y "

In [17]:
# Counts of each `default` flag value split by the target `y`.
pd.crosstab(index=df['default'], columns=df['y'])

y,no,yes
default,Unnamed: 1_level_1,Unnamed: 2_level_1
0,39159,5237
1,763,52


In [18]:
def balance_quantile(x, bounds=None):
    """Bucket a balance value by quartile cut points.

    Parameters
    ----------
    x : number
        Balance to classify.
    bounds : sequence of three numbers, optional
        Precomputed 25%, 50% and 75% cut points. When omitted they are
        computed from ``df['balance']`` on every call, which is slow when the
        function is applied row by row.

    Returns
    -------
    str
        'очень низкий' if x <= Q1, 'низкий' if x <= Q2, 'средний' if x <= Q3,
        otherwise 'высокий'.
    """
    if bounds is None:
        bounds = df['balance'].quantile([0.25, 0.5, 0.75])
    low, mid, upper = bounds
    if x <= low:
        return 'очень низкий'
    elif x <= mid:
        return 'низкий'
    elif x <= upper:
        return 'средний'
    return 'высокий'


# Compute the quartiles once and pass them in, instead of recomputing them
# for every row inside the function.
# NOTE(review): the cut points come from the full DataFrame, i.e. before the
# train/test split performed later in the notebook.
balance_bounds = tuple(df['balance'].quantile([0.25, 0.5, 0.75]))
df['balance_group'] = df['balance'].apply(balance_quantile, args=(balance_bounds,))

In [19]:
# Counts of each balance group split by the target `y`.
pd.crosstab(index=df['balance_group'], columns=df['y'])

y,no,yes
balance_group,Unnamed: 1_level_1,Unnamed: 2_level_1
высокий,9472,1825
низкий,10061,1230
очень низкий,10498,819
средний,9891,1415


In [20]:
def balance_int(x):
    """Map a balance group label to an ordinal code.

    'очень низкий' -> 1, 'низкий' -> 2, 'средний' -> 3, any other value -> 4.
    """
    codes = {'очень низкий': 1, 'низкий': 2, 'средний': 3}
    return codes.get(x, 4)
     

# Ordinal code of the balance group, produced element-wise by `balance_int`.
df['balance_int'] = df['balance_group'].map(balance_int)

In [21]:
def duration_quantile(x, bounds=None):
    """Bucket a duration value by quartile cut points.

    Parameters
    ----------
    x : number
        Duration to classify.
    bounds : sequence of three numbers, optional
        Precomputed 25%, 50% and 75% cut points. When omitted they are
        computed from ``df['duration']`` on every call, which is slow when the
        function is applied row by row.

    Returns
    -------
    str
        'очень низкий' if x <= Q1, 'низкий' if x <= Q2, 'средний' if x <= Q3,
        otherwise 'высокий'.
    """
    if bounds is None:
        bounds = df['duration'].quantile([0.25, 0.5, 0.75])
    low, mid, upper = bounds
    if x <= low:
        return 'очень низкий'
    elif x <= mid:
        return 'низкий'
    elif x <= upper:
        return 'средний'
    return 'высокий'


# Compute the quartiles once and pass them in, instead of recomputing them
# for every row inside the function.
# NOTE(review): the cut points come from the full DataFrame, i.e. before the
# train/test split performed later in the notebook.
duration_bounds = tuple(df['duration'].quantile([0.25, 0.5, 0.75]))
df['duration_group'] = df['duration'].apply(duration_quantile, args=(duration_bounds,))

In [22]:
# Counts of each duration group split by the target `y`.
pd.crosstab(index=df['duration_group'], columns=df['y'])

y,no,yes
duration_group,Unnamed: 1_level_1,Unnamed: 2_level_1
высокий,7946,3315
низкий,10703,582
очень низкий,11248,127
средний,10025,1265


In [23]:
def duration_int(x):
    """Map a duration group label to an ordinal code.

    'очень низкий' -> 1, 'низкий' -> 2, 'средний' -> 3, any other value -> 4.
    """
    ordered_labels = ('очень низкий', 'низкий', 'средний')
    if x in ordered_labels:
        return ordered_labels.index(x) + 1
    return 4
     

# Ordinal code of the duration group, produced element-wise by `duration_int`.
df['duration_int'] = df['duration_group'].map(duration_int)

In [24]:
def campaign_quantile(x, bounds=None):
    """Bucket a campaign value by quartile cut points.

    Parameters
    ----------
    x : number
        Campaign value to classify.
    bounds : sequence of three numbers, optional
        Precomputed 25%, 50% and 75% cut points. When omitted they are
        computed from ``df['campaign']`` on every call, which is slow when the
        function is applied row by row.

    Returns
    -------
    str
        'очень низкий' if x <= Q1, 'низкий' if x <= Q2, 'средний' if x <= Q3,
        otherwise 'высокий'.
    """
    if bounds is None:
        bounds = df['campaign'].quantile([0.25, 0.5, 0.75])
    low, mid, upper = bounds
    if x <= low:
        return 'очень низкий'
    elif x <= mid:
        return 'низкий'
    elif x <= upper:
        return 'средний'
    return 'высокий'


# Compute the quartiles once and pass them in, instead of recomputing them
# for every row inside the function.
# NOTE(review): the cut points come from the full DataFrame, i.e. before the
# train/test split performed later in the notebook.
campaign_bounds = tuple(df['campaign'].quantile([0.25, 0.5, 0.75]))
df['campaign_group'] = df['campaign'].apply(campaign_quantile, args=(campaign_bounds,))

In [25]:
def campaign_int(x):
    """Map a campaign group label to an ordinal code.

    'очень низкий' -> 1, 'низкий' -> 2, 'средний' -> 3, any other value -> 4.
    """
    for code, label in enumerate(('очень низкий', 'низкий', 'средний'), start=1):
        if x == label:
            return code
    return 4
     

# Ordinal code of the campaign group, produced element-wise by `campaign_int`.
df['campaign_int'] = df['campaign_group'].map(campaign_int)

In [26]:
# Counts of each campaign group split by the target `y`.
pd.crosstab(index=df['campaign_group'], columns=df['y'])

y,no,yes
campaign_group,Unnamed: 1_level_1,Unnamed: 2_level_1
высокий,8932,709
низкий,11104,1401
очень низкий,14983,2561
средний,4903,618


In [27]:
"""Пока в модель пойдут: age, job, education, default, balance_int, housing, loan, duration_int, campaign_int, poutcome_count"""

'Пока в модель пойдут: age, job, education, default, balance_int, housing, loan, duration_int, campaign_int, poutcome_count'

In [28]:
# Columns that go into the model, followed by the binary target `y_count`.
model_columns = [
    'age', 'job', 'education', 'default', 'balance_int', 'housing', 'loan',
    'duration_int', 'campaign_int', 'poutcome_count', 'y_count',
]
df_for_model = df[model_columns]

# One-hot encode `job` and swap the raw column for its indicator columns.
job_dummies = pd.get_dummies(df['job'], prefix='job', dtype=int)
df_for_model = pd.concat([df_for_model.drop(columns='job'), job_dummies], axis=1)
print(list(job_dummies.columns))

# Split into feature matrix and target vector.
y = df_for_model['y_count']
X = df_for_model.drop(columns='y_count')

# Stratified 80/20 split so both parts keep the same share of positives.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Features and target glued back together, with a fresh 0..n-1 index.
train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
test = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

['job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self-employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_unknown']


In [29]:
# Report the size of each split, then the share of positive targets in each.
splits = (("Train", train), ("Test", test))
for split_name, split_df in splits:
    print(f"{split_name} size: {len(split_df)}")
for split_name, split_df in splits:
    print(f"{split_name} target %: {split_df['y_count'].mean():.1%}")

Train size: 36168
Test size: 9043
Train target %: 11.7%
Test target %: 11.7%


In [30]:
class MyLogRegression():
    """Binary logistic regression trained by batch gradient descent with an L2 penalty.

    Parameters
    ----------
    learning_rate : float
        Step size of every gradient update.
    n_iterations : int
        Number of full-batch gradient steps performed by ``fit``.
    lambda_param : float, default 1
        L2 regularization strength; applied to the weights only, not the bias.
    threshold : float, default 0.45
        ``predict`` returns class 1 when the predicted probability is strictly
        greater than this value.
    random_state : int or None, default None
        Seed for the weight initialization. With ``None`` the weights are drawn
        from NumPy's global random state.

    Attributes
    ----------
    w : numpy.ndarray or None
        Learned weights, one per feature; ``None`` until ``fit`` is called.
    b : float
        Learned bias (intercept).
    """

    def __init__(self, learning_rate, n_iterations, lambda_param=1,
                 threshold=0.45, random_state=None):
        self.w = None
        self.b = 0  # bias (intercept)
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.lambda_param = lambda_param
        self.threshold = threshold
        self.random_state = random_state

    def fit(self, X, y):
        """Learn weights and bias from ``X`` (n_samples, n_features) and binary ``y``.

        Every call starts from scratch: the weights are re-initialized and the
        bias is reset to zero, so refitting does not continue from the result
        of a previous fit. Features are used as given; no scaling is applied.

        Raises
        ------
        ValueError
            If ``X`` is not a non-empty 2-D array or ``y`` has a different
            number of rows.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2 or X.shape[0] == 0:
            raise ValueError("X must be a non-empty 2-D array of shape (n_samples, n_features)")
        m, cnt = X.shape
        if y.shape != (m,):
            raise ValueError("y must be 1-D with one label per row of X")

        # Small random initial weights; seeded only when random_state is given.
        if self.random_state is None:
            w = np.random.randn(cnt) * 0.01
        else:
            w = np.random.RandomState(self.random_state).randn(cnt) * 0.01
        b = 0.0  # reset so a refit does not inherit the previous bias

        for _ in range(self.n_iterations):
            error = self.sigmoid(X @ w + b) - y
            # Log-loss gradient plus the L2 term (weights only).
            dw = (1 / m) * (X.T @ error) + (self.lambda_param / m) * w
            db = (1 / m) * np.sum(error)
            w = w - self.learning_rate * dw
            b = b - self.learning_rate * db

        self.w = w
        self.b = b

    def sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)).

        ``z`` is clipped to [-500, 500] first so ``np.exp`` cannot overflow;
        inside that range the result is the plain formula.
        """
        z = np.clip(z, -500.0, 500.0)
        return 1 / (1 + np.exp(-z))

    def predict(self, X):
        """Return 0/1 labels: 1 where the predicted probability exceeds ``threshold``."""
        return (self.predict_proba(X) > self.threshold).astype(int)

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for every row of ``X``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted yet.
        """
        if self.w is None:
            raise RuntimeError("MyLogRegression is not fitted yet; call fit() first")
        X = np.asarray(X, dtype=float)
        return self.sigmoid(X @ self.w + self.b)
        
    

In [31]:
# Model instance with a fixed learning rate and iteration count.
model = MyLogRegression(learning_rate=0.001, n_iterations=2000)

# model.fit(X_train, y_train)  -- left disabled because the data is imbalanced

In [32]:
# y_pred = model.predict(X_test)
# print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
# print(f"F1-score: {f1_score(y_test, y_pred):.3f}")

In [33]:
"""Из-за дисбаланса классов модель плохо предсказывает"""

'Из-за дисбаланса классов модель плохо предсказывает'

In [None]:
"""Для того чтобы улучшить метрику F1 нам надо сбалансировать данные"""
"""В этом нам поможет undersampling"""

# Undersampling: keep every positive row and an equally sized random sample
# of the negative rows.
df_0 = train.loc[train['y_count'] == 0]
df_1 = train.loc[train['y_count'] == 1]

df_0_balanced = df_0.sample(n=len(df_1), random_state=42)
train_balanced = (
    pd.concat([df_0_balanced, df_1])
    .sample(frac=1, random_state=42)  # shuffle the combined rows
    .reset_index(drop=True)
)

y_balanced = train_balanced['y_count']
X_balanced = train_balanced.drop(columns='y_count')

# Stratified validation part of the balanced set, for hyperparameter tuning.
X_train_bal, X_val, y_train_bal, y_val = train_test_split(
    X_balanced, y_balanced, stratify=y_balanced, test_size=0.2, random_state=42
)

"""Подберем лямбду"""
# lambda_values = [0.001, 0.01, 0.1, 1, 10]
# best_f1 = 0
# best_lambda = 0

# for lambda_param in lambda_values:
#     model = MyLogRegression(learning_rate=0.01, n_iterations=1000, lambda_param=lambda_param)
#     model.fit(X_train_bal, y_train_bal)
#     y_val_pred = model.predict(X_val)
#     f1 = f1_score(y_val, y_val_pred)
    
#     print(f"lambda={lambda_param}, F1={f1:.3f}")
    
#     if f1 > best_f1:
#         best_f1 = f1
#         best_lambda = lambda_param

# print(f"Лучшая lambda: {best_lambda}, F1: {best_f1:.3f}")

"""Подбираем порог"""
# model.fit(X_train_bal, y_train_bal)

# y_val_proba = model.predict_proba(X_val)

# best_f1 = 0
# best_threshold = 0

# for threshold in np.arange(0.05, 0.5, 0.01):
#     y_val_pred = (y_val_proba > threshold).astype(int)
#     current_f1 = f1_score(y_val, y_val_pred)
#     if current_f1 > best_f1:
#         best_f1 = current_f1
#         best_threshold = threshold

# print(f"Лучший порог: {best_threshold:.2f}, F1 на валидации: {best_f1:.3f}")

# # Проверяем на ТЕСТЕ с найденным порогом
# y_test_proba = model.predict_proba(X_test)
# y_test_pred = (y_test_proba > best_threshold).astype(int)
# test_f1 = f1_score(y_test, y_test_pred)
# print(f"F1 на тесте: {test_f1:.3f}")

'Подираем порог'

In [35]:
"""Обучим модель на сбалансированной выборке"""
# Fit on the whole balanced training set.
model.fit(X=X_balanced, y=y_balanced)

In [37]:
# Score the fitted model on the test split.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {test_accuracy:.3f}")
print(f"F1-score: {test_f1:.3f}")

Accuracy: 0.534
F1-score: 0.313


In [None]:
"""Вывод: градиентный спуск не подходит для этой задачи. Поэтому оставлю как есть и перейду к изучению других тем и алгоритмов"""