## Решение задачи бинарной классификации

### Логистическая регрессия

$y \in \{-1, +1\}$

$b(x) = \sigma(\langle w, x \rangle)$, где $\sigma(z) = \frac{1}{1 + e^{-z}}$

Логистическая регрессия - это линейный классификатор, который кроме классов умеет предсказывать вероятности классов, а именно, $b(x) = P(y = +1 | x)$


In [13]:
import pandas as pd
import numpy as np
import seaborn as sns

In [14]:
# Fix the seed of NumPy's global random generator so that runs are reproducible.
np.random.seed(42)

In [15]:
# Load the bike-buyers dataset from a CSV file located in the working directory.
data = pd.read_csv('bike_buyers_clean.csv')

In [16]:
# Display the loaded table (pandas renders a truncated head/tail preview).
data

Unnamed: 0,ID,Marital Status,Gender,Income,Children,Education,Occupation,Home Owner,Cars,Commute Distance,Region,Age,Purchased Bike
0,12496,Married,Female,40000,1,Bachelors,Skilled Manual,Yes,0,0-1 Miles,Europe,42,No
1,24107,Married,Male,30000,3,Partial College,Clerical,Yes,1,0-1 Miles,Europe,43,No
2,14177,Married,Male,80000,5,Partial College,Professional,No,2,2-5 Miles,Europe,60,No
3,24381,Single,Male,70000,0,Bachelors,Professional,Yes,1,5-10 Miles,Pacific,41,Yes
4,25597,Single,Male,30000,0,Bachelors,Clerical,No,0,0-1 Miles,Europe,36,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,23731,Married,Male,60000,2,High School,Professional,Yes,2,2-5 Miles,North America,54,Yes
996,28672,Single,Male,70000,4,Graduate Degree,Professional,Yes,0,2-5 Miles,North America,35,Yes
997,11809,Married,Male,60000,2,Bachelors,Skilled Manual,Yes,0,0-1 Miles,North America,38,Yes
998,19664,Single,Male,100000,3,Bachelors,Management,No,3,1-2 Miles,North America,38,No


# Обзор данных

In [17]:
# Check the dtype of every column in the dataset
# (int64 columns are numeric, object columns hold strings/categories).
data.dtypes

ID                   int64
Marital Status      object
Gender              object
Income               int64
Children             int64
Education           object
Occupation          object
Home Owner          object
Cars                 int64
Commute Distance    object
Region              object
Age                  int64
Purchased Bike      object
dtype: object

Оставим в данных только числовые колонки (кроме `ID` — это идентификатор записи, а не признак).

In [18]:
# Names of the numeric (int64) feature columns.
# The list is derived from `data`, not from `X`: `X` is only created in the
# next cell, so referencing it here fails on a fresh kernel (Restart & Run All).
# `ID` is also int64 but is dropped explicitly, so the result is the feature
# list used by the rest of the notebook: Income, Children, Cars, Age.
num_cols = data.columns[data.dtypes == 'int64'].drop('ID').tolist()
num_cols

['Income', 'Children', 'Cars', 'Age']

In [19]:
# Feature matrix: all rows, numeric feature columns only.
X = data.loc[:, num_cols]
# Target: the raw 'Yes' / 'No' purchase label.
y = data.loc[:, 'Purchased Bike']

Проверим сбалансированность классов.

In [20]:
# Share of each class (normalize=True returns fractions instead of counts).
# The two classes are roughly balanced (about 52% 'No' vs 48% 'Yes').
y.value_counts(normalize=True)

No     0.519
Yes    0.481
Name: Purchased Bike, dtype: float64

Переведем классы в числа 0 и 1.

In [21]:
# Encode the target as integers: 'Yes' -> 1, any other value -> 0.
# The encoding is computed from the original column rather than from `y`
# itself, which makes this cell idempotent: re-encoding an already numeric
# `y` would compare integers with 'Yes' and silently turn every label into 0.
y = (data['Purchased Bike'] == 'Yes').astype(int)
y

0      0
1      0
2      0
3      1
4      1
      ..
995    1
996    1
997    1
998    0
999    1
Name: Purchased Bike, Length: 1000, dtype: int32

## Масштабирование числовых признаков

In [22]:
# Look at the raw Income column before any scaling is applied.
X['Income']

0       40000
1       30000
2       80000
3       70000
4       30000
        ...  
995     60000
996     70000
997     60000
998    100000
999     60000
Name: Income, Length: 1000, dtype: int64

In [23]:
from sklearn.preprocessing import StandardScaler

# Demonstration of standardization on a single column.
# StandardScaler needs 2-D input, so the column is selected with a list
# (`X[['Income']]` is a one-column DataFrame, not a Series).
scaler = StandardScaler()
income_scaled = scaler.fit_transform(X[['Income']])

# Show only the first rows: printing the whole 1000x1 array floods the
# notebook output without adding information.
income_scaled[:5]

array([[-0.51953796],
       [-0.8414326 ],
       [ 0.76804062],
       [ 0.44614598],
       [-0.8414326 ],
       [-1.48522189],
       [ 3.34319779],
       [-0.51953796],
       [-1.16332725],
       [-1.16332725],
       [-0.8414326 ],
       [ 1.08993527],
       [ 3.66509243],
       [-0.51953796],
       [ 0.12425133],
       [-1.48522189],
       [-0.8414326 ],
       [-0.8414326 ],
       [-0.51953796],
       [-1.16332725],
       [-0.51953796],
       [ 0.76804062],
       [-0.51953796],
       [ 0.76804062],
       [-0.51953796],
       [-0.8414326 ],
       [-0.8414326 ],
       [ 1.41182991],
       [ 0.44614598],
       [-1.16332725],
       [-1.16332725],
       [-1.48522189],
       [-1.16332725],
       [ 0.76804062],
       [ 1.08993527],
       [-1.48522189],
       [-1.48522189],
       [-0.8414326 ],
       [-1.16332725],
       [-1.48522189],
       [-0.8414326 ],
       [-0.51953796],
       [-1.48522189],
       [ 3.66509243],
       [-1.16332725],
       [-1

Важные замечания:
- классы sklearn возвращают numpy arrays, а не pandas dataframe, что неудобно
- любое преобразование признаков в ML-задачах нужно обучать только на тренировочных данных

In [24]:
# Split the data first, then fit the scaler on the training part only,
# so that no statistics of the test set leak into the preprocessing.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# An explicit random_state keeps the split reproducible even when this cell
# is re-run on its own; without it every re-run draws a new split from the
# global NumPy random generator.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# StandardScaler is an unsupervised transformer, so the target is not needed.
scaler.fit(X_train)

# transform() returns NumPy arrays; they are wrapped back into DataFrames
# in the next cell.
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

In [25]:
# Wrap the scaled NumPy arrays back into DataFrames, reusing the row labels
# and column names of the corresponding unscaled splits.
X_train = pd.DataFrame(data=X_train_sc, columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(data=X_test_sc, columns=X_test.columns, index=X_test.index)

# Preview the first rows of the scaled training features.
X_train.head()

Unnamed: 0,Income,Children,Cars,Age
82,-0.809901,-1.146471,-1.276343,0.258456
991,0.160945,-0.526086,-1.276343,-0.705497
789,0.160945,0.094299,0.514597,0.346088
894,0.48456,0.714683,-1.276343,-0.793129
398,-0.809901,-0.526086,-0.380873,-0.442601


Обучим логистическую регрессию и посмотрим на качество модели.

In [26]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default hyperparameters on the scaled
# training features (fit() returns the estimator itself, so the call can be
# chained), then predict class labels for the test set.
model = LogisticRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score

# sklearn metrics take (y_true, y_pred) in that order. Accuracy itself is
# symmetric, but keeping the documented order avoids wrong results if the
# metric is later swapped for an asymmetric one (precision, recall, ...).
accuracy_score(y_test, y_pred)

0.572

# Соберем сделанные преобразования данных в pipeline

In [28]:
from sklearn.pipeline import Pipeline

# Chain the preprocessing step and the classifier into a single estimator.
# Each step is a (name, estimator) pair; the names are used later to access
# individual steps, e.g. pipe['model_'].
pipe = Pipeline(steps=[
    ('scaler_', StandardScaler()),
    ('model_', LogisticRegression()),
])

In [30]:
# Example of using the pipeline.
# The pipeline performs its own scaling, so it must receive the raw
# (unscaled) features. X_train / X_test were already standardized in an
# earlier cell, and passing them here would scale the data twice.
# The raw rows are recovered from X through the row labels of the splits.
X_train_raw = X.loc[X_train.index]
X_test_raw = X.loc[X_test.index]

pipe.fit(X_train_raw, y_train)

y_pred = pipe.predict(X_test_raw)

print(accuracy_score(y_test, y_pred))

0.572


## Интерпретация результатов

In [36]:
# Learned parameters of the fitted model: feature weights and the intercept.
model.coef_, model.intercept_

(array([[ 0.40584271, -0.09346806, -0.63438378, -0.10676724]]),
 array([-0.09133225]))

In [35]:
# Weights of the classifier inside the pipeline, accessed by its step name.
pipe['model_'].coef_

array([[ 0.40584271, -0.09346806, -0.63438378, -0.10676724]])

In [39]:
# Collect the model parameters in one table: one row per coefficient vector,
# one column per feature, plus a separate column for the intercept.
n_rows = len(model.coef_)
coefs = pd.DataFrame(
    data=model.coef_,
    columns=X_train.columns,
    index=np.arange(n_rows),
).assign(Intercept=model.intercept_)
coefs

Unnamed: 0,Income,Children,Cars,Age,Intercept
0,0.405843,-0.093468,-0.634384,-0.106767,-0.091332
