In [1]:
# import the required libraries, classes and functions
import numpy as np
import pandas as pd
import dask.dataframe as dd
from dask_ml.impute import SimpleImputer
from dask_ml.preprocessing import (StandardScaler, 
                                   DummyEncoder, 
                                   Categorizer)
from pandas.api.types import CategoricalDtype
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from dask_ml.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from dask_ml.model_selection import train_test_split

In [2]:
# Load the semicolon-delimited CSV into a lazy Dask DataFrame
df_dask = dd.read_csv(
    'Data/StateFarm_missing.csv',
    sep=';',
)
# preview the first five rows
df_dask.head(5)

Unnamed: 0,Customer Lifetime Value,Coverage,Education,EmploymentStatus,Gender,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Response
0,2763.519279,Basic,Bachelor,Employed,F,56274.0,,32.0,5.0,,1.0,No
1,,,Bachelor,Unemployed,F,0.0,,13.0,42.0,,,No
2,,,,Employed,F,48767.0,108.0,,38.0,0.0,,No
3,7645.861827,Basic,Bachelor,,,0.0,106.0,18.0,,,7.0,No
4,2813.692575,Basic,Bachelor,,M,43836.0,73.0,12.0,,,1.0,No


In [3]:
# Recode the target labels as integers: 'No' -> 0, 'Yes' -> 1
response_codes = {'No': 0, 'Yes': 1}
df_dask['Response'] = df_dask['Response'].replace(response_codes)

In [4]:
# Split off the target: keep the 'Response' column as the label series
# and remove it from the feature frame
y_dask = df_dask['Response']
df_dask = df_dask.drop('Response', axis=1)

In [5]:
# Shuffle and split features/labels into train (70%) and test (30%) parts;
# the fixed random_state keeps the split reproducible
splits = train_test_split(df_dask, y_dask,
                          test_size=0.3,
                          shuffle=True,
                          random_state=42)
X_train, X_test, y_train, y_test = splits

In [6]:
# Names of the numeric (quantitative) feature columns
numeric_part = X_train.select_dtypes(include='number')
number = numeric_part.columns.tolist()
number

['Customer Lifetime Value',
 'Income',
 'Monthly Premium Auto',
 'Months Since Last Claim',
 'Months Since Policy Inception',
 'Number of Open Complaints',
 'Number of Policies']

In [7]:
# Names of the categorical (object-dtype) feature columns
object_part = X_train.select_dtypes(include='object')
categ = object_part.columns.tolist()
categ

['Coverage', 'Education', 'EmploymentStatus', 'Gender']

In [8]:
# Show the frequency of every level (missing values included) for each
# categorical feature, with a blank line after each table
for col in categ:
    level_counts = X_train[col].value_counts(dropna=False).compute()
    print(level_counts, end="\n\n")

Basic       3531
Extended    1760
Premium      531
NaN            4
Name: Coverage, dtype: int64

Bachelor                1729
College                 1729
High School or Below    1695
Master                   457
Doctor                   213
NaN                        3
Name: Education, dtype: int64

Employed         3642
Unemployed       1452
Medical Leave     276
Disabled          271
Retired           182
NaN                 3
Name: EmploymentStatus, dtype: int64

F      3024
M      2799
NaN       3
Name: Gender, dtype: int64



In [9]:
# Treat missing values as a category of their own: casting to str turns
# NaN into the literal string 'nan'.
# The cast is applied to BOTH splits. The category lists are derived from
# the training data and therefore contain the 'nan' level; real NaN values
# left in X_test would not match that level, so the test split would be
# encoded differently from the training split.
for col in categ:
    X_train[col] = X_train[col].astype(str)
    X_test[col] = X_test[col].astype(str)

# inspect the distinct values of each categorical feature (training split)
for col in categ:
    print(X_train[col].unique().compute())
    print("")

0       Basic
1    Extended
2     Premium
3         nan
Name: Coverage, dtype: object

0                 College
1                Bachelor
2    High School or Below
3                  Master
4                     nan
5                  Doctor
Name: Education, dtype: object

0    Medical Leave
1         Employed
2       Unemployed
3          Retired
4         Disabled
5              nan
Name: EmploymentStatus, dtype: object

0      M
1      F
2    nan
Name: Gender, dtype: object



In [10]:
# Sorted list of distinct levels for each categorical feature,
# computed from the training split
coverage_lst, educ_lst, empl_lst, gender_lst = (
    sorted(X_train[col].unique().compute().tolist())
    for col in ('Coverage', 'Education', 'EmploymentStatus', 'Gender')
)

# Map every categorical feature to an unordered categorical dtype
# with its fixed list of levels
categories = {
    col: CategoricalDtype(levels, ordered=False)
    for col, levels in zip(
        ('Coverage', 'Education', 'EmploymentStatus', 'Gender'),
        (coverage_lst, educ_lst, empl_lst, gender_lst),
    )
}

In [11]:
# Numeric branch: impute missing values, then standardize
num_pipe = Pipeline(steps=[('imp', SimpleImputer()),
                           ('scaler', StandardScaler())])

# Categorical branch: cast to the fixed categorical dtypes, then
# expand into dummy (one-hot) columns
cat_pipe = Pipeline(steps=[('categ', Categorizer(categories=categories)),
                           ('dum', DummyEncoder())])

# Route each group of columns to its own branch
transformers = [
    ('num', num_pipe, number),
    ('cat', cat_pipe, categ),
]
transformer = ColumnTransformer(transformers)

# Final pipeline: column-wise preprocessing followed by
# logistic regression fitted without an intercept term
ml_pipe = Pipeline(steps=[
    ('tf', transformer),
    ('logreg', LogisticRegression(fit_intercept=False, n_jobs=-1)),
])

In [12]:
# Fit the pipeline on the training split and predict labels for the
# test split (fit returns the fitted pipeline, so the calls can be chained)
pred = ml_pipe.fit(X_train, y_train).predict(X_test)
# accuracy on the test split
accuracy_score(y_true=y_test, y_pred=pred)

0.8986623429266315