In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

### Справка о данных:

Данные представляют собой информацию о работниках. Данные содержат информацию за 10 лет и показывают текущих сотрудников и тех, кто уволился (STATUS сотрудника). 

Наша цель заключается в том, чтобы увидеть, можно ли предсказать STATUS сотрудника на основе представленных данных. 

Целевая переменная - STATUS (ACTIVE или TERMINATED).

In [3]:
data = pd.read_csv('MFG10YearTerminationData.csv')

data

Unnamed: 0,EmployeeID,recorddate_key,birthdate_key,orighiredate_key,terminationdate_key,age,length_of_service,city_name,department_name,job_title,store_name,gender_short,gender_full,termreason_desc,termtype_desc,STATUS_YEAR,STATUS,BUSINESS_UNIT
0,1318,12/31/2006 0:00,1/3/1954,8/28/1989,1/1/1900,52,17,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2006,ACTIVE,HEADOFFICE
1,1318,12/31/2007 0:00,1/3/1954,8/28/1989,1/1/1900,53,18,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2007,ACTIVE,HEADOFFICE
2,1318,12/31/2008 0:00,1/3/1954,8/28/1989,1/1/1900,54,19,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2008,ACTIVE,HEADOFFICE
3,1318,12/31/2009 0:00,1/3/1954,8/28/1989,1/1/1900,55,20,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2009,ACTIVE,HEADOFFICE
4,1318,12/31/2010 0:00,1/3/1954,8/28/1989,1/1/1900,56,21,Vancouver,Executive,CEO,35,M,Male,Not Applicable,Not Applicable,2010,ACTIVE,HEADOFFICE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49648,8258,12/1/2015 0:00,5/28/1994,8/19/2013,12/30/2015,21,2,Valemount,Dairy,Dairy Person,34,M,Male,Layoff,Involuntary,2015,TERMINATED,STORES
49649,8264,8/1/2013 0:00,6/13/1994,8/27/2013,8/30/2013,19,0,Vancouver,Customer Service,Cashier,44,F,Female,Resignaton,Voluntary,2013,TERMINATED,STORES
49650,8279,12/1/2015 0:00,7/18/1994,9/15/2013,12/30/2015,21,2,White Rock,Customer Service,Cashier,39,F,Female,Layoff,Involuntary,2015,TERMINATED,STORES
49651,8296,12/1/2013 0:00,9/2/1994,10/9/2013,12/31/2013,19,0,Kelowna,Customer Service,Cashier,16,F,Female,Resignaton,Voluntary,2013,TERMINATED,STORES


In [4]:
data.isnull().sum()

EmployeeID             0
recorddate_key         0
birthdate_key          0
orighiredate_key       0
terminationdate_key    0
age                    0
length_of_service      0
city_name              0
department_name        0
job_title              0
store_name             0
gender_short           0
gender_full            0
termreason_desc        0
termtype_desc          0
STATUS_YEAR            0
STATUS                 0
BUSINESS_UNIT          0
dtype: int64

In [5]:
def preprocess(data):
    data = data.copy()
    
    # Удалим ненужные признаки
    data = data.drop(['EmployeeID', 'gender_short'], axis=1)
    
    data = data.drop(['terminationdate_key', 'termreason_desc', 'termtype_desc', 'length_of_service'], axis=1)
    
    y = data['STATUS']
    X = data.drop('STATUS', axis=1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = preprocess(data)

In [7]:
X_train.head()

Unnamed: 0,recorddate_key,birthdate_key,orighiredate_key,age,city_name,department_name,job_title,store_name,gender_full,STATUS_YEAR,BUSINESS_UNIT
36271,12/31/2011 0:00,8/18/1979,10/30/2005,32,Abbotsford,Bakery,Baker,1,Male,2011,STORES
3950,12/31/2008 0:00,4/27/1953,12/8/1991,55,Vancouver,Produce,Produce Clerk,41,Female,2008,STORES
26963,12/31/2006 0:00,9/26/1971,8/29/2001,35,New Westminster,Bakery,Baker,21,Male,2006,STORES
29451,12/31/2011 0:00,10/16/1973,9/30/2002,38,Terrace,Processed Foods,Shelf Stocker,32,Male,2011,STORES
3790,12/31/2008 0:00,3/10/1953,11/13/1991,55,Burnaby,Meats,Meat Cutter,5,Male,2008,STORES


Начнем создавать трансформер для данных. Для начала преобразуем временной признак, выделив отдельно год, месяц и день.

In [8]:
class Date_Transform:
    
    def fit(self, X, y):
        return self
    
    def transform(self, X):
        
        for col in X.columns:
            
            X[col] = pd.to_datetime(X[col])
            X[col + '-year'] = X[col].apply(lambda x: x.year)
            X[col + '-month'] = X[col].apply(lambda x: x.month)
            X[col + '-day'] = X[col].apply(lambda x: x.day)
            
            X = X.drop(col, axis=1)
        return X

Будем изменять данные основываясь на типах признаков.

In [9]:
# Разобьем признаки на типы

binary = [
    'gender_full',
    'BUSINESS_UNIT'
]
nominal = [
    'city_name',
    'department_name',
    'job_title'
]
date = [
    'recorddate_key',
    'birthdate_key',
    'orighiredate_key'
]

# Будет обрабатывать каждый тип признаков

transformer_for_binary = Pipeline(steps=[
    ('ordinal', OrdinalEncoder(categories='auto'))
])
transformer_for_nominal = Pipeline(steps=[
    ('nominal', OneHotEncoder())
])
transformer_for_date = Pipeline(steps=[
    ('date', Date_Transform())
])

In [10]:
preprocess_transformer = ColumnTransformer(transformers=[
    ('ordinal', transformer_for_binary, binary),
    ('nominal', transformer_for_nominal, nominal),
    ('date', transformer_for_date, date)
], sparse_threshold=0)

Наконец делаем финальную модель

In [11]:
model = Pipeline(steps=[
    ('preprop', preprocess_transformer),
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

In [12]:
model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprop',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0, transformer_weights=None,
                                   transformers=[('ordinal',
                                                  Pipeline(memory=None,
                                                           steps=[('ordinal',
                                                                   OrdinalEncoder(categories='auto',
                                                                                  dtype=<class 'numpy.float64'>))],
                                                           verbose=False),
                                                  ['gender_full',
                                                   'BUSINESS_UNIT']),
                                                 ('nominal',
                                                  Pipeline(memory=None,
                               

In [15]:
accuracy = model.score(X_test, y_test)

print("Accuracy: {:1.2f}%".format(accuracy*100))

Accuracy: 100.00%
