In [28]:
import pandas as pd
pd.set_option('display.max_column', 999)
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score , StratifiedKFold

import pymorphy2
from sklearn.preprocessing import LabelEncoder

%matplotlib inline

### Функция для кросс-валидации модели градиентного бустинга

In [29]:
def validate(x , y):
    """Cross-validate the gradient-boosting classifier and print its ROC AUC.

    Runs a 4-fold stratified, shuffled cross-validation (split seed 99) of an
    XGBClassifier on features ``x`` and labels ``y``, then prints the mean and
    the standard deviation of the per-fold ROC AUC scores.

    Nothing is returned; the result is only printed.
    """
    folds = StratifiedKFold(n_splits=4, shuffle=True, random_state=99)
    booster = XGBClassifier(
        max_depth=10,
        n_estimators=670,
        learning_rate=0.09,
        colsample_bytree=0.9,
        colsample_bylevel=0.6,
    )
    fold_auc = cross_val_score(booster, x, y, scoring='roc_auc', cv=folds)
    print(fold_auc.mean(), fold_auc.std(), '\n')

### Загружаем данные

In [30]:
# Both files are read as comma-separated text without a header row; the
# columns are named 'Word' and 'Answer' explicitly.
train, test = (
    pd.read_csv(path, sep=',', header=None, names=['Word', 'Answer'])
    for path in ('linear_train.txt', 'linear_test.txt')
)


### Сгенерируем дополнительные признаки

In [31]:
def _add_case_features(frame):
    """Add letter-case and length columns to ``frame`` in place.

    Columns are derived from ``frame['Word']`` and added in this order:

    * ``Upper``     - 1 if the first character is uppercase and the rest of
                      the word is lowercase, else 0.
    * ``Lower``     - 1 if ``str.islower()`` holds for the whole word, else 0.
    * ``All_upper`` - 1 if ``str.isupper()`` holds for the whole word, else 0.
    * ``Mixed``     - 1 if the word is neither all-lower nor all-upper and is
                      not already flagged by ``Upper``, else 0.
    * ``Length``    - number of characters in the word.
    """
    words = frame['Word']

    frame['Upper'] = words.apply(lambda w: int(w[0].isupper() and w[1:].islower()))
    frame['Lower'] = words.apply(lambda w: int(w.islower()))
    frame['All_upper'] = words.apply(lambda w: int(w.isupper()))

    # Capitalised words are neither all-upper nor all-lower, so they are
    # subtracted out to keep 'Mixed' disjoint from 'Upper'.
    neither_case = words.apply(lambda w: int(not (w.isupper() or w.islower())))
    frame['Mixed'] = neither_case - frame['Upper']

    frame['Length'] = words.apply(len)


_add_case_features(train)
_add_case_features(test)


In [32]:
# Template of the submission file; its 'Answer' column is filled in later.
sample = pd.read_csv('linear_ans_example.txt', sep=',')

# Stack train on top of test so that the features below are computed
# once for both sets.
all_data = pd.concat([train, test], axis=0)

### Используем морфологический анализатор pymorphy2

In [33]:
%%time
morph = pymorphy2.MorphAnalyzer()

# First tag returned by the analyzer for every word; kept as a local Series
# instead of a temporary DataFrame column.
first_tags = all_data['Word'].apply(lambda word: morph.tag(word)[0])

# One column per grammatical attribute read from the tag.
columns_to_one_hot = []
for grammeme in ['animacy', 'POS', 'case', 'number', 'gender']:
    feature_name = 'pymorphy_' + grammeme
    all_data[feature_name] = first_tags.apply(lambda tag, attr=grammeme: getattr(tag, attr))
    columns_to_one_hot.append(feature_name)

# Despite the list's name, the columns are label-encoded to integer codes
# (not one-hot encoded); missing values become the literal category 'nan'.
for col in columns_to_one_hot:
    filled = all_data[col].fillna('nan')
    all_data[col] = LabelEncoder().fit_transform(list(filled))

Wall time: 47.9 s


### cross-val-score

In [21]:
# Rows with a known 'Answer' form the training part; the rest is the test part.
has_answer = all_data['Answer'].notnull()
new_train = all_data[has_answer]
new_test = all_data[~has_answer]

# Cross-validate on every column except the target and the raw word.
validate(
    new_train.drop(['Answer', 'Word'], axis=1),
    new_train['Answer'],
)

0.8838351830834492 0.0019968552838658163 



### Обучаем модель и сохраняем предсказания в файл

In [34]:
# Same hyper-parameters as the model cross-validated in validate().
model = XGBClassifier(
    max_depth=10,
    n_estimators=670,
    learning_rate=0.09,
    colsample_bytree=0.9,
    colsample_bylevel=0.6,
)

# Fit on all labelled rows; the target and the raw word are not features.
# The fit call is the last expression, so the fitted estimator is displayed.
model.fit(
    new_train.drop(['Answer', 'Word'], axis=1),
    new_train['Answer'],
)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.6,
       colsample_bytree=0.9, gamma=0, learning_rate=0.09, max_delta_step=0,
       max_depth=10, min_child_weight=1, missing=None, n_estimators=670,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [None]:
# predict_proba returns one column per class, ordered as in model.classes_.
# Column 0 is the probability of the first (negative) class; submitting it
# would invert the ranking relative to the ROC AUC used in validation.
# Column 1 is the probability of the positive class.
# NOTE(review): assumes 'Answer' is a 0/1 label with 1 as the positive class - confirm.
test_features = new_test.drop(['Word', 'Answer'], axis=1)
sample['Answer'] = model.predict_proba(test_features)[:, 1]

# Write the submission without the index column.
sample.to_csv('submit.csv', index=False)

In [None]:
# Display the feature matrix that is fed to the model for the test rows.
new_test.drop(columns=['Word', 'Answer'])