In [1]:
import pandas as pd
pd.set_option('display.max_column', 999)
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score , StratifiedKFold
from tqdm import tqdm_notebook
from xgboost import plot_importance
import pymorphy2
from natasha import PersonExtractor
from matplotlib import pyplot as plt
import pymystem3
from sklearn.preprocessing import LabelEncoder

from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas(desc="apply")

%matplotlib inline



In [2]:
def validate(x , y):
    model = LGBMClassifier(n_estimators=670 ,learning_rate=0.09 , num_leaves=53 , subsample=0.9 , subsample_for_bin=60000 , min_child_samples = 10 , random_state = 42)
    cv = StratifiedKFold(4 ,shuffle=True, random_state=99)
    score = cross_val_score(model , x , y , scoring='roc_auc' , cv=cv)
    print (score.mean() , score.std() , '\n')

### Загружаем данные

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
all_data = pd.concat([train , test])

sample = pd.read_csv('data/sample_submission.csv')

### pymorphy2

In [4]:
%%time
morph = pymorphy2.MorphAnalyzer()

def name_score(word):
    for p in morph.parse(word):
        if 'Name' in p.tag:
            return p.score
    return 0

def surn_score(word):
    for p in morph.parse(word):
        if 'Surn' in p.tag:
            return p.score
    return 0


all_data['pymorphy'] = all_data['Word'].apply(lambda x: morph.tag(x)[0])

all_data['pymorphy_animacy'] = all_data['pymorphy'].apply(lambda x: x.animacy)
all_data['pymorphy_POS'] = all_data['pymorphy'].apply(lambda x: x.POS)
all_data['pymorphy_case'] = all_data['pymorphy'].apply(lambda x: x.case)
all_data['pymorphy_number'] = all_data['pymorphy'].apply(lambda x: x.number)
all_data['pymorphy_gender'] = all_data['pymorphy'].apply(lambda x: x.gender)

all_data.drop('pymorphy' , axis=1 , inplace=True)

columns_to_one_hot = ['pymorphy_animacy', 'pymorphy_POS', 'pymorphy_case','pymorphy_number', 'pymorphy_gender']

for col in columns_to_one_hot:
    all_data[col] = LabelEncoder().fit_transform(list(all_data[col].fillna('nan')))

CPU times: user 43.6 s, sys: 68 ms, total: 43.7 s
Wall time: 43.7 s


### cross-val-score

In [5]:
new_train = all_data[all_data['Label'].notnull()]
new_test = all_data[all_data['Label'].isnull()]
validate(new_train.drop(['Label' , 'Word'] , axis=1) , new_train['Label'])

0.775153135333 0.00359747723732 



### Если добавить основные функции строк , isupper , isupper , len , и другие фишки, легко можно получить 0.88

In [None]:
### score on leaderboard 0.95399
model = LGBMClassifier(n_estimators=670 ,learning_rate=0.09 , num_leaves=53 , subsample=0.9 , subsample_for_bin=60000 , min_child_samples = 10 , random_state = 42)
model.fit(new_train.drop(['Label' , 'Word'] , axis=1) , new_train['Label'])
sample['Prediction'] = model.predict_proba(new_test.drop(['Word' , 'Label'] , axis=1))[:,0]
sample.to_csv('submit.csv' , index=False)