In [1]:
import pandas as pd
import pickle
import numpy as np
import re
import pymystem3
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, RandomizedSearchCV, StratifiedKFold
from scipy.stats import lognorm as sp_lognorm
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

In [2]:
# nltk.download('stopwords')
# nltk.download('wordnet')

## Read

In [3]:
train = pd.read_csv('train.csv', sep=';', index_col='id')

In [4]:
train.head()

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,<p>В молодой и дружный коллектив динамично раз...,0
1,<p><strong>Обязанности: </strong>- Прием входя...,0
2,<p><strong>Федеральная сеть аптек </strong>в с...,0
3,<strong>Обязанности:</strong> <ul> <li> <p>раз...,0
4,<p>Производителю сантехники тм Domani Spa треб...,1


In [5]:
train.shape

(31063, 2)

In [6]:
train['target'].value_counts()

0    16471
1    14592
Name: target, dtype: int64

In [7]:
test = pd.read_csv('test.csv', sep=';', index_col='id')

In [8]:
test.shape

(31064, 1)

## Data Preproc

In [9]:
def text_to_wordlist(text):
    text_wo_tags = re.sub("<[^>]*>", "", text.lower())
    text = re.sub('[^a-zA-Zа-яА-ЯёЁ]', ' ', text_wo_tags)
    words = text.lower().strip().split()
    return words

In [10]:
def clean(words, stopWords):
    new_words = [word for word in words if (word not in stopWords) and len(word) > 2]
    return new_words

In [11]:
mystem = pymystem3.Mystem()
wordnet_lemmatizer = WordNetLemmatizer()

def clean_v2(words):    
    new_words = [mystem.lemmatize(x)[0] for x in words]
    return new_words

In [12]:
def clean_v3(words):    
    new_words = [wordnet_lemmatizer.lemmatize(x, pos=wordnet.VERB) for x in words]
    return new_words

In [13]:
def preproc_data(df, stopWords):
    
    temp = df.copy()
    temp['norm_test'] = temp['text'].apply(lambda x: text_to_wordlist(x))
    print('v1 done')
    temp['norm_test_v2'] = temp['norm_test'].apply(lambda x: clean(x, stopWords))
    print('v2 done')
    temp['norm_test_v3'] = temp['norm_test_v2'].apply(lambda x: clean_v2(x))
    print('v3 done')
    temp['norm_test_v4'] = temp['norm_test_v3'].apply(lambda x: clean_v3(x))
    print('v4 done')
    temp['joined_text'] = temp['norm_test_v4'].apply(lambda x: ' '.join(x))
    joined_text = temp['joined_text'].tolist()
    
    return joined_text

In [14]:
%%time

stopWords = stopwords.words(['russian', 'english'])
train_joined_text = preproc_data(train, stopWords)

v1 done
v2 done
v3 done
v4 done
CPU times: user 1min 55s, sys: 36.9 s, total: 2min 32s
Wall time: 5min 45s


In [15]:
%%time
test_joined_text = preproc_data(test, stopWords)

v1 done
v2 done
v3 done
v4 done
CPU times: user 1min 56s, sys: 37.8 s, total: 2min 34s
Wall time: 5min 42s


In [28]:
prepr_train = pd.DataFrame(train_joined_text, columns=['text'])
prepr_train['target'] = train['target']

In [30]:
prepr_test = pd.DataFrame(test_joined_text, columns=['text'])

In [31]:
prepr_train.to_csv('prepr_train.csv', index=False)

In [32]:
prepr_test.to_csv('prepr_test.csv', index=False)

## Read preproc data

In [33]:
prepr_train = pd.read_csv('prepr_train.csv')
prepr_train.head()

Unnamed: 0,text,target
0,молодой дружный коллектив динамично развиватьс...,0
1,обязанность прием входить звонок заявка обрабо...,0
2,федеральный сеть аптека связь активный расшире...,0
3,обязанность разработка проектный рабочий докум...,0
4,производитель сантехник domani spa требоваться...,1


In [38]:
X_test = pd.read_csv('prepr_test.csv')
X_test.head()

Unnamed: 0,text
0,крупный компания организация приготовление кор...
1,обязанность обеспечение необходимый функционал...
2,обязанность отгрузка прием товар склад склад к...
3,обязанность приготовление холодный горячий блю...
4,вакансия срочный внимание просьба подробно изу...


In [35]:
X = prepr_train.drop('target', axis=1)
y = prepr_train.target

In [36]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)

In [116]:
train_list = X_train['text'].tolist()
valid_list = X_valid['text'].tolist()
test_list = X_test['text'].tolist()

## Train

In [169]:
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
import random

In [117]:
text_list = train_list + valid_list + test_list

In [144]:
tfidf = TfidfVectorizer()
tfidf.fit_transform(text_list)

<62127x73965 sparse matrix of type '<class 'numpy.float64'>'
	with 5038040 stored elements in Compressed Sparse Row format>

In [146]:
matr_train = tfidf.transform(train_list)
matr_valid = tfidf.transform(valid_list)

In [170]:
# param_grid = {'n_estimators':  range(150, 400, 50),
#           'learning_rate': [.1],
#           'max_depth': range(2, 8, 2),
#           'subsample': [ .5, .6, .95, 1, .1, .3],
#           'gamma': [0, .05, .1, .2, .25, 0.01, 0.02],
#           'colsample_bytree': [1, .10, .50],
#           'colsample_bylevel': [1, .3, .1, .5, .7, .9],
#           'reg_alpha': [0, 0.1, 0.3, 0.5, 0.7, 1],
#           'reg_lambda': [0, 0.1, 0.3, 0.5, 0.7, 1],
#           'scale_pos_weight': [1, 10, 25, 50]}

# clf = XGBClassifier()

In [148]:
pipe = Pipeline([
    ('clf', LogisticRegression(random_state=42))
])

In [128]:
# pipe = Pipeline([
#     ('tfidf', TfidfVectorizer()),
#     ('logreg', LogisticRegression(random_state=42)) 
# ])

In [129]:
# pickle.dump(tfidf, open('tf_idf.pickle', 'wb'))

In [149]:
# param_grid = {
#    # 'tfidf__ngram_range': range(1, 3), 
#    # 'tfidf__max_df': random.uniform(0, 1),
#    # 'tfidf__max_features': range(5000, 45000), 
#     'clf__penalty': ['l1', 'l2'],
#     'clf__C': sp_lognorm(4)
# }

In [171]:
# Задаем схему кросс-валидации
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

# Запускаем поиск гиперпараметров
hyper_search = RandomizedSearchCV(pipe, param_grid, n_iter=30, scoring='roc_auc', cv=cv,
                                  n_jobs=-1, refit=True, random_state=42, verbose=2)

hyper_search.fit(matr_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 29.2min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 131.2min finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
          error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=30, n_jobs=-1,
          param_distributions={'n_estimators': range(150, 400, 50), 'learning_rate': [0.1], 'max_depth': range(2, 10, 2), 'subsample': [0.5, 0.6, 0.95, 1, 0.1, 0.3], 'gamma': [0, 0.05, 0.1, 0.2, 0.25, 0.01, 0.02], 'colsample_bytree': [1, 0.1, 0.5], 'colsample_bylevel': [1, 0.3, 0.1, 0.5, 0.7, 0.9], 'reg_alpha': [0, 0.1, 0.3, 0.5, 0.7, 1], 'reg_lambda': [0, 0.1, 0.3, 0.5, 0.7, 1], 'scale_pos_weight': [

In [172]:
hyper_search.best_params_

{'subsample': 1,
 'scale_pos_weight': 50,
 'reg_lambda': 0.1,
 'reg_alpha': 0.5,
 'n_estimators': 300,
 'max_depth': 8,
 'learning_rate': 0.1,
 'gamma': 0.25,
 'colsample_bytree': 0.5,
 'colsample_bylevel': 1}

In [178]:
hyper_search.best_score_

0.9897515273504104

In [179]:
best_model = hyper_search.best_estimator_

In [183]:
pickle.dump(best_model, open('best_model_xgb.pickle', 'wb'))

In [184]:
pred_valid = best_model.predict_proba(matr_valid)

In [185]:
roc_auc_score(y_valid, pred_valid[:, 1])

0.9912249697249966

## Predict

In [186]:
best_model = pickle.load(open('best_model_xgb.pickle', 'rb'))

In [187]:
matr_test = tfidf.transform(test_list)

In [188]:
pred = best_model.predict_proba(matr_test)

In [189]:
test['target'] = pred[:,1]

In [190]:
test.drop('text', axis=1).to_csv('submission_5.csv')