In [1]:
import pandas as pd
import pickle
import numpy as np
import re
import pymystem3
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, RandomizedSearchCV, StratifiedKFold
from scipy.stats import lognorm as sp_lognorm
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_auc_score

In [2]:
# nltk.download('stopwords')
# nltk.download('wordnet')

## Read

In [3]:
train = pd.read_csv('train.csv', sep=';', index_col='id')

In [4]:
train.head()

Unnamed: 0_level_0,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,<p>В молодой и дружный коллектив динамично раз...,0
1,<p><strong>Обязанности: </strong>- Прием входя...,0
2,<p><strong>Федеральная сеть аптек </strong>в с...,0
3,<strong>Обязанности:</strong> <ul> <li> <p>раз...,0
4,<p>Производителю сантехники тм Domani Spa треб...,1


In [5]:
train.shape

(31063, 2)

In [6]:
train['target'].value_counts()

0    16471
1    14592
Name: target, dtype: int64

In [7]:
test = pd.read_csv('test.csv', sep=';', index_col='id')

## Data Preproc

In [8]:
def text_to_wordlist(text):
    text_wo_tags = re.sub("<[^>]*>", "", text.lower())
    text = re.sub('[^a-zA-Zа-яА-ЯёЁ]', ' ', text_wo_tags)
    words = text.lower().strip().split()
    return words

In [9]:
def clean(words, stopWords):
    new_words = [word for word in words if (word not in stopWords) and len(word) > 2]
    return new_words

In [10]:
mystem = pymystem3.Mystem()
wordnet_lemmatizer = WordNetLemmatizer()

def clean_v2(words):    
    new_words = [mystem.lemmatize(x)[0] for x in words]
    return new_words

In [11]:
def clean_v3(words):    
    new_words = [wordnet_lemmatizer.lemmatize(x, pos=wordnet.VERB) for x in words]
    return new_words

In [12]:
def preproc_data(df, stopWords):
    
    temp = df.copy()
    temp['norm_test'] = temp['text'].apply(lambda x: text_to_wordlist(x))
    print('v1 done')
    temp['norm_test_v2'] = temp['norm_test'].apply(lambda x: clean(x, stopWords))
    print('v2 done')
    temp['norm_test_v3'] = temp['norm_test_v2'].apply(lambda x: clean_v2(x))
    print('v3 done')
    temp['norm_test_v4'] = temp['norm_test_v3'].apply(lambda x: clean_v3(x))
    print('v4 done')
    temp['joined_text'] = temp['norm_test_v4'].apply(lambda x: ' '.join(x))
    joined_text = temp['joined_text'].tolist()
    
    return joined_text

In [25]:
%%time

stopWords = stopwords.words(['russian', 'english'])
train_joined_text = preproc_data(train, stopWords)

v1 done
v2 done
v3 done
v4 done
CPU times: user 2min 6s, sys: 39.7 s, total: 2min 46s
Wall time: 6min 21s


In [26]:
tfidf = TfidfVectorizer()

In [27]:
X = tfidf.fit_transform(train_joined_text)
y = train.target

In [28]:
pickle.dump(tfidf, open('tf_idf.pickle', 'wb'))

## Train

In [17]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)

In [18]:
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': sp_lognorm(4)
}

In [19]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)

In [20]:
model = LogisticRegression(random_state=42)

In [21]:
random_searcher = RandomizedSearchCV(model, param_grid, n_iter=20, 
                                     random_state=42,
                                     scoring='roc_auc', 
                                     n_jobs=-1, cv=cv, 
                                     verbose=2)

random_searcher.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   13.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   26.9s finished


RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
          error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=20, n_jobs=-1,
          param_distributions={'penalty': ['l1', 'l2'], 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a36561a90>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [22]:
random_searcher.best_params_

{'C': 10.923122445211833, 'penalty': 'l2'}

In [23]:
random_searcher.best_score_

0.9923324224726257

In [24]:
best_model = random_searcher.best_estimator_

In [33]:
pickle.dump(best_model, open('best_model.pickle', 'wb'))

In [25]:
pred_valid = best_model.predict_proba(X_valid)

In [26]:
roc_auc_score(y_valid, pred_valid[:, 1])

0.9907754548963311

## Predict

In [13]:
best_model = pickle.load(open('best_model.pickle', 'rb'))

In [14]:
test.head()

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
31063,<p><strong>В крупную компанию по организации и...
31064,<p><strong>Обязанности:</strong></p> <ul> <li>...
31065,<p> </p> <p><strong>Обязанности:</strong></p> ...
31066,<p><strong>Обязанности:</strong></p> <ul> <li>...
31067,<p><strong>Вакансия СРОЧНАЯ!</strong></p> <p><...


In [15]:
test.shape

(31064, 1)

In [17]:
%%time
stopWords = stopwords.words(['russian', 'english'])
test_joined_text = preproc_data(test, stopWords)

v1 done
v2 done
v3 done
v4 done
CPU times: user 1min 48s, sys: 33.9 s, total: 2min 22s
Wall time: 5min 6s


In [29]:
X_test = tfidf.transform(test_joined_text)

In [35]:
pred = best_model.predict_proba(X_test)

In [40]:
test['target'] = pred[:,1]

In [44]:
test.drop('text', axis=1).to_csv('submission_1.csv')