In [187]:
import pandas as pd
import numpy as np
import re
from sklearn.pipeline import Pipeline

In [152]:
# Load the two CSV files used by this notebook from the local ./data directory.
grammar_train = pd.read_csv('./data/grammar_train.csv')
grammar_check = pd.read_csv('./data/grammar_check.csv')

In [13]:
# Reference sentences (column 'correct_sentence') as a NumPy array.
y = grammar_train['correct_sentence'].values
# The uncorrected inputs (column 'sentence_with_a_mistake'); used below as a "do nothing" baseline.
predicts = grammar_train['sentence_with_a_mistake'].values

In [16]:
def get_F_score(ground_truth, predicts, originals):
    """
    Compute a penalty-weighted F1 score over sentence-level predictions.

    Each (ground truth, prediction, original) triple is counted as:
      * prediction equals the ground truth            -> TP += 1
      * prediction is wrong but equals the original   -> FN += 2
      * prediction is wrong and differs from original -> FP += 30

    param: ground_truth: list of str: correct sentence
    param: predicts: list of str:  predicts
    param: originals: list of str: input sentence
    return: float: F1 = 2 * precision * recall / (precision + recall);
            0.0 when there are no true positives (including empty input)
    """
    TP = 0
    FP = 0
    FN = 0
    for y, pred, original in zip(ground_truth, predicts, originals):
        if pred == y:
            TP += 1
        elif pred == original:
            FN += 2
        else:
            FP += 30
    # With no true positives either a denominator below is zero or
    # precision + recall is zero, so the score is defined as 0.0 here
    # instead of raising ZeroDivisionError.
    if TP == 0:
        return 0.0
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1 = 2*(precision*recall)/(precision+recall)
    return F1

In [17]:
# Baseline: the uncorrected sentences are passed as both the predictions and the originals.
get_F_score(y, predicts, predicts)

0.06976744186046512

## Classification

In [68]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score

In [50]:
# Load the stop-word list, one entry per line.
# The encoding is pinned so the text is decoded the same way on every platform
# (assumes the file is UTF-8 -- TODO confirm).
with open('./stopwords-ru.txt', encoding='utf-8') as F:
    stop_words = F.read()
# Drop blank entries, e.g. the empty string left behind by a trailing newline.
stop_words = [word for word in stop_words.split('\n') if word]

In [196]:
def transform_sequences(original_sequences, predictions):
    """
    Swap the first 'тся' / 'ться' ending in every sequence whose prediction is 0.

    A sequence is rewritten only when exactly one of the two substrings occurs
    in it: 'тся' becomes 'ться', or 'ться' becomes 'тся'. Sequences whose
    prediction is not 0, or that contain neither or both substrings, are
    returned unchanged.

    param: original_sequences: iterable of str: input sentences
    param: predictions: iterable paired with the sentences; entries equal to 0
           (e.g. 0 or False) select the sentences to rewrite
    return: list of str: sentences in the original order
    """
    result = []
    for sequence, pred in zip(original_sequences, predictions):
        if pred == 0:
            tcya_index = sequence.find('тся')
            ticya_index = sequence.find('ться')
            if tcya_index != -1 and ticya_index == -1:
                sequence = sequence[:tcya_index] + 'ться' + sequence[tcya_index + 3:]
            elif tcya_index == -1 and ticya_index != -1:
                # Slice around the 'ться' match itself; tcya_index is -1 in this branch.
                sequence = sequence[:ticya_index] + 'тся' + sequence[ticya_index + 4:]
        result.append(sequence)
    return result

In [198]:
def scoring(pred, threshold):
    """
    Score thresholded predictions against the notebook-level test split.

    `pred` is compared with `threshold` using `<`; the resulting flags are
    handed to transform_sequences together with the global `test_data`, and
    the outcome is scored with get_F_score against the sentences obtained by
    applying the global `test_y` labels to the same `test_data`.

    param: pred: per-sentence scores (presumably a NumPy array aligned with
           test_data -- verify against caller)
    param: threshold: cut-off applied to pred
    return: tuple (F score, threshold)
    """
    below_cutoff = pred < threshold
    corrected = transform_sequences(test_data, below_cutoff)
    expected = transform_sequences(test_data, test_y)
    return get_F_score(expected, corrected, test_data), threshold

In [51]:
# Load the example data set; the cells below read its `x` and `y` columns.
data = pd.read_csv('./grammar_example_data.csv')

In [57]:
# Hold out 10% of the rows for testing; the split is shuffled with a fixed seed
# and stratified on data.y.
train_data, test_data, train_y, test_y = train_test_split(data.x, data.y, test_size=0.1, random_state=42,
                                                         shuffle=True, stratify=data.y)

In [190]:
def generate_model(params):
    """
    Build an unfitted TF-IDF + logistic regression pipeline.

    param: params: dict with keys 'analyzer', 'range_min', 'range_max' and
           'min_df', forwarded to TfidfVectorizer ('range_min'/'range_max'
           form the ngram_range tuple)
    return: Pipeline with steps 'tfidf' and 'lr'

    The vectorizer also receives the notebook-level `stop_words` list.
    """
    vectorizer_options = {
        'analyzer': params['analyzer'],
        'ngram_range': (params['range_min'], params['range_max']),
        'min_df': params['min_df'],
        'stop_words': stop_words,
    }
    classifier_options = {
        'solver': 'liblinear',
        'class_weight': 'balanced',
        'random_state': 42,
        'n_jobs': 1,
    }
    steps = [
        ('tfidf', TfidfVectorizer(**vectorizer_options)),
        ('lr', LogisticRegression(**classifier_options)),
    ]
    return Pipeline(steps)

In [191]:
# Vectorizer settings consumed by generate_model.
params = {
    'analyzer': 'word',
    'range_min': 1,  # lower bound of ngram_range
    'range_max': 2,  # upper bound of ngram_range
    'min_df': 5,
}
# Build the (still unfitted) pipeline from these settings.
model = generate_model(params)

In [193]:
# Fit the whole pipeline on the training split.
model.fit(train_data, train_y)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,...alty='l2', random_state=42,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [194]:
# Keep only column 0 of the predicted probabilities for the test split
# (presumably the probability of label 0 -- confirm against the fitted class order).
predict = model.predict_proba(test_data)[:,0]

In [199]:
# Evaluate 20 evenly spaced thresholds in [0, 1]; each entry is a (score, threshold) tuple.
# NOTE(review): scoring() evaluates on test_data, so the threshold is chosen on the
# same split it is scored on.
result = []
for threshold in np.linspace(0, 1, 20):
    result.append(scoring(predict, threshold))
# Display the pair with the highest score.
max(result, key=lambda x: x[0])

(0.6324678131464386, 0.894736842105263)

In [200]:
# `sklearn.externals.joblib` was deprecated and later removed from scikit-learn;
# prefer the standalone `joblib` package and fall back to the vendored copy
# only on old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [201]:
# Persist the fitted pipeline to the file 'model_spalling' in the working directory.
joblib.dump(model, 'model_spalling')

['model_spalling']