## HSE natural language processing
### HW 02

In [1]:
import os
import re
import string
import numpy as np
import nltk

In [24]:
from tqdm import tqdm_notebook as tqdm
from collections import defaultdict
from pymystem3 import Mystem
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from sklearn.svm import SVC
from nltk.corpus import stopwords
from sklearn import svm

In [3]:
RESOURCES_PATH = './resources'
TRAIN_TEXTS_PATH = os.path.join(RESOURCES_PATH, 'texts_train.txt')
TRAIN_SCORES_PATH = os.path.join(RESOURCES_PATH, 'scores_train.txt')
TEST_INPUT_FILENAME = os.path.join(RESOURCES_PATH, 'test.in')
TEST_OUTPUT_FILENAME = os.path.join(RESOURCES_PATH, 'test.out')

In [4]:
def load_scores():
    """Read the training scores file and return its values as a NumPy array.

    Every line of ``TRAIN_SCORES_PATH`` is converted with ``int``; the
    resulting list is wrapped in ``np.array``.
    """
    with open(TRAIN_SCORES_PATH, 'r') as scores_file:
        parsed_scores = [int(raw_line) for raw_line in scores_file]
    return np.array(parsed_scores)

In [5]:
def load_texts():
    """Read the training texts file and return its lines as a NumPy array.

    Each element is one line of ``TRAIN_TEXTS_PATH``, with its trailing
    newline kept (the lines come straight from ``readlines``).
    """
    # Without an explicit encoding `open` uses the platform default, which is
    # not UTF-8 everywhere and garbles non-ASCII text.
    # NOTE(review): assumes the file is UTF-8 encoded - confirm against the data.
    with open(TRAIN_TEXTS_PATH, 'r', encoding='utf-8') as texts_file:
        return np.array(texts_file.readlines())

In [6]:
def run_task(evaluator):
    """Apply ``evaluator`` to every test line and write one result per line.

    Parameters
    ----------
    evaluator : callable
        Called with each raw line of ``TEST_INPUT_FILENAME`` (trailing newline
        included) and expected to return the string to write for that line.

    The results are written to ``TEST_OUTPUT_FILENAME``, each followed by a
    newline. The output file is overwritten.
    """
    # NOTE(review): assumes the test input is UTF-8 encoded - confirm.
    with open(TEST_INPUT_FILENAME, 'r', encoding='utf-8') as input_file, \
            open(TEST_OUTPUT_FILENAME, 'w', encoding='utf-8') as output_file:
        for line in tqdm(input_file.readlines()):
            output_file.write(evaluator(line))
            # Text-mode files translate '\n' to the platform separator on
            # their own; writing os.linesep here would yield '\r\r\n' on Windows.
            output_file.write('\n')

#### Base approach: average mark

In [7]:
# Baseline prediction: the mean of all training scores, rounded to an integer.
avg_mark = int(round(np.mean(load_scores())))
print('Average mark: {}'.format(avg_mark))

Average mark: 8


In [8]:
run_task(lambda sentence: str(avg_mark))

#### Simple approach: average unigram score

In [9]:
texts = load_texts()

In [10]:
scores = load_scores()

In [33]:
def parse_line_to_tokens(line):
    """Lower-case ``line`` and split it into word tokens.

    The platform line separator is removed first, then the text is split on
    runs of non-word characters. Empty fragments are dropped, so an empty or
    punctuation-only line yields an empty list.
    """
    normalized = line.lower().replace(os.linesep, '')
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    return [token for token in re.split(r'\W+', normalized) if token]

In [36]:
# marks[token] -> scores of the texts containing the token (one entry per text).
marks = defaultdict(list)
# last[token] -> index of the most recent text in which the token was counted;
# used to avoid adding the same text's score twice for a repeated token.
last = defaultdict(lambda: -1)

for ind, (text, score) in tqdm(enumerate(zip(texts, scores))):
    for token in parse_line_to_tokens(text):
        if last[token] == ind:
            continue
        last[token] = ind
        marks[token].append(score)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [39]:
# Mean score per token over the texts that contain it.
average_mark = {
    token: 1. * sum(token_scores) / len(token_scores)
    for token, token_scores in marks.items()
}

In [48]:
def get_average_mark(word):
    """Return the stored mean score for ``word``.

    Words missing from ``average_mark`` get the global ``avg_mark`` instead.
    """
    try:
        return average_mark[word]
    except KeyError:
        return avg_mark

In [51]:
def get_text_average_mark(line):
    """Predict a mark for ``line`` as the rounded mean of its token marks.

    The line is tokenized with ``parse_line_to_tokens`` and each token is
    scored with ``get_average_mark``. The result is returned as a string.
    """
    token_marks = [get_average_mark(token) for token in parse_line_to_tokens(line)]
    if not token_marks:
        # The mean of an empty array is NaN and int(round(nan)) raises
        # ValueError, so a line without tokens gets the overall average mark.
        return str(avg_mark)
    return str(int(round(np.array(token_marks).mean())))

In [53]:
run_task(get_text_average_mark)

#### Third approach: tf–idf + linear model

In [26]:
texts = load_texts()

In [27]:
scores = load_scores()

In [28]:
def build_tf_idf_vectorizer():
    """Fit a tf-idf vectorizer on the global ``texts``.

    The vectorizer lower-cases its input and uses n-grams of length 1 to 3.

    Returns
    -------
    tuple
        ``(fitted vectorizer, result of fit_transform on texts)``.
    """
    tfidf = TfidfVectorizer(ngram_range=(1, 3), lowercase=True)
    features = tfidf.fit_transform(texts)
    return tfidf, features

In [30]:
vectorizer, x_train = build_tf_idf_vectorizer()

In [31]:
y_train = np.array(scores)

In [32]:
def process_sentence(sentence, classifier):
    """Predict a label for one sentence and return it as a string.

    The sentence is transformed with the module-level ``vectorizer`` (looked
    up when the function is called) and passed to ``classifier.predict``.
    """
    features = vectorizer.transform([sentence])
    predictions = classifier.predict(features)
    return str(predictions[0])

#### SGD classifier

In [36]:
# NOTE(review): no random_state is passed (the repr below shows
# random_state=None, shuffle=True), so the fit - and presumably the reported
# score - may differ between runs.
sgd_classifier = linear_model.SGDClassifier()
sgd_classifier.fit(x_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [35]:
run_task(lambda sentence: process_sentence(sentence, sgd_classifier))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




Score: 67.82

#### SVM classifier

In [39]:
svc = SVC(gamma='auto')
svc.fit(x_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [40]:
run_task(lambda sentence: process_sentence(sentence, svc))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




Score: 55.5

#### Ridge classifier

In [43]:
ridge = linear_model.RidgeClassifier().fit(x_train, y_train)

In [44]:
run_task(lambda sentence: process_sentence(sentence, ridge))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




Score: 70.02

#### Logistic regression

In [47]:
# Accessed through `linear_model`, the only sklearn linear-model import in
# this notebook; the seed is fixed so the fit is repeatable.
lr = linear_model.LogisticRegression(random_state=29239).fit(x_train, y_train)



In [48]:
run_task(lambda sentence: process_sentence(sentence, lr))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




Score: 67.51

Let's modify the vectorizer.

#### Lemmatization via Mystem

In [7]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/itukh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
m = Mystem()

In [14]:
# Strings treated as punctuation when cleaning lemmas: every ASCII
# punctuation character, the platform line separator and a literal '...'.
punct_set = set(string.punctuation) | {os.linesep, '...'}

In [33]:
def lemmatize_sentence(sentence):
    """Lemmatize ``sentence`` with the global Mystem instance ``m``.

    Each item returned by ``m.lemmatize`` is handled as follows:

    * items that are themselves in ``punct_set`` are dropped;
    * a single space is kept as a space;
    * otherwise inner spaces are removed, and the item becomes a space if
      what remains is in ``punct_set`` (e.g. ``', '``), else it is kept.

    The pieces are concatenated and returned. If lemmatization fails with an
    ordinary exception, the original ``sentence`` is returned unchanged.
    """
    try:
        pieces = []
        for lemma in m.lemmatize(sentence):
            if lemma in punct_set:
                continue
            if lemma == ' ':
                pieces.append(' ')
                continue
            stripped = lemma.replace(' ', '')
            pieces.append(' ' if stripped in punct_set else stripped)
        return ''.join(pieces)
    except Exception:
        # Best-effort fallback. Deliberately not a bare `except`, so that
        # KeyboardInterrupt/SystemExit still stop a long-running loop.
        return sentence

In [34]:
def build_tf_idf_vectorizer_with_lemmatization():
    """Fit a tf-idf vectorizer on lemmatized training texts.

    The texts are loaded with ``load_texts`` and each is passed through
    ``lemmatize_sentence``. The vectorizer lower-cases its input, uses
    n-grams of length 1 to 3 and NLTK's Russian stop-word list.

    Returns
    -------
    tuple
        ``(fitted vectorizer, result of fit_transform, lemmatized texts)``.
    """
    raw_texts = load_texts()
    tfidf = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 3),
        stop_words=stopwords.words("russian"),
    )
    lemmatize_texts = [lemmatize_sentence(raw_text) for raw_text in tqdm(raw_texts)]
    features = tfidf.fit_transform(lemmatize_texts)
    return tfidf, features, lemmatize_texts

In [35]:
vectorizer, x_train, lemmatize_texts = build_tf_idf_vectorizer_with_lemmatization()

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




#### SVR

In [39]:
def process_sentence_with_leammatization(sentence, classifier):
    """Predict an integer mark for one sentence and return it as a string.

    The sentence is lemmatized with ``lemmatize_sentence``, transformed with
    the module-level ``vectorizer`` (looked up at call time) and passed to
    ``classifier.predict``; the first prediction is rounded to an integer.
    """
    lemmatized = lemmatize_sentence(sentence)
    features = vectorizer.transform([lemmatized])
    prediction = classifier.predict(features)[0]
    return str(int(round(prediction)))

In [38]:
svr = svm.SVR().fit(x_train, y_train)



In [40]:
run_task(lambda sentence: process_sentence_with_leammatization(sentence, svr))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




Score: 68.84

#### Ridge

In [41]:
ridge_c = linear_model.RidgeClassifier().fit(x_train, y_train)

In [42]:
run_task(lambda sentence: process_sentence_with_leammatization(sentence, ridge_c))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




Score: 67.24

Let's try a ridge regressor instead of a classifier.

####  Ridge regressor

In [44]:
# Rebinds `ridge`: earlier in the notebook this name holds a RidgeClassifier;
# from here on it is a Ridge regressor fitted on the current x_train/y_train.
ridge = linear_model.Ridge().fit(x_train, y_train)

In [45]:
run_task(lambda sentence: process_sentence_with_leammatization(sentence, ridge))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




Score: 75.59