In [1]:
import numpy as np
import pandas as pd
import re
import gensim
from gensim.models import word2vec
from nltk.tokenize import word_tokenize
from pymorphy2 import MorphAnalyzer
a_morph = MorphAnalyzer()
import logging

In [2]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

In [3]:
def clean(text):
    text = text.lower()
    text = re.sub('(https?://[^\s]+)', '', text)
    text = re.sub('@[^\s]+', '', text)
    text = re.sub('rt', '', text)
    text = re.sub('[^a-zA-Zа-яА-Я]+', ' ', text)
    text = re.sub(' +', ' ', text)
    text = text.strip()
    return text

In [4]:
def lemm(sentence):
    t_sent = word_tokenize(sentence)
    text = [a_morph.parse(i)[0].normal_form for i in t_sent]
    return text

In [5]:
def makeVec(sentence, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in sentence:
        try:
            vec += model[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [6]:
with open('2000english.txt', 'r', encoding='utf-8') as f:
    data = f.readlines()

In [8]:
data = [clean(t) for t in data]

In [9]:
with open('w2v_eng2000.txt', 'w', encoding='utf-8') as f:
    for sentense in data:
        t_sent = word_tokenize(sentense)
        text = [a_morph.parse(i)[0].normal_form for i in t_sent]
        print(*text, file=f)

In [10]:
f = 'w2v_eng2000.txt'
data = gensim.models.word2vec.LineSentence(f)

In [11]:
%%time
model = gensim.models.Word2Vec(data, size=300, window=10, min_count=2, iter=50)

Wall time: 2min 9s


In [12]:
model.init_sims(replace=True)
model_path = "eng2000.bin"

model.wv.save_word2vec_format(model_path, binary=True)

In [13]:
model = gensim.models.KeyedVectors.load_word2vec_format('eng2000.bin', binary=True)

In [32]:
vecs = np.concatenate([makeVec(s, 300) for s in data])
a = np.zeros(((len(vecs)//2), 1))
b = np.ones(((len(vecs)//2), 1))
ab = np.concatenate((a, b), axis=0) #0 +, 1 -
all_data = np.concatenate((vecs, ab), axis=1)
v = pd.DataFrame(all_data)
v = shuffle(v)

In [33]:
X = v[v.columns.drop(300)]
y = v[300]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [34]:
log = LogisticRegression()
log.fit(X_train, y_train)
y_train_pred = log.predict(X_train)
y_test_pred = log.predict(X_test)



In [35]:
precision_score(y_test, y_test_pred), recall_score(y_test, y_test_pred), f1_score(y_test, y_test_pred)

(0.6907216494845361, 0.6733668341708543, 0.6819338422391857)

In [36]:
accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)

0.6875