In [1]:
import warnings
warnings.filterwarnings("ignore")

%run parse_xml.py

In [2]:
xml_df = parse_XML("paraphrases_mod.xml", ["id", "id_1", "id_2", "text_1", "text_2", "jaccard", "class"])
xml_df.head(3)

Unnamed: 0,id,id_1,id_2,text_1,text_2,jaccard,class
0,1,201,8159,Полицейским разрешат стрелять на поражение по ...,Полиции могут разрешить стрелять по хулиганам ...,0.65,0
1,2,202,8158,Право полицейских на проникновение в жилище ре...,Правила внесудебного проникновения полицейских...,0.5,0
2,3,273,8167,Президент Египта ввел чрезвычайное положение в...,Власти Египта угрожают ввести в стране чрезвыч...,0.611429,0


In [3]:
xml_df_fltr = xml_df.loc[xml_df['class'].isin(['-1', '1'])]
xml_df_fltr.shape

(4270, 7)

In [4]:
xml_df_fltr.groupby(['class']).size()

class
-1    2582
1     1688
dtype: int64

In [5]:
import zipfile
import gensim 
import pymorphy2
from nltk.tokenize import sent_tokenize, word_tokenize
import string

morph = pymorphy2.MorphAnalyzer()
list_punctuation = set(string.punctuation)

In [6]:
sentence_one = [] 
sentence_two = list(list_punctuation) + ['"', '’', "''", '``', '`', '—', '«', '»']

# iterate through each sentence in dataset 
for sent_1, sent_2 in zip(xml_df['text_1'], xml_df['text_2']): 
    temp_sent_1 = []
    temp_sent_2 = []

     #tokenize the sentence into words 
    for word in word_tokenize(sent_1):
        if word not in list_punctuation:
            temp_sent_1.append(morph.parse(word.lower())[0].normal_form)  
    for word in word_tokenize(sent_2): 
        if word not in list_punctuation:
            temp_sent_2.append(morph.parse(word.lower())[0].normal_form) 

    sentence_one.append(temp_sent_1) 
    sentence_two.append(temp_sent_2)

In [7]:
data_for_w2v = sentence_one + sentence_two

In [8]:
# Create CBOW model 
model_cbow = gensim.models.Word2Vec(data_for_w2v, min_count = 1, 
size = 300, window = 6, negative=20)
model_cbow.train(data_for_w2v, total_examples=len(data_for_w2v), epochs=30)

(3028437, 3600240)

In [9]:
model_cbow.wv.most_similar(positive=["рф"])

[('россия', 0.5653359889984131),
 ('норвегия', 0.5295307040214539),
 ('малайзия', 0.5271217823028564),
 ('125', 0.4573618471622467),
 ('монетарный', 0.43756282329559326),
 ('донбасс', 0.4372336268424988),
 ('ес', 0.43691155314445496),
 ('тариф', 0.4327203929424286),
 ('срочно', 0.4278368055820465),
 ('инопланетянин', 0.4275597333908081)]

In [10]:
# Create Skip Gram model 
model_sg = gensim.models.Word2Vec(data_for_w2v, min_count = 1, size = 300, 
window = 6, sg = 1) 
model_sg.train(data_for_w2v, total_examples=len(data_for_w2v), epochs=30)

(3028805, 3600240)

In [11]:
model_sg.wv.most_similar(positive=["россия"])

[('сборная', 0.36934131383895874),
 ('липницкий', 0.35827547311782837),
 ('свежеть', 0.35358569025993347),
 ('миронов', 0.3527170419692993),
 ('недельный', 0.35179877281188965),
 ('острава', 0.34980159997940063),
 ('один-десять', 0.34942200779914856),
 ('полк', 0.3456084728240967),
 ('чм', 0.33804041147232056),
 ('фигуристка', 0.333640456199646)]

In [12]:
fltr_sentence_one = [] 
fltr_sentence_two = []
list(list_punctuation) + ['"', '’', "''", '``', '`', '—', '«', '»']

# iterate through each sentence in dataset 
for sent_1, sent_2 in zip(xml_df_fltr['text_1'], xml_df_fltr['text_2']): 
    temp_sent_1 = []
    temp_sent_2 = []

     #tokenize the sentence into words 
    for word in word_tokenize(sent_1):
        if word not in list_punctuation:
            temp_sent_1.append(morph.parse(word.lower())[0].normal_form)  
    for word in word_tokenize(sent_2): 
        if word not in list_punctuation:
            temp_sent_2.append(morph.parse(word.lower())[0].normal_form) 

    fltr_sentence_one.append(temp_sent_1) 
    fltr_sentence_two.append(temp_sent_2)

In [13]:
%run tf-idf.py

In [14]:
tf_idf_sentence_one = compute_tfidf(fltr_sentence_one)
tf_idf_sentence_two = compute_tfidf(fltr_sentence_two)

In [15]:
import numpy as np
from math import sqrt

In [16]:
vec_sentence_one = np.zeros((len(fltr_sentence_one), 300))
vec_sentence_two = np.zeros((len(fltr_sentence_one), 300))

for sent_1, sent_2, tfidf_one, tfidf_two, row_id in zip(fltr_sentence_one, fltr_sentence_two, tf_idf_sentence_one, tf_idf_sentence_two, range(len(fltr_sentence_one))): 
    
    vec_sentence_one[row_id] = sum(tfidf_one[word] * model_cbow[word] / sqrt(sum(i**2 for i in model_cbow[word])) for word in sent_1)
    vec_sentence_two[row_id] = sum(tfidf_two[word] * model_cbow[word] / sqrt(sum(i**2 for i in model_cbow[word])) for word in sent_2)

In [17]:
y = np.zeros(xml_df_fltr.shape[0])

for value, ind in zip(xml_df_fltr['class'], range(xml_df_fltr.shape[0])):
    if value != '-1':
        y[ind] = value

In [18]:
X_cbow = np.hstack((vec_sentence_one, vec_sentence_two))
X_cbow.shape

(4270, 600)

In [19]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [20]:
from imblearn.over_sampling import SMOTE

os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_cbow, y, test_size=0.3, random_state=0)
os_data_X, os_data_y = os.fit_sample(X_train, y_train)

print("Length of oversampled data is ", len(os_data_X))
print("Proportion of no subscription data in oversampled data is ", np.sum(os_data_y) / len(os_data_X))
print("Proportion of subscription data in oversampled data is ", (1 - np.sum(os_data_y)) / len(os_data_X))



Length of oversampled data is  3690
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  -0.4997289972899729


In [21]:
logreg = LogisticRegression()
logreg.fit(os_data_X, os_data_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.66


In [23]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.72      0.67      0.70       737
         1.0       0.59      0.65      0.62       544

    accuracy                           0.66      1281
   macro avg       0.66      0.66      0.66      1281
weighted avg       0.67      0.66      0.66      1281



In [24]:
vec_sentence_one_sg = np.zeros((len(fltr_sentence_one), 300))
vec_sentence_two_sg = np.zeros((len(fltr_sentence_one), 300))

for sent_1, sent_2, tfidf_one, tfidf_two, row_id in zip(fltr_sentence_one, fltr_sentence_two, tf_idf_sentence_one, tf_idf_sentence_two, range(len(fltr_sentence_one))): 
    
    vec_sentence_one_sg[row_id] = sum(tfidf_one[word] * model_sg[word] / sqrt(sum(i**2 for i in model_sg[word])) for word in sent_1)
    vec_sentence_two_sg[row_id] = sum(tfidf_two[word] * model_sg[word] / sqrt(sum(i**2 for i in model_sg[word])) for word in sent_2)

In [25]:
X_sg = np.hstack((vec_sentence_one_sg, vec_sentence_two_sg))
X_sg.shape

(4270, 600)

In [26]:
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_sg, y, test_size=0.3, random_state=0)
os_data_X, os_data_y = os.fit_sample(X_train, y_train)

print("Length of oversampled data is ",len(os_data_X))
print("Proportion of no subscription data in oversampled data is ", np.sum(os_data_y) / len(os_data_X))
print("Proportion of subscription data in oversampled data is ", (1 - np.sum(os_data_y)) / len(os_data_X))

Length of oversampled data is  3690
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  -0.4997289972899729


In [27]:
logreg = LogisticRegression()
logreg.fit(os_data_X, os_data_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.67


In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.72      0.69      0.70       737
         1.0       0.60      0.63      0.62       544

    accuracy                           0.67      1281
   macro avg       0.66      0.66      0.66      1281
weighted avg       0.67      0.67      0.67      1281



In [30]:
with zipfile.ZipFile('ru_news_model.zip', 'r') as archive:
    stream = archive.open('model.bin')
    model_w2v_rv = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

# If you don't have this zip file on your computer you can use the following code:
    
# import zipfile
# model_url = 'http://vectors.nlpl.eu/repository/11/184.zip'
# m = wget.download(model_url)
# model_file = model_url.split('/')[-1]
# with zipfile.ZipFile(model_file, 'r') as archive:
#     stream = archive.open('model.bin')
#     model_w2v_rv = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)

In [31]:
import text_processing_rv
%run text_processing_rv.py
import os

# URL of the UDPipe model
udpipe_model_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
udpipe_filename = udpipe_model_url.split('/')[-1]

if not os.path.isfile(udpipe_filename):
    print('UDPipe model not found. Downloading...', file=sys.stderr)
    wget.download(udpipe_model_url)

print('\nLoading the model...', file=sys.stderr)
model = Model.load(udpipe_filename)
process_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')

UDPipe model not found. Downloading...


100% [........................................................................] 40616122 / 40616122


Loading the model...


In [32]:
process(process_pipeline, text=unify_sym('Владимир Путин принял указ о новостях.'.strip()))

['владимир::путин_PROPN ',
 'принимать_VERB',
 'указ_NOUN',
 'о_ADP',
 'новость_NOUN']

In [33]:
mapping_rv_names = dict()
with open("mapping_rv.txt", encoding="utf-8") as file:
    for line in file:
        values = line.split(' ')
#         print(values)
        mapping_rv_names[values[0]] = values[1].strip('\n')
        
# print(mapping_rv_names)     
from stop_words import get_stop_words

stop_words = get_stop_words('russian')
stop_words.append(['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 
                   'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 
                   'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 
                   'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 
                   'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 
                   'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 
                   'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 
                   'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 
                   'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впрочем', 'хорошо', 'свою', 'этой', 'перед', 
                   'иногда', 'лучше', 'чуть', 'том', 'нельзя', 'такой', 'им', 'более', 'всегда', 'конечно', 'всю', 'между'])

In [34]:
udp_sentence_one = []
udp_sentence_two = []
udp_set_words = set([])

for sent in xml_df_fltr['text_1']:   
    sent_coded = [word.rstrip() for word in process(process_pipeline, text=unify_sym(sent.strip()))] # trim strange spases in PROPN
    sent_coded_upd = []
    for word in sent_coded:
        if word in mapping_rv_names.keys():
            sent_coded_upd.append(mapping_rv_names[word])
        elif word[-4:] not in ['_ADP', '_DET', '_SYM'] and word[-5:] not in ['_PART', '_PRON', 
                            'CCONJ', 'SCONJ'] and word[:word.index('_')] not in stop_words:
            sent_coded_upd.append(word.lstrip('-')) # были слова, начинавшиеся с '-'
    udp_sentence_one.append(sent_coded_upd) 
    for word in sent_coded_upd:
        udp_set_words.add(word)
        
for sent in xml_df_fltr['text_2']:   
    sent_coded = [word.rstrip() for word in process(process_pipeline, text=unify_sym(sent.strip()))] # trim strange spases in PROPN
    sent_coded_upd = []
    for word in sent_coded:
        if word in mapping_rv_names.keys():
            sent_coded_upd.append(mapping_rv_names[word])
        elif word[-4:] not in ['_ADP', '_DET', '_SYM'] and word[-5:] not in ['_PART', '_PRON', 
                            'CCONJ', 'SCONJ'] and word[:word.index('_')] not in stop_words:
            sent_coded_upd.append(word.lstrip('-'))
    udp_sentence_two.append(sent_coded_upd) 
    for word in sent_coded_upd:
        udp_set_words.add(word)

In [35]:
intersect_mydata_rusvect = set.intersection(set(model_w2v_rv.vocab.keys()), udp_set_words)
print("Количество пересечений ", len(intersect_mydata_rusvect))

difference = udp_set_words - set(model_w2v_rv.vocab.keys())
print("Количество слов, котрых нет в модели ", len(difference))

Количество пересечений  8212
Количество слов, котрых нет в модели  437


In [36]:
global_count = 0
ids_to_delete = []

udp_sentence_one_for_mod = []
udp_sentence_two_for_mod = []

ids_to_take = [i for i in range(len(udp_sentence_one))]
i = 0

for sent_1, sent_2 in zip(udp_sentence_one, udp_sentence_two):
    count = 0
    for word in sent_1:
        if word in difference:
            count += 1
    for word in sent_2:
        if word in difference:
            count += 1
    if count > 0:
#         print(sent_1, sent_2)
        global_count += 1
#         print(udp_sentence_one.index(sent_1))
        ids_to_take.remove(i)
    else:
        udp_sentence_one_for_mod.append(sent_1)
        udp_sentence_two_for_mod.append(sent_2)
    i += 1

print("Количество пар предложений, выпадающих из-за отсутствия слова в модели ", global_count)
print("Общее количество пар ", len(udp_sentence_one))

Количество пар предложений, выпадающих из-за отсутствия слова в модели  833
Общее количество пар  4270


In [37]:
len(udp_sentence_one_for_mod)

3437

In [38]:
tf_idf_udp_sentence_one = compute_tfidf(udp_sentence_one_for_mod)
tf_idf_udp_sentence_two = compute_tfidf(udp_sentence_two_for_mod)

In [39]:
udp_vec_sentence_one = np.zeros((len(udp_sentence_one_for_mod), 300))
udp_vec_sentence_two = np.zeros((len(udp_sentence_one_for_mod), 300))

for sent_1, sent_2, tfidf_one, tfidf_two, row_id in zip(udp_sentence_one_for_mod, udp_sentence_two_for_mod, tf_idf_udp_sentence_one, tf_idf_udp_sentence_two, range(len(udp_sentence_two_for_mod))): 
    
    udp_vec_sentence_one[row_id] = sum(tfidf_one[word] * model_w2v_rv[word] / sqrt(sum(i**2 for i in model_w2v_rv[word])) for word in sent_1)
    udp_vec_sentence_two[row_id] = sum(tfidf_two[word] * model_w2v_rv[word] / sqrt(sum(i**2 for i in model_w2v_rv[word])) for word in sent_2)

In [40]:
udp_vec_sentence_one = np.zeros((len(udp_sentence_one_for_mod), 300))
udp_vec_sentence_two = np.zeros((len(udp_sentence_one_for_mod), 300))

for sent_1, sent_2, tfidf_one, tfidf_two, row_id in zip(udp_sentence_one_for_mod, udp_sentence_two_for_mod, tf_idf_udp_sentence_one, tf_idf_udp_sentence_two, range(len(udp_sentence_two_for_mod))): 
    
    udp_vec_sentence_one[row_id] = sum(tfidf_one[word] * model_w2v_rv[word] / sqrt(sum(i**2 for i in model_w2v_rv[word])) for word in sent_1)
    udp_vec_sentence_two[row_id] = sum(tfidf_two[word] * model_w2v_rv[word] / sqrt(sum(i**2 for i in model_w2v_rv[word])) for word in sent_2)

In [41]:
X_rv = np.hstack((udp_vec_sentence_one, udp_vec_sentence_two))
X_rv.shape

(3437, 600)

In [42]:
y_rv = np.array([int(i) for i in xml_df_fltr.loc[xml_df_fltr.index[ids_to_take],'class']])

In [43]:
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_rv, y_rv, test_size=0.3, random_state=0)
os_data_X, os_data_y = os.fit_sample(X_train, y_train)
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Proportion of no subscription data in oversampled data is ", np.sum(os_data_y) / len(os_data_X))
print("Proportion of subscription data in oversampled data is ", (1 - np.sum(os_data_y)) / len(os_data_X))

length of oversampled data is  2832
Proportion of no subscription data in oversampled data is  0.0
Proportion of subscription data in oversampled data is  0.00035310734463276836


In [44]:
logreg = LogisticRegression()
logreg.fit(os_data_X, os_data_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [45]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.65


In [46]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.72      0.65      0.68       595
           1       0.58      0.65      0.61       437

    accuracy                           0.65      1032
   macro avg       0.65      0.65      0.65      1032
weighted avg       0.66      0.65      0.65      1032

