In [1]:
import re
import numpy as np
import pandas as pd
import ufal.udpipe
from model import Model
import conllu
from collections import defaultdict
from nltk.parse import DependencyGraph
from tqdm.auto import tqdm

In [129]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

In [104]:
model = Model('english-ewt-ud-2.5-191206.udpipe')

In [105]:
def get_conllu(model, text):
    sentences = model.tokenize(text)
    for s in sentences:
        model.tag(s)
        model.parse(s)
    conllu_text = model.write(sentences, "conllu")
    return conllu_text

In [106]:
def get_dep_tree(text):
    trees = []
    for sent in text.split('\n\n'):
        tree = [line for line in sent.split('\n') if line and line[0] != '#']
        trees.append('\n'.join(tree))
    return trees

In [107]:
def prep(text):
    text = (re.sub('\n', '', text)).strip()
    return text

In [108]:
def split(text):
    return text.split(' . ')

In [109]:
sentence = 'it is never mentioned but the gravity on mars has been increased somehow'

In [122]:
con = get_conllu(model, sentence)
print(con)

# newdoc
# newpar
# sent_id = 1
# text = it is never mentioned but the gravity on mars has been increased somehow
1	it	it	PRON	PRP	Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs	4	nsubj:pass	_	_
2	is	be	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	4	aux:pass	_	_
3	never	never	ADV	RB	_	4	advmod	_	_
4	mentioned	mention	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	_	_
5	but	but	CCONJ	CC	_	12	cc	_	_
6	the	the	DET	DT	Definite=Def|PronType=Art	7	det	_	_
7	gravity	gravity	NOUN	NN	Number=Sing	12	nsubj:pass	_	_
8	on	on	ADP	IN	_	9	case	_	_
9	mars	mar	NOUN	NNS	Number=Plur	7	nmod	_	_
10	has	have	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	12	aux	_	_
11	been	be	AUX	VBN	Tense=Past|VerbForm=Part	12	aux:pass	_	_
12	increased	increase	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	4	conj	_	_
13	somehow	somehow	ADV	RB	_	12	advmod	_	SpaceAfter=No




In [124]:
prep = get_dep_tree(con)
prep

['1\tit\tit\tPRON\tPRP\tCase=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs\t4\tnsubj:pass\t_\t_\n2\tis\tbe\tAUX\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t4\taux:pass\t_\t_\n3\tnever\tnever\tADV\tRB\t_\t4\tadvmod\t_\t_\n4\tmentioned\tmention\tVERB\tVBN\tTense=Past|VerbForm=Part|Voice=Pass\t0\troot\t_\t_\n5\tbut\tbut\tCCONJ\tCC\t_\t12\tcc\t_\t_\n6\tthe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t7\tdet\t_\t_\n7\tgravity\tgravity\tNOUN\tNN\tNumber=Sing\t12\tnsubj:pass\t_\t_\n8\ton\ton\tADP\tIN\t_\t9\tcase\t_\t_\n9\tmars\tmar\tNOUN\tNNS\tNumber=Plur\t7\tnmod\t_\t_\n10\thas\thave\tAUX\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t12\taux\t_\t_\n11\tbeen\tbe\tAUX\tVBN\tTense=Past|VerbForm=Part\t12\taux:pass\t_\t_\n12\tincreased\tincrease\tVERB\tVBN\tTense=Past|VerbForm=Part|Voice=Pass\t4\tconj\t_\t_\n13\tsomehow\tsomehow\tADV\tRB\t_\t12\tadvmod\t_\tSpaceAfter=No',
 '']

In [125]:
d = DependencyGraph(prep[0])
d.root = d.nodes[0]
nodes = list(d.triples())
nodes

[((None, 'TOP'), 'root', ('mentioned', 'VERB')),
 (('mentioned', 'VERB'), 'nsubj:pass', ('it', 'PRON')),
 (('mentioned', 'VERB'), 'aux:pass', ('is', 'AUX')),
 (('mentioned', 'VERB'), 'advmod', ('never', 'ADV')),
 (('mentioned', 'VERB'), 'conj', ('increased', 'VERB')),
 (('increased', 'VERB'), 'cc', ('but', 'CCONJ')),
 (('increased', 'VERB'), 'nsubj:pass', ('gravity', 'NOUN')),
 (('gravity', 'NOUN'), 'det', ('the', 'DET')),
 (('gravity', 'NOUN'), 'nmod', ('mars', 'NOUN')),
 (('mars', 'NOUN'), 'case', ('on', 'ADP')),
 (('increased', 'VERB'), 'aux', ('has', 'AUX')),
 (('increased', 'VERB'), 'aux:pass', ('been', 'AUX')),
 (('increased', 'VERB'), 'advmod', ('somehow', 'ADV'))]

In [127]:
for i in range(1, len(nodes)):
    print(str(nodes[i][0][0]) + str(nodes[i][2][0]))

mentionedit
mentionedis
mentionednever
mentionedincreased
increasedbut
increasedgravity
gravitythe
gravitymars
marson
increasedhas
increasedbeen
increasedsomehow


In [11]:
with open('2000english.txt', 'r', encoding='utf-8') as f:
    file = f.readlines()

In [12]:
len(file)

2000

In [14]:
file = [prep(i) for i in file]
file = [split(i) for i in file]

In [16]:
%%time

pink = [] #список списков, для столбца - розовый
blue = [] #единый список для строк - голубой 
for review in tqdm(file):
    sentences = []
    for sentence in review:
        con = get_conllu(model, sentence)
        prep = get_dep_tree(con)
        d = DependencyGraph(prep[0])
        d.root = d.nodes[0]
        nodes = list(d.triples())
        for i in range(1, len(nodes)):
            bigram = (str(nodes[i][0][0]) + str(nodes[i][2][0]))
            blue.append(bigram)
            sentences.append(bigram)
    pink.append(sentences)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))

  "The graph doesn't contain a node " "that depends on the root element."



Wall time: 27min 26s


In [17]:
%%time
with open('2000test.txt', 'w', encoding='utf-8') as f:
    for review in pink:
        print(*review, file=f)

Wall time: 885 ms


In [18]:
with open('2000test.txt', 'r', encoding='utf-8') as f:
    text = f.readlines()

In [19]:
text = [prep(i) for i in text]

In [20]:
df = pd.DataFrame()
df['text'] = text
labels = np.array([0] * (len(text)//2) + [1] * (len(text)//2))
df['labels'] = labels
df = shuffle(df)

In [21]:
vectorizer = TfidfVectorizer(max_features=100000)
X = vectorizer.fit_transform(df["text"])
y = df['labels']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

##### LogisticRegression

In [23]:
log = LogisticRegression()
log.fit(X_train, y_train)
y_train_pred = log.predict(X_train)
y_test_pred = log.predict(X_test)



In [24]:
precision_score(y_test, y_test_pred), recall_score(y_test, y_test_pred), f1_score(y_test, y_test_pred)

(0.8349514563106796, 0.86, 0.8472906403940887)

In [25]:
accuracy_score(y_test, y_test_pred, normalize=True, sample_weight=None)

0.845

precision_score, recall_score, f1_score

In [61]:
(cross_val_score(log, X, y, cv=10, scoring='precision').mean(), cross_val_score(log, X, y, cv=10, scoring='recall').mean(), cross_val_score(log, X, y, cv=10, scoring='f1').mean())



(0.8282492715071557, 0.8200000000000001, 0.8237976099684964)

accuracy_score

In [101]:
cross_val_score(log, X, y, cv=10).mean()



0.8244999999999999

#### KNN

In [100]:
knn = KNeighborsClassifier()

precision_score, recall_score, f1_score

In [81]:
(cross_val_score(knn, X, y, cv=10, scoring='precision').mean(), cross_val_score(knn, X, y, cv=10, scoring='recall').mean(), cross_val_score(knn, X, y, cv=10, scoring='f1').mean())

(0.6888589334577266, 0.719, 0.7032323525955595)

accuracy_score

In [97]:
cross_val_score(knn, X, y, cv=10).mean()

0.6965

#### MultinomialNB

In [99]:
clf = MultinomialNB()

precision_score, recall_score, f1_score

In [96]:
(cross_val_score(clf, X, y, cv=10, scoring='precision').mean(), cross_val_score(clf, X, y, cv=10, scoring='recall').mean(), cross_val_score(clf, X, y, cv=10, scoring='f1').mean())

(0.8401053230486397, 0.8400000000000001, 0.8397005047073824)

accuracy_score

In [98]:
cross_val_score(clf, X, y, cv=10).mean()

0.8394999999999999