# Курсовая работа


In [1]:
import numpy as np

import nltk
from pymystem3 import Mystem

import ngrammer

import re

import pandas as pd

import pickle as pkl
import os
import time
from tqdm import tqdm

In [2]:
# Initial path settings.
# NOTE: SYNTAXNET_OUTPUT, LEMMATIZED_COLLECTION and STOPWORDS_FILE are assigned
# again in later cells (with "./Data/..." paths), so the values given here are
# superseded as soon as those cells run.
# NOTE(review): SYNTAXNET_INPUT and NGRAMMS_FILE are not referenced by any
# other cell shown in this notebook — confirm whether they are still needed.
SYNTAXNET_INPUT = "sentences.txt"
SYNTAXNET_OUTPUT = "syntaxnet_out.txt"

LEMMATIZED_COLLECTION = "../lemmatized_collection/"
STOPWORDS_FILE = "../stopwords.txt"
NGRAMMS_FILE = "../ngramms.txt"

<a id='dataset_collect'></a>
# Сборка датасета

In [3]:
# Input XML file handed to extract_collocations.py and extract_documents.py.
WIKI_XML = "./Data/Input/wiki.xml"

<a id='collocations_collect'></a>
## Коллокации

In [4]:
# Collocations file: passed as -o to extract_collocations.py and later read
# back into `collocs`.
# NOTE(review): "gt" presumably stands for ground truth — confirm.
COLLOCS_FILE = "./Data/Input/gt_collocs.txt"

In [255]:
# Run extract_collocations.py on WIKI_XML, passing COLLOCS_FILE as the -o path.
# NOTE(review): --min 2 --max 4 presumably bound the collocation length in
# words — confirm against the script's argument help.
! python extract_collocations.py {WIKI_XML} -o {COLLOCS_FILE} --min 2 --max 4

Extracting collocations from hyperlinks...
Done!

Stats:
hyperlinks:	9200
filtered hl:	11375
collocations:	3651


In [65]:
# Load the collocations from COLLOCS_FILE, one entry per line.
# Empty lines are dropped: a plain split('\n') yields a spurious '' entry when
# the file ends with a newline, which would inflate len(collocs) and could
# match an equally spurious '' entry in another list.
with open(COLLOCS_FILE, "r") as f:
    collocs = [line for line in f.read().split('\n') if line]

In [66]:
# Preview the first five loaded collocations.
collocs[:5]

['computer scientist',
 'mikhail moiseevich bongard',
 'pattern recognition',
 'gdel escher bach',
 'douglas hofstadter']

<a id='documents_collect'></a>
## Документы

**Документы получаем с помощью [wikiextractor](https://github.com/attardi/wikiextractor)**

In [5]:
# COLLECTION_FOLDER: passed as -o to extract_documents.py and as the first
# argument to run_syntaxnet.py.
# WIKIEXTRACTOR: path given to extract_documents.py via --wikiextractor.
COLLECTION_FOLDER = "./Data/Input/collection/"
WIKIEXTRACTOR = "./wikiextractor/WikiExtractor.py"

In [78]:
# Run extract_documents.py on WIKI_XML, passing COLLECTION_FOLDER as the -o
# path and the WikiExtractor script location via --wikiextractor.
! python extract_documents.py {WIKI_XML} -o {COLLECTION_FOLDER} --wikiextractor {WIKIEXTRACTOR}

Extracting texts...
Done!

Stats:
documents:	198


<a id='syntaxnet'></a>
# SyntaxNet

<a id='syntaxnet_run'></a>
## Запуск SyntaxNet
Docker-образ взят [здесь](https://hub.docker.com/r/inemo/syntaxnet_eng/).

In [6]:
# Paths for the SyntaxNet stage. SYNTAXNET_OUTPUT and LEMMATIZED_COLLECTION
# rebind names that were first assigned in the earlier configuration cell.
# SYNTAXNET_OUTPUT is passed to run_syntaxnet.py as both --syntaxnet_ready and
# --syntaxnet_out, and to process_syntaxnet.py as its first argument.
# SYNTAXNET_RESULT only appears in a commented-out '-o' option of the
# run_syntaxnet.py command.
# LEMMATIZED_COLLECTION is passed to run_syntaxnet.py (--lemmatize),
# run_topmine.py (first argument) and process_syntaxnet.py
# (--lemmatized_collection).
SYNTAXNET_OUTPUT = "./Data/Component_Results/syntaxnet/syntaxnet_out.txt"
SYNTAXNET_RESULT = "./Data/Component_Results/syntaxnet/results.csv"
LEMMATIZED_COLLECTION = "./Data/Component_Results/syntaxnet/lemmatized_collection"

In [307]:
cmd = ' '.join(['python', 'run_syntaxnet.py', COLLECTION_FOLDER,
#                 '-o', SYNTAXNET_RESULT,
                '--syntaxnet_ready', SYNTAXNET_OUTPUT,
                '--lemmatize', LEMMATIZED_COLLECTION,
                '--syntaxnet_out', SYNTAXNET_OUTPUT])
!{cmd}

Preprocessing documents...
We already have syntaxnet output
Postprocessing syntaxnet results...
Saving syntaxnet output
Saving lemmatized collection...
Done!

Stats:
sentences:	7704


<a id='topmine'></a>
# TopMine

In [9]:
# Paths for the TopMine stage.
# COLLOCS_TOPMINE: passed to run_topmine.py via --collocations_output.
# TOPMINE_OUTPUT: passed to run_topmine.py via -o and to process_syntaxnet.py
# via --topmine_data.
# STOPWORDS_FILE: passed to run_topmine.py via --stopwords; rebinds the name
# first assigned in the earlier configuration cell.
COLLOCS_TOPMINE = "./Data/Output/topmine_collocs.txt"
TOPMINE_OUTPUT = "./Data/Component_Results/topmine/topmine.csv"
STOPWORDS_FILE = "./Data/Input/stopwords.txt"

In [501]:
cmd = ' '.join(['python', 'run_topmine.py', LEMMATIZED_COLLECTION,
                '-o', TOPMINE_OUTPUT,
                '--collocations_output', COLLOCS_TOPMINE,
                '--stopwords', STOPWORDS_FILE,
                '--threshold', '1.2']) # 1.8
!{cmd}

Running TopMine...
Collecting features...
Done!

Stats:
stopwords:			175
ngramms with features:		29722
extracted unique ngramms:	5179


<a id="syntaxnet_postprocess"></a>
## Обработка результатов SyntaxNet'а

In [7]:
# Path passed to process_syntaxnet.py via -o.
SYNTAXNET_DISTANCES = "./Data/Component_Results/syntaxnet/syntaxnet.tsv"

In [11]:
cmd = ' '.join(['python', 'process_syntaxnet.py', SYNTAXNET_OUTPUT,
                '-o', SYNTAXNET_DISTANCES,
                '--lemmatized_collection', LEMMATIZED_COLLECTION,
                '--topmine_data', TOPMINE_OUTPUT])
!{cmd}

Processing Syntaxnet output...
100%|██████████████████████████████████████▉| 7699/7704 [40:02<00:01,  3.20it/s]


---

Trash

In [492]:
# Load the collocations written by run_topmine.py, one entry per line.
# The path comes from COLLOCS_TOPMINE (same value as the literal used before)
# so the reader and the writer cannot drift apart.
# Empty lines are dropped: a plain split('\n') yields a spurious '' entry when
# the file ends with a newline, which would inflate len(topmine_collocs) and
# could be counted as a match against an equally spurious '' reference entry.
with open(COLLOCS_TOPMINE, 'r') as f:
    topmine_collocs = [line for line in f.read().split('\n') if line]

In [493]:
# Rebind both lists to alphabetically sorted copies.
collocs, topmine_collocs = sorted(collocs), sorted(topmine_collocs)

In [494]:
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize every space-separated word of each collocation and join the
# lemmas back into a single space-separated phrase.
lemmatized_collocs = [
    ' '.join(lemmatizer.lemmatize(token) for token in phrase.split(' '))
    for phrase in collocs
]

In [495]:
# Count how many TopMine collocations also occur in the reference list.
# Membership is tested against a set so each lookup is O(1) instead of a
# linear scan of `collocs`; the resulting count is identical.
# NOTE(review): `lemmatized_collocs` is built in an earlier cell but this
# comparison uses the raw `collocs` — confirm which list is intended.
reference_collocs = set(collocs)
score = sum(1 for test_colloc in topmine_collocs if test_colloc in reference_collocs)

In [496]:
# Display the number of TopMine collocations found in the reference list.
score

933

In [497]:
# Number of entries loaded from the TopMine collocations file.
len(topmine_collocs)

22075

In [498]:
# Number of entries in the reference collocations list.
len(collocs)

3646

In [499]:
# Recall: match count divided by the number of reference collocations.
n_reference = len(collocs)
recall = score / n_reference
print('recall:\t\t', recall)

recall:		 0.25589687328579264


In [500]:
# Precision: match count divided by the number of TopMine collocations.
n_extracted = len(topmine_collocs)
precision = score / n_extracted
print('precision:\t', precision)

precision:	 0.042265005662514156
