In [1]:
%load_ext autoreload
%autoreload 2

https://webstruct.readthedocs.io/en/latest/tutorial.html

In [2]:
!pip install webstruct >/dev/null
!pip install lxml >/dev/null
!pip install scikit-learn >/dev/null


In [None]:
import webstruct

# load_trees produces the parsed trees lazily, so the result can be consumed
# only once. Materialise it as a list so that inspecting `trees` in one cell
# does not leave an exhausted iterator for the tokenizer in a later cell.
trees = list(webstruct.load_trees("train/*.html", webstruct.WebAnnotatorLoader()))

In [None]:
# Display the loaded trees.
# NOTE(review): if `trees` is a one-shot iterator, this call consumes it and
# later cells that read `trees` will see nothing — confirm what load_trees returns.
list(trees)

In [None]:
# Tokenize the loaded trees; `tokenize` returns a pair that is unpacked into X and y
# (presumably token sequences and their label sequences — confirm against webstruct docs).
html_tokenizer = webstruct.HtmlTokenizer()
X, y = html_tokenizer.tokenize(trees)

def token_identity(html_token):
    """Return the token's own text under the ``'token'`` feature key."""
    text = html_token.token
    return dict(token=text)

def token_isupper(html_token):
    """Return ``{'isupper': bool}`` using ``str.isupper`` on the token text."""
    all_caps = html_token.token.isupper()
    return dict(isupper=all_caps)

def parent_tag(html_token):
    """Return the ``tag`` of the token's ``parent`` under the ``'parent_tag'`` key."""
    enclosing = html_token.parent
    return dict(parent_tag=enclosing.tag)

def border_at_left(html_token):
    """Return ``{'border_at_left': True}`` exactly when the token's ``index`` is 0."""
    is_first = html_token.index == 0
    return dict(border_at_left=is_first)

from webstruct.feature_extraction import HtmlFeatureExtractor
# Build the extractor from the four per-token feature functions of this cell;
# each of them returns a one-entry dict for a single token.
feature_extractor = HtmlFeatureExtractor(
    token_features = [
        token_identity,
        token_isupper,
        parent_tag,
        border_at_left
    ]
)

In [None]:
# Run the feature extractor over the tokenized input X.
features = feature_extractor.fit_transform(X)

## named entity extraction

https://www.kdnuggets.com/2018/10/named-entity-recognition-classification-scikit-learn.html/2

In [None]:
# https://stackoverflow.com/questions/66059532/attributeerror-crf-object-has-no-attribute-keep-tempfiles

In [1]:
!pip install scikit-learn==0.23.1
!pip install pandas
!pip install sklearn_crfsuite >/dev/null



In [2]:
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Read the NER corpus and keep only its first 100000 rows, then preview it.
df = pd.read_csv('./data/ner_dataset.csv', encoding="ISO-8859-1").iloc[:100000]
df.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
class SentenceGetter(object):
    """Group a token-level table into sentences of ``(word, pos, tag)`` tuples.

    ``data`` is a DataFrame with the columns ``'Sentence #'``, ``'Word'``,
    ``'POS'`` and ``'Tag'``. Only the first row of a sentence needs to carry
    its ``'Sentence #'`` label: missing labels are forward-filled before
    grouping, so the following rows are attached to the same sentence
    (``groupby`` drops rows whose key is missing, which would otherwise
    reduce every sentence to its first token).

    Attributes:
        data: the DataFrame exactly as passed in (it is not modified).
        grouped: Series indexed by sentence label; each value is the list of
            ``(word, pos, tag)`` tuples of that sentence.
        sentences: the values of ``grouped`` as a plain list, in the order of
            the group labels (pandas sorts them, i.e. lexicographically by
            the label string).
        n_sent: number of the sentence that ``get_next`` returns next.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the group.
            return list(zip(s['Word'].values.tolist(),
                            s['POS'].values.tolist(),
                            s['Tag'].values.tolist()))

        # Work on a copy so the caller's frame keeps its original labels.
        labelled = data.assign(**{'Sentence #': data['Sentence #'].ffill()})
        # Selecting the three value columns keeps the grouping key out of the
        # frames handed to agg_func.
        self.grouped = labelled.groupby('Sentence #')[['Word', 'POS', 'Tag']].apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence by number, or ``None`` when there is none.

        Looks up the label ``'Sentence: <n_sent>'`` and advances ``n_sent``
        only when that sentence exists.
        """
        try:
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
        except KeyError:
            # No sentence with this number; leave n_sent unchanged.
            return None
        self.n_sent += 1
        return s

# Group the token table into sentences; each sentence is a list of
# (word, POS, tag) tuples.
getter = SentenceGetter(df)
sentences = getter.sentences

In [5]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence of tuples whose first two items are the word and
    its POS tag. The result holds features of the token itself, of its left
    neighbour (or ``'BOS'`` when ``i`` is not greater than 0) and of its
    right neighbour (or ``'EOS'`` when ``i`` is the last position or beyond).
    """
    token = sent[i][0]
    pos = sent[i][1]

    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': pos,
        'postag[:2]': pos[:2],
    }

    # Left context.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_pos = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right context.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_pos = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats

def sent2features(sent):
    """Return one ``word2features`` dict per position of ``sent``, in order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

def sent2labels(sent):
    """Return the third item (the label) of every ``(token, postag, label)`` triple."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the first item (the token) of every ``(token, postag, label)`` triple."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [6]:
# Turn every sentence into per-token feature dicts (X) and its label list (y),
# then hold out 33% of the sentences for testing with a fixed random_state.
# NOTE: this rebinds the names X and y that the webstruct cells assigned earlier.
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [18]:
# Configure a CRF trained with L-BFGS and fit it on the training sentences.
# c1 and c2 are the L1 and L2 regularisation coefficients in sklearn_crfsuite.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [26]:
# Toy experiment: four single-token sequences with two numeric features each.
# NOTE: this rebinds X_train, y_train and crf, replacing the objects that the
# NER cells created under the same names.
X_train = [
    [{'a': 1, 'b': 4}],
    [{'a': 2, 'b': 3}],
    [{'a': 1, 'b': 3}],
    [{'a': 1, 'b': 2}],
]
y_train = [['a'], ['b'], ['a'], ['a']]
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True,
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [30]:
# Probe the toy model with five single-token sequences, some with feature
# values outside those used for training (a=-1, a=500, b=20).
X_val = [
    [dict(a=1,b=4)],
    [dict(a=2,b=3)],
    [dict(a=-1,b=3)],
    [dict(a=500,b=3)],
    [dict(a=1,b=20)],
]
# NOTE(review): y_val holds four labels while X_val holds five sequences, and
# the predict call below does not use it — confirm the intended labels.
y_val = [
    ['a'], ['b'], ['a'], ['a']
]

crf.predict(X_val)

[['a'], ['b'], ['a'], ['b'], ['a']]

In [23]:
# Show the transition weights of the fitted model.
# NOTE(review): an empty result is what one would expect if every training
# sequence held a single token — check how the training sequences were built.
crf.transition_features_

{}

In [20]:
def print_transitions(trans_features):
    """Print one ``from -> to weight`` line per ``((from, to), weight)`` item."""
    for label_pair, weight in trans_features:
        source, target = label_pair
        line = "%-6s -> %-7s %0.6f" % (source, target, weight)
        print(line)

# Rank the transition weights once (highest first), then print both ends of
# the ranking: the 20 largest and the 20 smallest.
ranked_transitions = Counter(crf.transition_features_).most_common()

print("Top likely transitions:")
print_transitions(ranked_transitions[:20])

print("\nTop unlikely transitions:")
print_transitions(ranked_transitions[-20:])

Top likely transitions:

Top unlikely transitions:


https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#let-s-use-conll-2002-data-to-build-a-ner-system

### pydepta (Update: не получается поставить из-за scrapely и ошибки gcc, которая не гуглится)

In [None]:
# !sudo apt-get install python-dev

In [None]:
# !sudo apt-get install -y libxml2-dev libxslt1-dev

In [1]:
# !pip3 install scrapely

In [8]:
!pip install -i https://pypi.anaconda.org/scrapinghub/label/dev/simple pydepta
!pip install git+https://github.com/scrapinghub/pydepta.git

Looking in indexes: https://pypi.anaconda.org/scrapinghub/label/dev/simple
Collecting pydepta
  Downloading https://pypi.anaconda.org/scrapinghub/label/dev/simple/pydepta/0.2.1a0/pydepta-0.2.1a0.tar.gz (134 kB)
[K     |████████████████████████████████| 134 kB 282 kB/s eta 0:00:01
Building wheels for collected packages: pydepta
  Building wheel for pydepta (setup.py) ... [?25ldone
[?25h  Created wheel for pydepta: filename=pydepta-0.2.1a0-cp37-cp37m-linux_x86_64.whl size=88462 sha256=27132a46f042dd91cff32ab95c36ec98710303a50bf6d2c138a695def7c97d34
  Stored in directory: /home/grigory/.cache/pip/wheels/e8/f5/61/d09995d890968352dfbd68f59992ed65dd6a05992cea1415e6
Successfully built pydepta
Installing collected packages: pydepta
Successfully installed pydepta-0.2.1a0
Collecting git+https://github.com/scrapinghub/pydepta.git
  Cloning https://github.com/scrapinghub/pydepta.git to /tmp/pip-req-build-rmcw9c0p
  Running command git clone -q https://github.com/scrapinghub/pydepta.git /tmp/pip

In [12]:
!pip install pydepta



In [14]:
from pydepta import Depta

ModuleNotFoundError: No module named 'depta'

In [33]:
from pydepta import Depta
# Run Depta on a restaurant page and show the first plain-text item of one result.
d = Depta()
url1 = 'http://www.iens.nl/restaurant/12229/nijmegen-pasta-e-fagioli'
# NOTE(review): the index 8 is hard-coded; it is not clear from this notebook
# why that particular extraction result is chosen — confirm.
seed = d.extract(url=url1)[8]
seed.as_plain_texts()[0]

ModuleNotFoundError: No module named 'pydepta'

### Scrapely ставится на Python 3.7.4 с некоторой ошибкой

https://github.com/scrapy/scrapely/issues/112

In [None]:
!pip install scrapely

Collecting scrapely
  Using cached scrapely-0.14.1.tar.gz (155 kB)
Building wheels for collected packages: scrapely
  Building wheel for scrapely (setup.py) ... [?25lerror
[31m  ERROR: Command errored out with exit status 1:
   command: /home/grigory/anaconda3/envs/crf/bin/python -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-jbu6yecr/scrapely_92d454a99b994853b8d880216ca8e283/setup.py'"'"'; __file__='"'"'/tmp/pip-install-jbu6yecr/scrapely_92d454a99b994853b8d880216ca8e283/setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d /tmp/pip-wheel-bgnw04yk
       cwd: /tmp/pip-install-jbu6yecr/scrapely_92d454a99b994853b8d880216ca8e283/
  Complete output (534 lines):
  running bdist_wheel
  running build
  running build_py
  c

In [5]:
import scrapely

ModuleNotFoundError: No module named 'scrapely'

In [3]:
from scrapely import Scraper
# Create the scraper instance that the following cells train and then query.
s = Scraper()


In [17]:
# Train the scraper on one example page. `data` maps field names to example
# values (presumably text that appears on that page — confirm against scrapely docs).
url1 = 'http://pypi.python.org/pypi/w3lib/1.1'
data = {'name': 'w3lib 1.1', 'author': 'Scrapy project', 'description': 'Library of web-related functions'}
s.train(url1, data)

In [20]:
# Apply the trained scraper to a second page of the same site.
url2 = 'http://pypi.python.org/pypi/Django/1.3'
s.scrape(url2)
# Reference output kept for comparison:
# [{u'author': [u'Django Software Foundation &lt;foundation at djangoproject com&gt;'],
#   u'description': [u'A high-level Python Web framework that encourages rapid development and clean, pragmatic design.'],
#   u'name': [u'Django 1.3']}]

[{'author': ['Django Software Foundation &lt;foundation at djangoproject com&gt;'],
  'description': ['A high-level Python Web framework that encourages rapid development and clean, pragmatic design.'],
  'name': ['Django 1.3']}]

In [24]:
!pip install pymongo>/dev/null
!pip install loguru>/dev/null

In [30]:
import json
import os
import sys

# Root of the items_crawler checkout that provides the `src` package.
# Override it with the ITEMS_CRAWLER_ROOT environment variable on other
# machines; the default keeps the original location.
ITEMS_CRAWLER_ROOT = os.environ.get('ITEMS_CRAWLER_ROOT', '/home/grigory/personal/items_crawler/')
# Insert only once so re-running the cell does not keep growing sys.path.
if ITEMS_CRAWLER_ROOT not in sys.path:
    sys.path.insert(0, ITEMS_CRAWLER_ROOT)

from src.mongo_conn import MongoConn
from scrapely import best_match


In [37]:
# Read the 'mongo' section of the crawler config, select the 'prod' collection
# and obtain a collection handle through MongoConn.
# NOTE(review): CONFIG_PATH is an absolute path on one machine — adjust when
# running elsewhere.
CONFIG_PATH='/home/grigory/personal/items_crawler/configs/config.json'
with open(CONFIG_PATH, 'r') as f:
    config = json.load(f)['mongo']
config['collection'] = 'prod'
mongo_conn = MongoConn(config)
collection = mongo_conn.get_collection()

2021-06-03 20:13:32.724 | DEBUG    | src.mongo_conn:get_collection:13 - Open connection


In [58]:
# Look up the stored record whose _id is the page URL and take its saved HTML.
# NOTE(review): if no record matches, `doc` is presumably None and the next
# line fails with a TypeError — confirm find_one's behaviour for this collection.
url = 'https://topliba.com/books/505386'
doc = collection.find_one({'_id': url})
page_body = doc['dynamic']['page_html']

In [86]:
import requests

# Fetch the live page and list the lines of its HTML that contain '5.00'.
# NOTE(review): the request has no timeout and its status is not checked.
[l for l in requests.get(url).content.decode('utf-8').split('\n') if '5.00' in l]

['        <rating class="pull-left"  _rating="0" _showAvgRating="true" _showAvgRatingText="true" _avgRating="5.00" _ratingsNumber="1" _user="" _url="/books/505386/ratings"></rating>',
 '        <meta itemprop="ratingValue" content="5.00"/>']

In [30]:
# Start from a fresh scraper and train it on three topliba.com book pages,
# each with its own dict of example field values.
s = Scraper()

# Page 1.
url1 = 'https://topliba.com/books/505386'
train_dict = {'title': 'Не родись заклинательницей', 
              'author': 'Марина Котлова',
              'category':['Фэнтези'],
              'pages': '371 стр',
              'rating':'5.00',}
s.train(url1, train_dict)
# Page 2.
train_dict = {'title': 'Иванова, на пересдачу!', 
              'author': 'Татьяна Новикова',
              'rating':'4.35',
              'publish_date': '2020'}
url2 = 'https://topliba.com/books/776174'
s.train(url2, train_dict)

# Page 3.
# NOTE(review): the recorded run of this cell fails here with FragmentNotFound
# while annotating 'series'; the value 'col-lgta' is presumably not text that
# occurs on the page — confirm and replace it with the displayed series name.
train_dict = {'title': 'Лекарки тоже воюют', 
              'author': 'Инга Ветреная',
              'rating':'3.89',
              'category': ['Любовная фантастика', 'Самиздат, сетевая литература'],
              'series': 'col-lgta'
             }
url3 = 'https://topliba.com/books/721277'
s.train(url3, train_dict)

# best_match('kek')

FragmentNotFound: Fragment not found annotating 'series' using: <function best_match.<locals>.func at 0x7f0978a7c4d0>

In [29]:
# Apply the trained scraper to a book page that was not used for training.
# NOTE: this rebinds url2, which the training cell also assigns.
url2 = 'https://topliba.com/books/505385'
s.scrape(url2)

[{'author': ['Владимир Котельников'],
  'title': ['Транспортный самолет Юнкерс Ju 52/3m']}]