In [1]:
%pip install spacy==3.6.1
%pip install ml-datasets
!python -m spacy download en_core_web_md

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-md==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.6.0/en_core_web_md-3.6.0-py3-none-any.whl (42.8 MB)
     ---------------------------------------- 0.0/42.8 MB ? eta -:--:--
     --------------------------------------- 0.0/42.8 MB 660.6 kB/s eta 0:01:05
     --------------------------------------- 0.1/42.8 MB 656.4 kB/s eta 0:01:06
     --------------------------------------- 0.1/42.8 MB 939.4 kB/s eta 0:00:46
     ---------------------------------------- 0.2/42.8 MB 1.3 MB/s eta 0:00:34
     ---------------------------------------- 0.3/42.8 MB 1.2 MB/s eta 0:00:35
     ---------------------------------------- 0.4/42.8 MB 1.5 MB/s eta 0:00:29
     ---------------------------------------- 0.5/42.8 MB 1.6 MB/s eta 0:00:27
      --------------------------------------- 0.7/42.8 MB 1.8 MB

In [2]:
import spacy
from tqdm.auto import tqdm
from spacy.tokens import DocBin
from ml_datasets import imdb
# Load the IMDB train/validation splits and the pretrained English pipeline
train_data, valid_data = imdb() # movie reviews
nlp = spacy.load("en_core_web_md") # may take up to 5 minutes

In [3]:
# An example from the dataset (first training item)
train_data[0]

('Roommates Sugar and Bobby Lee are abducted by menacing dudes while out shopping one day and taken back to a secluded island that the girls reluctantly tell the thugs that they last visited when they were ten years of age and that a fortune is located on. All that just pretty much bookends a movie that is pretty much one long flashback about the girls first visit to the island and subsequent fight with a cannibalistic family.\n\n\n\nThis one is extremely horribly acted by everyone involved to the point that I started feeling bad for poor Hank Worden who truly deserved much MUCH better. As much as I didn\'t like "Barracuda" (that\'s on the same DVD) I have to admit that this film makes that one look like Citizen Kane.\n\n\n\nEye Candy: one pair of tits (they might belong to Kirsten Baker) \n\n\n\nMy Grade: F \n\n\n\nDark Sky DVD Extras: Vintage ads for various drive-in food; and Trailers for "Bonnie\'s Kids" (features nudity), "the Centerfold Girls", "Part-time Wife" (features nudity),

In [4]:
# Names of the components in the loaded pipeline
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
# With as_tuples=True, nlp.pipe expects an iterable of (text, context) tuples.
# train_data[0] is a single tuple, so iterating it would hand the bare text and
# label strings to the pipeline and fail on unpacking; a one-element slice keeps
# the expected shape. The call is lazy: it only returns a generator here.
nlp.pipe(train_data[:1], as_tuples=True)

<generator object Language.pipe at 0x000001734750AC00>

In [6]:
def make_docs(data):
    """
    Turn labelled texts into spaCy docs annotated for text classification.

    data: list(tuple(text, label)); the label 'neg' marks a negative example,
        any other label is treated as positive.
    returns: list of spaCy Doc objects whose ``cats`` carry one-hot scores
        under the keys "positive" and "negative".
    """
    # Feed everything through nlp.pipe instead of calling nlp(text) once per
    # text (much faster). With as_tuples=True every (text, label) pair comes
    # back as (doc, label): the second element is returned as it is.
    annotated = nlp.pipe(data, as_tuples=True)
    progress = tqdm(annotated, total=len(data))

    result = []
    for parsed, tag in progress:
        is_negative = tag == 'neg'
        # One-hot encoding of the sentiment
        parsed.cats["positive"] = 0 if is_negative else 1
        parsed.cats["negative"] = 1 if is_negative else 0
        result.append(parsed)
    return result

In [7]:
# This is just an example; a larger number of texts can be used
num_texts = 1000
train_docs = make_docs(train_data[:num_texts])

  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
# Pack the training docs into a DocBin and save it to disk as a binary file
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("train.spacy")
# Repeat for the validation data (same num_texts cap); doc_bin is rebound here
valid_docs = make_docs(valid_data[:num_texts])
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("valid.spacy")

  0%|          | 0/1000 [00:00<?, ?it/s]

На этом месте мы идём на https://spacy.io/usage/training#quickstart, настраиваем под себя конфиг в разделе *Quickstart*, вручную(!) копируем его в base_config.cfg и указываем правильные пути до обучающего и валидационного файлов (train.spacy и valid.spacy). После этого некоторые поля, возможно, придётся дописать вручную. Главное — проверить, что есть вот такие строчки:

```ini
[nlp]
lang = "en"
pipeline = ["textcat"]

batch_size = 1000

[components]

[components.textcat]
factory = "textcat"
```


Или можно просто взять мой приложенный конфиг — он рабочий.

In [9]:
# What happens here: base_config.cfg has been filled in by hand, and this command
# generates config.cfg from it
! python -m spacy init fill-config base_config.cfg config.cfg
# After running this command, reduce max_steps and set the train and dev paths
# (in the generated config.cfg)

In [10]:
# тренируем модель
! python -m spacy train config.cfg --output ./output
# данных мало, сильно на метрики внимания не обращаем

[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.25       32.30    0.32
  0     200         54.26       53.49    0.53
  0     400         39.80       74.71    0.75
  0     600         42.21       78.50    0.78
  0     800         29.28       78.77    0.79
  1    1000         21.25       71.08    0.71
  1    1200          3.63       78.68    0.79
  1    1400          3.71       80.58    0.81
  1    1600          3.47       79.70    0.80
  1    1800          2.18       77.06    0.77
  2    2000          4.83       77.83    0.78
[38;5;2m✔ Saved pipeline to output directory[0m
output\model-last


In [11]:
# Load the best model saved by the training run
nlp = spacy.load("output/model-best")
print("type : ‘quit’ to exit")
# Keep predicting the sentiment until the user types quit
text = input("Please enter example input: ") # the text does not have to be in base form
while text != "quit":
    print(text)
    doc = nlp(text)
    print(doc.cats)
    # The "positive" score alone decides which verdict is printed
    verdict = "the sentiment is positive" if doc.cats['positive'] >.5 else "the sentiment is negative"
    print(verdict)
    text = input("Please enter example input: ")

# The scores stay close to 0.5: the model is not very confident, more data is needed

type : ‘quit’ to exit
nice
{'positive': 0.5492357611656189, 'negative': 0.4507642388343811}
the sentiment is positive
bad review
{'positive': 0.3284081816673279, 'negative': 0.6715918183326721}
the sentiment is negative
bad
{'positive': 0.3714073896408081, 'negative': 0.6285925507545471}
the sentiment is negative


In [12]:
# Spot-check a single phrase with the pipeline currently bound to `nlp`
nlp('good review').cats

{'positive': 0.4794910252094269, 'negative': 0.5205089449882507}