In [1]:
import spacy
# Load the small English pipeline used by the tokenization demo below.
nlp = spacy.load("en_core_web_sm")

# Sample passage that the following cells tokenize, filter and lemmatize.
text = """
Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets.
"""

# Running the pipeline produces a Doc; iterating over it yields its tokens.
doc = nlp(text)
token_list = list(doc)

print(token_list)

[
, Dave, watched, as, the, forest, burned, up, on, the, hill, ,, 
, only, a, few, miles, from, his, house, ., The, car, had, 
, been, hastily, packed, and, Marta, was, inside, trying, to, round, 
, up, the, last, of, the, pets, ., ", Where, could, she, be, ?, ", he, wondered, 
, as, he, continued, to, wait, for, Marta, to, appear, with, the, pets, ., 
]


In [2]:
# Keep only the tokens that are not flagged as stop words. Newline and
# punctuation tokens are not stop words, so they stay in the list.
filtered_tokens = [tok for tok in doc if not tok.is_stop]
print(filtered_tokens)

[
, Dave, watched, forest, burned, hill, ,, 
, miles, house, ., car, 
, hastily, packed, Marta, inside, trying, round, 
, pets, ., ", ?, ", wondered, 
, continued, wait, Marta, appear, pets, ., 
]


In [3]:
# Describe every remaining token together with its lemma, one string per token.
lemmas = [
    f"Token: {tok}, lemma: {tok.lemma_}" for tok in filtered_tokens
]

print(lemmas)

['Token: \n, lemma: \n', 'Token: Dave, lemma: Dave', 'Token: watched, lemma: watch', 'Token: forest, lemma: forest', 'Token: burned, lemma: burn', 'Token: hill, lemma: hill', 'Token: ,, lemma: ,', 'Token: \n, lemma: \n', 'Token: miles, lemma: mile', 'Token: house, lemma: house', 'Token: ., lemma: .', 'Token: car, lemma: car', 'Token: \n, lemma: \n', 'Token: hastily, lemma: hastily', 'Token: packed, lemma: pack', 'Token: Marta, lemma: Marta', 'Token: inside, lemma: inside', 'Token: trying, lemma: try', 'Token: round, lemma: round', 'Token: \n, lemma: \n', 'Token: pets, lemma: pet', 'Token: ., lemma: .', 'Token: ", lemma: "', 'Token: ?, lemma: ?', 'Token: ", lemma: "', 'Token: wondered, lemma: wonder', 'Token: \n, lemma: \n', 'Token: continued, lemma: continue', 'Token: wait, lemma: wait', 'Token: Marta, lemma: Marta', 'Token: appear, lemma: appear', 'Token: pets, lemma: pet', 'Token: ., lemma: .', 'Token: \n, lemma: \n']


In [4]:
# Show the numeric vector of the second filtered token
# (index 0 is the newline token that opens the passage).
filtered_tokens[1].vector

array([ 1.8371646 ,  1.4529226 , -1.6147211 ,  0.678362  , -0.6594443 ,
        1.6417935 ,  0.5796405 ,  2.3021278 , -0.13260496,  0.5750932 ,
        1.5654886 , -0.6938864 , -0.59607106, -1.5377437 ,  1.9425622 ,
       -2.4552505 ,  1.2321601 ,  1.0434952 , -1.5102385 , -0.5787632 ,
        0.12055647,  3.6501784 ,  2.6160972 , -0.5710199 , -1.5221789 ,
        0.00629176,  0.22760668, -1.922073  , -1.6252862 , -4.226225  ,
       -3.495663  , -3.312053  ,  0.81387717, -0.00677544, -0.11603224,
        1.4620426 ,  3.0751472 ,  0.35958546, -0.22527039, -2.743926  ,
        1.269633  ,  4.606786  ,  0.34034157, -2.1272311 ,  1.2619178 ,
       -4.209798  ,  5.452852  ,  1.6940253 , -2.5972986 ,  0.95049495,
       -1.910578  , -2.374927  , -1.4227567 , -2.2528825 , -1.799806  ,
        1.607501  ,  2.9914255 ,  2.8065152 , -1.2510269 , -0.54964066,
       -0.49980402, -1.3882618 , -0.470479  , -2.9670253 ,  1.7884955 ,
        4.5282774 , -1.2602427 , -0.14885521,  1.0419178 , -0.08

In [5]:
import tarfile

# Unpack the aclImdb archive into the current working directory.
# NOTE(review): extractall() writes whatever member paths the archive
# contains, so only run this on an archive obtained from a trusted source.
fname = 'aclImdb_v1.tar.gz'
with tarfile.open(fname, "r:gz") as tar:
    # The `with` block closes the archive on exit; no explicit close() needed.
    tar.extractall()

In [6]:
import os
import random                                  

def load_training_data(
    data_directory: str = "aclImdb/train",
    split: float = 0.8,
    limit: int = 0
) -> tuple:
    """Load labelled reviews from disk and split them into two parts.

    Reads every ``*.txt`` file under ``<data_directory>/pos`` and
    ``<data_directory>/neg``, replaces ``<br />`` markup with a blank line,
    skips files that contain only whitespace, and pairs each text with a
    ``{"cats": {"pos": bool, "neg": bool}}`` label dictionary.

    Args:
        data_directory: Directory that contains the ``pos`` and ``neg`` folders.
        split: Fraction of the (optionally limited) reviews placed in the
            first returned list.
        limit: If non-zero, keep only this many reviews after shuffling.

    Returns:
        A ``(first_part, second_part)`` tuple of lists of
        ``(text, label_dict)`` pairs, shuffled with the ``random`` module.
    """
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = os.path.join(data_directory, label)
        # os.listdir() returns entries in arbitrary order; sorting first makes
        # the shuffle below reproducible when the caller seeds `random`.
        for file_name in sorted(os.listdir(labeled_directory)):
            if not file_name.endswith(".txt"):
                continue
            review_path = os.path.join(labeled_directory, file_name)
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default encoding (assumes the dataset files are
            # UTF-8 -- TODO confirm).
            with open(review_path, encoding="utf-8") as f:
                text = f.read().replace("<br />", "\n\n")
            if not text.strip():
                continue
            spacy_label = {
                "cats": {
                    "pos": "pos" == label,
                    "neg": "neg" == label}
            }
            reviews.append((text, spacy_label))
    random.shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    # Keep the `split` fraction intact instead of rebinding it to an index.
    split_index = int(len(reviews) * split)
    return reviews[:split_index], reviews[split_index:]

In [7]:
# Peek at the first (text, label) pair of the first returned part.
# Every call re-reads and re-shuffles the whole directory.
load_training_data(data_directory="aclImdb/train", split=0.8, limit=0)[0][0]

("This movie really kicked some ass. I watched it over and over and it never got boring. Angelina Jolie really kicked some ass in the movie, you should see the movie, you won't be disappointed. And another reason you should see the movie is because the guy from The X-Files is in it, David Duchovny.",
 {'cats': {'pos': True, 'neg': False}})

In [8]:
# Peek at the second (text, label) pair of the first returned part.
# This is a fresh load and shuffle, independent of the previous cell.
load_training_data(data_directory="aclImdb/train", split=0.8, limit=0)[0][1]

("I'm basing this on my observations of one episode I saw last night (9/27/06). I don't think I'll be watching again. The acting was totally wooden, the plot completely predictable, the ending totally unrealistic -- I mean who would believe a 30 million dollar judgment for the death of a recovering drug addict with terminal cancer? The lead actor (Victor Garber) seemed so uncomfortable, almost embarrassed in his role -- perhaps he realized how bad the writing was!! I fully realize that the drama offered this season is pretty poor, but they can surely find better writers. Maybe they are outsourcing the writing to India or China!! I'll bet we won't be seeing this one next season!",
 {'cats': {'pos': False, 'neg': True}})

In [9]:
def evaluate_model(tokenizer, textcat, test_data: list) -> dict:
    """Compute precision, recall and F-score for the ``'pos'`` category.

    Args:
        tokenizer: Callable applied to each raw review text before it is
            handed to ``textcat.pipe``.
        textcat: Object whose ``pipe`` method yields, for every input, an
            object with a ``cats`` mapping that contains a ``'pos'`` score.
        test_data: List of ``(text, {"cats": {"pos": bool, ...}})`` pairs.

    Returns:
        Dict with the keys ``"precision"``, ``"recall"`` and ``"f-score"``.
        A review counts as predicted positive when its ``'pos'`` score is
        at least 0.5.
    """
    texts, gold = zip(*test_data)
    docs = (tokenizer(raw_text) for raw_text in texts)
    # The true-positive counter starts at a tiny non-zero value so that the
    # denominators below can never be 0.
    true_pos, false_pos, true_neg, false_neg = 1e-8, 0, 0, 0
    for idx, scored in enumerate(textcat.pipe(docs)):
        gold_cats = gold[idx]['cats']
        pos_score = scored.cats['pos']
        is_positive = gold_cats['pos']
        predicted_positive = pos_score >= 0.5
        if is_positive and predicted_positive:
            true_pos += 1
        elif is_positive:
            false_neg += 1
        elif predicted_positive:
            false_pos += 1
        else:
            true_neg += 1
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f_score = 2 * precision * recall / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}

In [10]:
import os
import random
import spacy
from spacy.util import minibatch, compounding

def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20,
    output_dir: str = "model_artifacts") -> None:
    """Train a text categorizer on labelled reviews and save the pipeline.

    Loads ``en_core_web_sm``, adds a ``textcat`` component with the labels
    ``pos`` and ``neg`` if the pipeline has none, trains only that component,
    prints the loss and evaluation metrics after every iteration, and finally
    writes the pipeline to ``output_dir``.

    NOTE(review): ``create_pipe`` / ``begin_training`` / ``nlp.update(texts,
    annotations)`` look like the spaCy v2 training API -- confirm against the
    installed spaCy version.

    Args:
        training_data: List of ``(text, {"cats": {...}})`` pairs to train on.
            The list itself is not modified.
        test_data: List of pairs in the same format, passed to
            ``evaluate_model`` after every iteration.
        iterations: Number of passes over ``training_data``.
        output_dir: Directory the trained pipeline is written to.
    """
    # Build the pipeline
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Shuffle a private copy so the caller's list is never reordered in place.
    shuffled_data = list(training_data)

    # Train only textcat: every other pipe is disabled for the training loop
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Начинаем обучение")
        print("Loss\t\tPrec.\tRec.\tF-score")
        # Generator of an endless series of batch sizes; it is created once,
        # so the series continues across iterations rather than restarting.
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for _ in range(iterations):
            loss = {}
            random.shuffle(shuffled_data)
            for batch in minibatch(shuffled_data, size=batch_sizes):
                text, labels = zip(*batch)
                nlp.update(
                    text,
                    labels,
                    drop=0.2,
                    sgd=optimizer,
                    losses=loss
                )
            # Evaluate with the optimizer's averaged parameters swapped in
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data
                )
                print(
                    f"{loss['textcat']:9.6f}\t"
                    f"{evaluation_results['precision']:.3f}\t"
                    f"{evaluation_results['recall']:.3f}\t"
                    f"{evaluation_results['f-score']:.3f}"
                )

    # Save the model, again with the averaged parameters applied
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)

In [11]:
# Cap the data at 20,000 shuffled reviews (default 0.8 split), then train
# for 10 iterations.
train, test = load_training_data(limit=20000)
train_model(training_data=train, test_data=test, iterations=10)

Начинаем обучение
Loss		Prec.	Rec.	F-score
15.666554	0.849	0.806	0.827
 0.178817	0.864	0.820	0.841
 0.082166	0.861	0.838	0.849
 0.069845	0.867	0.843	0.855
 0.060915	0.872	0.849	0.860
 0.051639	0.871	0.855	0.863
 0.045242	0.876	0.856	0.866
 0.038612	0.878	0.864	0.871
 0.037759	0.873	0.867	0.870
 0.029478	0.876	0.869	0.873


In [12]:
# Sample review text passed to test_model() below.
# The raw "<br /><br />" markup is left in the string as-is.
TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)
"""

In [13]:
def test_model(input_data: str, model_path: str = "model_artifacts"):
    """Classify one review with a saved pipeline and print the verdict.

    Args:
        input_data: Raw review text to classify.
        model_path: Directory of the saved pipeline to load.

    Prints the review text, the predicted sentiment and the score of the
    winning category; returns nothing.
    """
    # Load the saved model
    loaded_model = spacy.load(model_path)
    parsed_text = loaded_model(input_data)

    # Pick the category with the higher score; a tie is reported as negative
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Положительный отзыв"
        score = parsed_text.cats["pos"]
    else:
        prediction = "Негативный отзыв"
        score = parsed_text.cats["neg"]
    print(
        f"Текст обзора: {input_data}\n"
        f"Предсказание: {prediction}\n"
        f"Score: {score:.3f}"
    )

In [14]:
# Classify the sample review with the saved model and print the result.
test_model(input_data=TEST_REVIEW)

Текст обзора: 
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)

Предсказание: Положительный отзыв
Score: 0.909
