In [1]:
import mindspore;mindspore.run_check()

MindSpore version:  2.0.0rc1.dev20230507
The result of multiplication calculation is correct, MindSpore has been installed on platform [GPU] successfully!


In [2]:
pip install tqdm requests

Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import shutil
import requests
import tempfile
from tqdm import tqdm
from typing import IO
from pathlib import Path

# Set the storage path to `home_path/.mindspore_examples`.
cache_dir ='./dataset' 
#Path.home() / '.mindspore_examples'

def http_get(url: str, temp_file: IO):
    """Download data by using the requests library and visualize the process by using the tqdm library."""
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit='B', total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()

def download(file_name: str, url: str):
    """Download data and save it with the specified name."""
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
    cache_path = os.path.join(cache_dir, file_name)
    cache_exist = os.path.exists(cache_path)
    if not cache_exist:
        with tempfile.NamedTemporaryFile() as temp_file:
            http_get(url, temp_file)
            temp_file.flush()
            temp_file.seek(0)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)
    return cache_path


In [2]:
imdb_path = download('aclImdb_v1.tar.gz', 'https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/aclImdb_v1.tar.gz')
imdb_path

'./dataset/aclImdb_v1.tar.gz'

In [3]:
import re
import six
import string
import tarfile

class IMDBData():
    """IMDB dataset loader.

    Load the IMDB dataset and process it as a Python iteration object.

    """
    label_map = {
        "pos": 1,
        "neg": 0
    }
    def __init__(self, path, mode="train"):
        self.mode = mode
        self.path = path
        self.docs, self.labels = [], []

        self._load("pos")
        self._load("neg")

    def _load(self, label):
        pattern = re.compile(r"aclImdb/{}/{}/.*\.txt$".format(self.mode, label))
        # Load data to the memory.
        with tarfile.open(self.path) as tarf:
            tf = tarf.next()
            while tf is not None:
                if bool(pattern.match(tf.name)):
                    # Segment text, remove punctuations and special characters, and convert text to lowercase.
                    self.docs.append(str(tarf.extractfile(tf).read().rstrip(six.b("\n\r"))
                                         .translate(None, six.b(string.punctuation)).lower()).split())
                    self.labels.append([self.label_map[label]])
                tf = tarf.next()

    def __getitem__(self, idx):
        return self.docs[idx], self.labels[idx]

    def __len__(self):
        return len(self.docs)


In [4]:
imdb_train = IMDBData(imdb_path, 'train')
len(imdb_train)

25000

In [5]:
imdb_test = IMDBData(imdb_path, 'test')
len(imdb_test)

25000

In [6]:
imdb_train.__getitem__(1)

(["b'zentropa",
  'is',
  'the',
  'most',
  'original',
  'movie',
  'ive',
  'seen',
  'in',
  'years',
  'if',
  'you',
  'like',
  'unique',
  'thrillers',
  'that',
  'are',
  'influenced',
  'by',
  'film',
  'noir',
  'then',
  'this',
  'is',
  'just',
  'the',
  'right',
  'cure',
  'for',
  'all',
  'of',
  'those',
  'hollywood',
  'summer',
  'blockbusters',
  'clogging',
  'the',
  'theaters',
  'these',
  'days',
  'von',
  'triers',
  'followups',
  'like',
  'breaking',
  'the',
  'waves',
  'have',
  'gotten',
  'more',
  'acclaim',
  'but',
  'this',
  'is',
  'really',
  'his',
  'best',
  'work',
  'it',
  'is',
  'flashy',
  'without',
  'being',
  'distracting',
  'and',
  'offers',
  'the',
  'perfect',
  'combination',
  'of',
  'suspense',
  'and',
  'dark',
  'humor',
  'its',
  'too',
  'bad',
  'he',
  'decided',
  'handheld',
  'cameras',
  'were',
  'the',
  'wave',
  'of',
  'the',
  'future',
  'its',
  'hard',
  'to',
  'say',
  'who',
  'talked',
  'hi

In [7]:
imdb_train.__getitem__(12)

(["b'piece",
  'of',
  'subtle',
  'art',
  'maybe',
  'a',
  'masterpiece',
  'doubtlessly',
  'a',
  'special',
  'story',
  'about',
  'the',
  'ambiguity',
  'of',
  'existence',
  'tale',
  'in',
  'kafka',
  'style',
  'about',
  'impossibility',
  'of',
  'victory',
  'or',
  'surviving',
  'in',
  'a',
  'perpetual',
  'strange',
  'world',
  'the',
  'life',
  'is',
  'in',
  'this',
  'film',
  'only',
  'exercise',
  'of',
  'adaptation',
  'lesson',
  'about',
  'limits',
  'and',
  'original',
  'sin',
  'about',
  'the',
  'frailty',
  'of',
  'innocence',
  'and',
  'error',
  'of',
  'his',
  'waysbr',
  'br',
  'leopold',
  'kessle',
  'is',
  'another',
  'joseph',
  'k',
  'images',
  'of',
  'trial',
  'and',
  'same',
  'ambiguous',
  'woman',
  'and',
  'europa',
  'is',
  'symbol',
  'of',
  'basic',
  'crisis',
  'who',
  'has',
  'many',
  'aspects',
  'like',
  'chimeric',
  'wars',
  'or',
  'unavailing',
  'search',
  'of',
  'truthessencegolden',
  'agebr',

In [8]:
import mindspore.dataset as ds

def load_imdb(imdb_path):
    imdb_train = ds.GeneratorDataset(IMDBData(imdb_path, "train"), column_names=["text", "label"], shuffle=False)
    imdb_test = ds.GeneratorDataset(IMDBData(imdb_path, "test"), column_names=["text", "label"], shuffle=False)
    return imdb_train, imdb_test


In [9]:
imdb_train, imdb_test = load_imdb(imdb_path)
imdb_train
imdb_test

<mindspore.dataset.engine.datasets_user_defined.GeneratorDataset at 0x7ff419189040>

In [10]:
len(imdb_train)

25000

In [11]:
x= next(imdb_train.create_tuple_iterator())
x

[Tensor(shape=[121], dtype=String, value= ["b'zentropa", 'has', 'much', 'in', 'common', 'with', 'the', 'third',
  'man', 'another', 'noirlike', 'film', 'set', 'among', 'the', 'rubble',
  'of', 'postwar', 'europe', 'like', 'ttm', 'there', 'is', 'much',
  'inventive', 'camera', 'work', 'there', 'is', 'an', 'innocent',
  'american', 'who', 'gets', 'emotionally', 'involved', 'with', 'a',
  'woman', 'he', 'doesnt', 'really', 'understand', 'and', 'whose',
  'naivety', 'is', 'all', 'the', 'more', 'striking', 'in', 'contrast',
  'with', 'the', 'nativesbr', 'br', 'but', 'id', 'have', 'to', 'say',
  'that', 'the', 'third', 'man', 'has', 'a', 'more', 'wellcrafted',
  'storyline', 'zentropa', 'is', 'a', 'bit', 'disjointed', 'in', 'this',
  'respect', 'perhaps', 'this', 'is', 'intentional', 'it', 'is',
  'presented', 'as', 'a', 'dreamnightmare', 'and', 'making', 'it', 'too',
  'coherent', 'would', 'spoil', 'the', 'effect', 'br', 'br', 'this',
  'movie', 'is', 'unrelentingly', 'grimnoir', 'in', 'mor

In [12]:
import zipfile
import numpy as np

def load_glove(glove_path):
    glove_100d_path = os.path.join(cache_dir, 'glove.6B.100d.txt')
    if not os.path.exists(glove_100d_path):
        glove_zip = zipfile.ZipFile(glove_path)
        glove_zip.extractall(cache_dir)

    embeddings = []
    tokens = []
    with open(glove_100d_path, encoding='utf-8') as gf:
        for glove in gf:
            word, embedding = glove.split(maxsplit=1)
            tokens.append(word)
            embeddings.append(np.fromstring(embedding, dtype=np.float32, sep=' '))
    # Add the embeddings corresponding to the special placeholders <unk> and <pad>.
    embeddings.append(np.random.rand(100))
    embeddings.append(np.zeros((100,), np.float32))

    vocab = ds.text.Vocab.from_list(tokens, special_tokens=["<unk>", "<pad>"], special_first=False)
    embeddings = np.array(embeddings).astype(np.float32)
    return vocab, embeddings


In [13]:
glove_path = download('glove.6B.zip', 'https://mindspore-website.obs.myhuaweicloud.com/notebook/datasets/glove.6B.zip')
vocab, embeddings = load_glove(glove_path)
len(vocab.vocab())


400002

In [14]:
len(vocab.vocab())
#len(embeddings.embeddings())

400002

In [15]:
idx = vocab.tokens_to_ids('the')
embedding = embeddings[idx]
idx, embedding


(0,
 array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
        -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
         0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
        -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
         0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
        -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
         0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
         0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
        -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
        -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
        -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
        -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
        -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
        -1.2526  ,  0.071624,  0.7

In [16]:
idx = vocab.tokens_to_ids('<pad>')
embedding = embeddings[idx]
idx, embedding

(400001,
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=float32))

In [17]:
import mindspore as ms

lookup_op = ds.text.Lookup(vocab, unknown_token='<unk>')
pad_op = ds.transforms.PadEnd([500], pad_value=vocab.tokens_to_ids('<pad>'))
type_cast_op = ds.transforms.TypeCast(ms.float32)


In [18]:
for i, data in enumerate(imdb_train):
    text, label = data  # if data is a list with 2 elements: text and label
    print(f"Sample {i}:")
    print(f"Text: {text}")
    print(f"Label: {label}")
    if i == 9:
        break


Sample 0:
Text: ["b'zentropa" 'has' 'much' 'in' 'common' 'with' 'the' 'third' 'man'
 'another' 'noirlike' 'film' 'set' 'among' 'the' 'rubble' 'of' 'postwar'
 'europe' 'like' 'ttm' 'there' 'is' 'much' 'inventive' 'camera' 'work'
 'there' 'is' 'an' 'innocent' 'american' 'who' 'gets' 'emotionally'
 'involved' 'with' 'a' 'woman' 'he' 'doesnt' 'really' 'understand' 'and'
 'whose' 'naivety' 'is' 'all' 'the' 'more' 'striking' 'in' 'contrast'
 'with' 'the' 'nativesbr' 'br' 'but' 'id' 'have' 'to' 'say' 'that' 'the'
 'third' 'man' 'has' 'a' 'more' 'wellcrafted' 'storyline' 'zentropa' 'is'
 'a' 'bit' 'disjointed' 'in' 'this' 'respect' 'perhaps' 'this' 'is'
 'intentional' 'it' 'is' 'presented' 'as' 'a' 'dreamnightmare' 'and'
 'making' 'it' 'too' 'coherent' 'would' 'spoil' 'the' 'effect' 'br' 'br'
 'this' 'movie' 'is' 'unrelentingly' 'grimnoir' 'in' 'more' 'than' 'one'
 'sense' 'one' 'never' 'sees' 'the' 'sun' 'shine' 'grim' 'but'
 'intriguing' 'and' "frightening'"]
Label: [1]
Sample 1:
Text: ["b'z

In [19]:
imdb_train = imdb_train.map(operations=[lookup_op, pad_op], input_columns=['text'])
imdb_train = imdb_train.map(operations=[type_cast_op], input_columns=['label'])

imdb_test = imdb_test.map(operations=[lookup_op, pad_op], input_columns=['text'])
imdb_test = imdb_test.map(operations=[type_cast_op], input_columns=['label'])


In [20]:
for i, data in enumerate(imdb_train):
    text, label = data  # if data is a list with 2 elements: text and label
    print(f"Sample {i}:")
    print(f"Text: {text}")
    print(f"Label: {label}")
    if i == 4:
        break


Sample 0:
Text: [400000     31    181      6    861     17      0    245    300    170
 400000    319    208    244      0   7860      3   9752    525    117
 154351     63     14    181  24065   3534    161     63     14     29
   4114    140     38   1666  10809    791     17      7    787     18
 136283    588   1906      5    507 110195     14     64      0     56
   4517      6   3313     17      0 400000  30410     34   9849     33
      4    203     12      0    245    300     31      7     56 400000
  13303 223958     14      7   1594  42131      6     37   1983   1472
     37     14  15493     20     14   1923     19      7 400000      5
    433     20    317  17428     54  22209      0   1261  30410  30410
     37   1005     14 109159 400000      6     56     73     48   1380
     48    332   3109      0   1662  11835   8973     34  12792      5
 400000 400001 400001 400001 400001 400001 400001 400001 400001 400001
 400001 400001 400001 400001 400001 400001 400001 400001 4000

In [21]:
imdb_train, imdb_valid = imdb_train.split([0.7, 0.3])
len(imdb_train)

17500

In [22]:
len(imdb_valid)

7500

In [23]:
imdb_train = imdb_train.batch(64, drop_remainder=True)
imdb_valid = imdb_valid.batch(64, drop_remainder=True)

In [24]:
from mindspore import Tensor, nn, ops
from mindspore.common.initializer import Normal

class CNN(nn.Cell):
    def __init__(self, embeddings, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        #super(CNN, self).__init__()
        #self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        super().__init__()
        vocab_size, embedding_dim = embeddings.shape
        self.embedding = nn.Embedding(vocab_size, embedding_dim, embedding_table=ms.Tensor(embeddings), padding_idx=pad_idx)

        self.convs = nn.CellList([
            nn.Conv2d(1, n_filters, (fs, embedding_dim), pad_mode = 'valid', has_bias=True) for fs in filter_sizes
        ])

        self.fc = nn.Dense(len(filter_sizes) * n_filters, output_dim, has_bias=True, weight_init=Normal(0.02))
        
        self.dropout = nn.Dropout(1 - dropout)

        self.relu = nn.ReLU()
        #self.reshape = nn.Reshape()
        self.cat = ops.Concat(axis=1)

    def construct(self, text):
        # embedded = [batch size, 1, sent len, emb dim]
        embedded = self.embedding(text)
        
        embedded = embedded.unsqueeze(1)
        
        squeeze = ops.Squeeze(3)

        conved = [squeeze(self.relu(conv(embedded))) for conv in self.convs]

        
        squeeze = ops.Squeeze(2)
        #max_pool = nn.MaxPool1d(kernel_size=None)
        pooled = [squeeze(nn.MaxPool1d(kernel_size=conv.shape[2])(conv)) for conv in conved]
        

        # cat = [batch size, n_filters * len(filter_sizes)]
        cat = self.dropout(self.cat(tuple(pooled)))

        return self.fc(cat)


In [25]:
#INPUT_DIM = len(TEXT.vocab)
#EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3,4,5]
OUTPUT_DIM = 1
DROPOUT = 0.5
#PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
lr = 0.001
PAD_IDX = vocab.tokens_to_ids('<pad>')

model = CNN(embeddings, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
loss_fn = nn.BCEWithLogitsLoss(reduction='mean')
optimizer = nn.Adam(model.trainable_params(), learning_rate=lr)




In [26]:
def forward_fn(data, label):
    logits = model(data)
    loss = loss_fn(logits, label)
    return loss

grad_fn = ms.value_and_grad(forward_fn, None, optimizer.parameters)

def train_step(data, label):
    loss, grads = grad_fn(data, label)
    optimizer(grads)
    return loss

def train_one_epoch(model, train_dataset, epoch=0):
    model.set_train()
    total = train_dataset.get_dataset_size()
    loss_total = 0
    step_total = 0
    with tqdm(total=total) as t:
        t.set_description('Epoch %i' % epoch)
        for i in train_dataset.create_tuple_iterator():
            loss = train_step(*i)
            loss_total += loss.asnumpy()
            step_total += 1
            t.set_postfix(loss=loss_total/step_total)
            t.update(1)


In [27]:
def binary_accuracy(preds, y):
    """
    Calculate the accuracy of each batch.
    """
    # Round off the predicted value.
    rounded_preds = np.around(ops.sigmoid(preds).asnumpy())
    correct = (rounded_preds == y).astype(np.float32)
    acc = correct.sum() / len(correct)
    return acc


In [28]:
def evaluate(model, test_dataset, criterion, epoch=0):
    total = test_dataset.get_dataset_size()
    epoch_loss = 0
    epoch_acc = 0
    step_total = 0
    model.set_train(False)

    with tqdm(total=total) as t:
        t.set_description('Epoch %i' % epoch)
        for i in test_dataset.create_tuple_iterator():
            predictions = model(i[0])
            loss = criterion(predictions, i[1])
            epoch_loss += loss.asnumpy()

            acc = binary_accuracy(predictions, i[1])
            epoch_acc += acc

            step_total += 1
            t.set_postfix(loss=epoch_loss/step_total, acc=epoch_acc/step_total)
            t.update(1)

    return epoch_loss / total


In [29]:
num_epochs = 5
best_valid_loss = float('inf')
ckpt_file_name = os.path.join(cache_dir, 'analysis.ckpt')

for epoch in range(num_epochs):
    train_one_epoch(model, imdb_train, epoch)
    valid_loss = evaluate(model, imdb_valid, loss_fn, epoch)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        ms.save_checkpoint(model, ckpt_file_name)

Epoch 0: 100%|██████████| 273/273 [00:12<00:00, 21.72it/s, loss=0.455]
Epoch 0: 100%|██████████| 117/117 [00:08<00:00, 13.64it/s, acc=0.857, loss=0.339]
Epoch 1: 100%|██████████| 273/273 [00:09<00:00, 29.13it/s, loss=0.32] 
Epoch 1: 100%|██████████| 117/117 [00:08<00:00, 13.92it/s, acc=0.895, loss=0.267]
Epoch 2: 100%|██████████| 273/273 [00:09<00:00, 29.89it/s, loss=0.256]
Epoch 2: 100%|██████████| 117/117 [00:08<00:00, 13.55it/s, acc=0.911, loss=0.227]
Epoch 3: 100%|██████████| 273/273 [00:09<00:00, 30.05it/s, loss=0.2]  
Epoch 3: 100%|██████████| 117/117 [00:08<00:00, 13.36it/s, acc=0.945, loss=0.158]
Epoch 4: 100%|██████████| 273/273 [00:09<00:00, 29.99it/s, loss=0.168]
Epoch 4: 100%|██████████| 117/117 [00:08<00:00, 13.75it/s, acc=0.96, loss=0.124] 


In [30]:
for element in imdb_train.take(1):
    print(element[0].shape, element[1].shape)


(64, 500) (64, 1)


In [31]:
param_dict = ms.load_checkpoint(ckpt_file_name)
ms.load_param_into_net(model, param_dict)


([], [])

In [32]:
imdb_test = imdb_test.batch(64)
evaluate(model, imdb_test, loss_fn)


Epoch 0: 100%|██████████| 391/391 [00:22<00:00, 17.58it/s, acc=0.859, loss=0.342]


0.34162890688156533

In [45]:
def pad_sentence(sentence, max_length, pad_token='<pad>'):
    tokenized = sentence.lower().split()
    
    if len(tokenized) < max_length:
        tokenized += [pad_token] * (max_length - len(tokenized))
    else:
        tokenized = tokenized[:max_length]
    
    return tokenized

score_map = {
    1: "Positive",
    0: "Negative"
}

def predict_sentiment(model, vocab, sentence, max_length=500):
    model.set_train(False)
    
    # Pad sentence
    tokenized = pad_sentence(sentence, max_length)
    
    # Convert tokens to ids
    indexed = [vocab.tokens_to_ids(token) for token in tokenized]
    
    # Create tensor and expand dims
    tensor = ms.Tensor(indexed, ms.int32)
    tensor = tensor.expand_dims(0)
    
    # Predict sentiment
    prediction = model(tensor)
    
    # Convert prediction to sentiment
    sentiment = score_map[int(np.round(ops.sigmoid(prediction).asnumpy()))]
    
    return sentiment


In [46]:
predict_sentiment(model, vocab, "This film has got to be the epitome of terrible writing and should be a classroom example of 'what not to do' when writing a screenplay. Why would Joshua take on (clearly) amateur writer Adam Gaines script is beyond me. Even his good directing and excellent cinematography could not save this disaster.")

'Negative'

In [47]:
predict_sentiment(model, vocab, "That's well-paced and not slow and meandering. Good storytelling. Good dialogue that keeps it real.")

'Positive'

In [48]:
predict_sentiment(model, vocab, "This film is terrible")


'Negative'

In [49]:
predict_sentiment(model, vocab, "This film is great")


'Positive'

In [None]:
#thank you!

In [None]:
#thank you!

In [None]:
#thank you!