In [210]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [211]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import string
from collections import Counter
from tqdm.notebook import tqdm

In [212]:
class ConfigClassification:
    randon_seed = 42
    root = "../../data/raw/_classification_text/"
    dataset_file = "../../data/raw/_classification_text/IMDB Dataset.csv"
    test_size = 0.3
    batch_size = 64
    cutoff = 3

In [213]:
config = ConfigClassification()

In [214]:
np.random.seed(config.randon_seed)

In [215]:
df = pd.read_csv(f"{config.root}IMDB Dataset.csv")

In [216]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [217]:
df["sentiment"] = df["sentiment"].map({"positive": 1, "negative": 0})

In [40]:
text = df["review"].values

In [41]:
sentiment = df["sentiment"].values

In [42]:
X_train, X_test, y_train, y_test = train_test_split(text, sentiment, test_size=0.3, random_state=config.randon_seed)

In [43]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((35000,), (15000,), (35000,), (15000,))

In [44]:
df["review"][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [45]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text


In [46]:
df["review"] = df["review"].apply(preprocess_text)

In [47]:
df["review"][0]

'one of the other reviewers has mentioned that after watching just oz episode you ll be hooked . they are right , as this is exactly what happened with me . br br the first thing that struck me about oz was its brutality and unflinching scenes of violence , which set in right from the word go . trust me , this is not a show for the faint hearted or timid . this show pulls no punches with regards to drugs , sex or violence . its is hardcore , in the classic use of the word . br br it is called oz as that is the nickname given to the oswald maximum security state penitentary . it focuses mainly on emerald city , an experimental section of the prison where all the cells have glass fronts and face inwards , so privacy is not high on the agenda . em city is home to many . . aryans , muslims , gangstas , latinos , christians , italians , irish and more . . . . so scuffles , death stares , dodgy dealings and shady agreements are never far away . br br i would say the main appeal of the show i

In [100]:
class Vocabulary:
    
    def __init__(self, token2index=None, add_unk=True, unk_token="<UNK>"):
        if token2index is None:
            token2index = {}
        self._token2index = token2index
        self._add_unk = add_unk
        self._unk_token = unk_token
        self._index2token = {index: token for token, index in self._token2index.items()}
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token)
            
    def add_token(self, token):
        if token in self._token2index:
            index = self._token2index[token]
        else:
            index = len(self._token2index)
            self._token2index[token] = index
            self._index2token[index] = token
        return index
        
    def lookup_token(self, token):
        if self._add_unk:
            return self._token2index.get(token, self.unk_index)
        else:
            return self._token2index[token]
        
    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)
    
    def to_serializable(self):
        return {
            'token_to_idx': self._token2index,
            'add_unk': self._add_unk,
            'unk_token': self._unk_token
        }
        
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    
    def __len__(self):
        return len(self._token2index)


class ReviewVectorizer:
    
    def __init__(self, review_vocab, rating_vocab):
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab
    
    def vectorize(self, review):
        one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)
        for token in review.split(" "):
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_dataframe(cls, dataframe, config):
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)
        # Добавить рейтинги
        for rating in sorted(set(dataframe['sentiment'])):
            rating_vocab.add_token(rating)
        # Добавить часто встречающиеся слова, если число вхождений
        # больше указанного
        word_counts = Counter()
        for review in dataframe['review']:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count > config.cutoff:
                review_vocab.add_token(word)
        return cls(review_vocab, rating_vocab)


class ReviewDataset(Dataset):
    
    def __init__(self, config, dataframe, vectorizer):
        self.review_df = dataframe
        self.train_df, self.test_df = train_test_split(dataframe, test_size=config.test_size, random_state=config.randon_seed)
        self.train_size = len(self.train_df)
        self.test_size = len(self.test_df)
        self._vectorizer = vectorizer
        self._lookup_dict = {
            'train': (self.train_df, self.train_size),
            'test': (self.test_df, self.test_size)
        }
        self._target_split = None
        self._target_df = None
        self._target_size = None
        self.set_split('train')
        
    def __len__(self):
        return self._target_size
    
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        return {
            'text': self._vectorizer.vectorize(row['review']),
            'target': self._vectorizer.rating_vocab.lookup_token(row['sentiment'])
        }
    
    def set_split(self, split='train'):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
    
    def get_vectorizer(self):
        return self._vectorizer
    
    def get_num_batches(self, batch_size):
        return len(self) // batch_size
    
    @classmethod
    def load_dataset_and_make_vectorizer(cls, config):
        review_df = pd.read_csv(config.dataset_file)
        return cls(config, review_df, ReviewVectorizer.from_dataframe(review_df, config))

In [128]:
%%time

dataset = ReviewDataset.load_dataset_and_make_vectorizer(config)

CPU times: user 6.77 s, sys: 39.3 ms, total: 6.81 s
Wall time: 6.81 s


In [129]:
len(dataset)

35000

In [130]:
dataset[0]

{'text': array([1., 0., 0., ..., 0., 0., 0.], dtype=float32), 'target': 0}

In [131]:
len(dataset[0]['text'])

91736

In [132]:
# 0 -> 439805
# 1 -> 169903
# 2 -> 116401
# 3 -> 91736
# 4 -> 76808

In [149]:
def generate_batches(dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [196]:
def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [218]:
class ReviewClassifier(nn.Module):
    
    def __init__(self, num_features):
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)
 
    def forward(self, x_in, apply_sigmoid=False):
        # x_in.shape = (batch, num_features)
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out

In [219]:
num_features = 10
x_in = torch.ones(5, num_features)
x_in

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [220]:
out_features = 1
fc1 = nn.Linear(in_features=num_features, out_features=out_features)

In [221]:
fc1(x_in)

tensor([[-0.4967],
        [-0.4967],
        [-0.4967],
        [-0.4967],
        [-0.4967]], grad_fn=<AddmmBackward>)

In [222]:
fc1(x_in).squeeze()

tensor([-0.4967, -0.4967, -0.4967, -0.4967, -0.4967],
       grad_fn=<SqueezeBackward0>)

In [223]:
torch.sigmoid(fc1(x_in).squeeze())

tensor([0.3783, 0.3783, 0.3783, 0.3783, 0.3783], grad_fn=<SigmoidBackward>)

In [252]:
class MultilayerPerceptron(nn.Module):
    
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MultilayerPerceptron, self).__init__()
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
 
    def forward(self, x_in, apply_softmax=False):
        # x_in.shape = (batch, num_features)
        intermediate = self.fc1(x_in)
        intermediate = F.relu(intermediate)
        output = self.fc2(intermediate)
        if apply_softmax:
            output = torch.softmax(output)
        return output

In [253]:
def model_size(model):
    print(model)
    print("All parameters: ", sum(p.numel() for p in model.parameters()))
    print("Trainable parameters: ", sum(p.numel() for p in model.parameters() if p.requires_grad))
    print("Model's state_dict:")
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())

In [258]:
batch_size = 2 # число вводимых за один раз выборок
input_dim = 3
hidden_dim = 100
output_dim = 4

classifier = MultilayerPerceptron(input_dim, hidden_dim, output_dim)

In [259]:
model_size(classifier)

MultilayerPerceptron(
  (fc1): Linear(in_features=3, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
)
All parameters:  804
Trainable parameters:  804
Model's state_dict:
fc1.weight 	 torch.Size([100, 3])
fc1.bias 	 torch.Size([100])
fc2.weight 	 torch.Size([4, 100])
fc2.bias 	 torch.Size([4])


In [260]:
def describe(x):
    print("Type: {}".format(x.type()))
    print("Shape/size: {}".format(x.shape))
    print("Values: \n{}".format(x))
    
x_input = torch.rand(batch_size, input_dim)
describe(x_input)

Type: torch.FloatTensor
Shape/size: torch.Size([2, 3])
Values: 
tensor([[0.3486, 0.3149, 0.6524],
        [0.3223, 0.8980, 0.9949]])


In [261]:
y_output = classifier(x_input, apply_softmax=False)
describe(y_output)

Type: torch.FloatTensor
Shape/size: torch.Size([2, 4])
Values: 
tensor([[-0.2324,  0.1173, -0.1074,  0.3464],
        [-0.3150,  0.1336, -0.1115,  0.3681]], grad_fn=<AddmmBackward>)


In [None]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [175]:
class ConfigClassification:
    randon_seed = 42
    root = "../../data/raw/_classification_text/"
    dataset_file = "../../data/raw/_classification_text/IMDB Dataset.csv"
    test_size = 0.3
    batch_size = 64
    cutoff = 3
    model_state_file='model.pth'
    save_dir='model_storage/'
    vectorizer_file='vectorizer.json'
    batch_size=128
    early_stopping_criteria=5
    learning_rate=0.001
    num_epochs=100
    cuda=True

In [176]:
config = ConfigClassification()

In [177]:
np.random.seed(config.randon_seed)

In [178]:
def make_train_state(config):
    return {
        'epoch_index': 0,
        'train_loss': [],
        'train_acc': [],
        'val_loss': [],
        'val_acc': [],
        'test_loss': -1,
        'test_acc': -1
    }

In [179]:
train_state = make_train_state(config)

In [180]:
if not torch.cuda.is_available():
    config.cuda = False
config.device = torch.device("cuda" if config.cuda else "cpu")

In [181]:
%%time

dataset = ReviewDataset.load_dataset_and_make_vectorizer(config)

CPU times: user 6.69 s, sys: 63.2 ms, total: 6.75 s
Wall time: 6.75 s


In [182]:
vectorizer = dataset.get_vectorizer()

In [183]:
class ReviewClassifier(nn.Module):
    
    def __init__(self, num_features):
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)
 
    def forward(self, x_in, apply_sigmoid=False):
        # x_in.shape = (batch, num_features)
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
        return y_out

In [184]:
classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
classifier = classifier.to(config.device)

In [199]:
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=config.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=5)

In [201]:
train_state = make_train_state(config)

In [205]:
epoch_bar = tqdm(desc='training routine',  total=config.num_epochs, position=0)

dataset.set_split('train')
train_bar = tqdm(desc='split=train', total=dataset.get_num_batches(config.batch_size), position=1, leave=True)

dataset.set_split('test')
val_bar = tqdm(desc='split=test', total=dataset.get_num_batches(config.batch_size), position=1, leave=True)

HBox(children=(FloatProgress(value=0.0, description='training routine', style=ProgressStyle(description_width=…

HBox(children=(FloatProgress(value=0.0, description='split=train', max=273.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='split=test', max=117.0, style=ProgressStyle(description_w…

In [207]:
try:
    for epoch_index in range(config.num_epochs):
        train_state['epoch_index'] = epoch_index
        # Проход в цикле по обучающему набору данных
        # Настройки: создаем генератор пакетов, устанавливаем значения
        # переменных loss и acc равными 0, включаем режим обучения
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, batch_size=config.batch_size, device=config.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # Процедура обучения состоит из пяти шагов:
            # Шаг 1. Обнуляем градиенты
            optimizer.zero_grad()
            # Шаг 2. Вычисляем выходные значения
            y_pred = classifier(x_in=batch_dict['text'].float())
            # Шаг 3. Вычисляем функцию потерь
            loss = loss_func(y_pred, batch_dict['target'].float())
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            # Шаг 4. Получаем градиенты на основе функции потерь
            loss.backward()
            # Шаг 5. Оптимизатор обновляет значения параметров по градиентам
            optimizer.step()
            # -----------------------------------------
            # Вычисляем точность
            acc_batch = compute_accuracy(y_pred, batch_dict['target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # Проход в цикле по проверочному набору данных
        # Настройки: создаем генератор пакетов, устанавливаем значения
        # переменных loss и acc равными 0, включаем режим проверки
        dataset.set_split('test')
        batch_generator = generate_batches(dataset, batch_size=config.batch_size, device=config.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):
            # Шаг 1. Вычисляем выходные значения
            y_pred = classifier(x_in=batch_dict['text'].float())
            # Шаг 2. Вычисляем функцию потерь
            loss = loss_func(y_pred, batch_dict['target'].float())
            loss_batch = loss.item()
            running_loss += (loss_batch - running_loss) / (batch_index + 1)
            # Шаг 3. Вычисляем точность
            acc_batch = compute_accuracy(y_pred, batch_dict['target'])
            running_acc += (acc_batch - running_acc) / (batch_index + 1)

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)
        
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

Exiting loop


In [None]:
def update_train_state(args, model, train_state):
    train_state['epoch_index'] = epoch_index
    # Проход в цикле по обучающему набору данных
    # Настройки: создаем генератор пакетов, устанавливаем значения
    # переменных loss и acc равными 0, включаем режим обучения
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=config.batch_size, device=config.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()

    for batch_index, batch_dict in enumerate(batch_generator):
        # Процедура обучения состоит из пяти шагов:
        # Шаг 1. Обнуляем градиенты
        optimizer.zero_grad()
        # Шаг 2. Вычисляем выходные значения
        y_pred = classifier(x_in=batch_dict['text'].float())
        # Шаг 3. Вычисляем функцию потерь
        loss = loss_func(y_pred, batch_dict['target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # Шаг 4. Получаем градиенты на основе функции потерь
        loss.backward()
        # Шаг 5. Оптимизатор обновляет значения параметров по градиентам
        optimizer.step()
        # -----------------------------------------
        # Вычисляем точность
        acc_batch = compute_accuracy(y_pred, batch_dict['target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
        
    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)