In [None]:
import re
import nltk

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## 1. Представление и предобработка текстовых данных

1.1 Операции по предобработке:
* токенизация
* стемминг / лемматизация
* удаление стоп-слов
* удаление пунктуации
* приведение к нижнему регистру
* любые другие операции над текстом

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
text = 'Select your preferences and run the install command. Stable represents the most currently tested and supported version of PyTorch. Note that LibTorch is only available for C++'

In [None]:
def preprocess_text(text):
    # Токенизация
    tokens = word_tokenize(text)

    # Удаление пунктуации
    tokens = [token for token in tokens if token not in string.punctuation]

    # Приведение к нижнему регистру
    tokens = [token.lower() for token in tokens]

    # Удаление стоп-слов
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Стемминг
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    return tokens

In [None]:
preprocessed_text = preprocess_text(text)
preprocessed_text

['select',
 'prefer',
 'run',
 'instal',
 'command',
 'stabl',
 'repres',
 'current',
 'test',
 'support',
 'version',
 'pytorch',
 'note',
 'libtorch',
 'avail',
 'c++']

Реализовать функцию `preprocess_text(text: str)`, которая:
* приводит строку к нижнему регистру
* заменяет все символы, кроме a-z, A-Z и знаков .,!? на пробел


In [None]:
import re

def preprocess_tex(text):
    text = text.lower()

    text = re.sub(r'[^a-zA-Z.,!?]', ' ', text)

    return text

In [None]:
preprocessed_tex = preprocess_tex(text)
preprocessed_tex

'select your preferences and run the install command. stable represents the most currently tested and supported version of pytorch. note that libtorch is only available for c  '

1.2 Представление текстовых данных при помощи бинарного кодирования


Представить первое предложение из `text` в виде тензора `sentence_t`: `sentence_t[i] == 1`, если __слово__ с индексом `i` присуствует в предложении.

In [None]:
def create_vocabulary(sentence):
    words = sentence.lower().split()
    vocabulary = list(set(words))
    return vocabulary

In [None]:
def binary_encode(sentence, vocabulary):
    words = sentence.split()
    sentence_t = torch.zeros(len(vocabulary))

    for i, word in enumerate(vocabulary):
        if word in words:
            sentence_t[i] = 1

    return sentence_t

vocabulary = create_vocabulary(text)

sentence_t = binary_encode(text, vocabulary)
print(sentence_t)


tensor([1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1.,
        1., 0., 1., 1., 1., 1., 1.])


## 2. Классификация фамилий по национальности

Датасет: https://disk.yandex.ru/d/owHew8hzPc7X9Q?w=1

2.1 Считать файл `surnames/surnames.csv`.

2.2 Закодировать национальности числами, начиная с 0.

2.3 Разбить датасет на обучающую и тестовую выборку

2.4 Реализовать класс `Vocab` (токен = __символ__)

2.5 Реализовать класс `SurnamesDataset`

2.6. Обучить классификатор.

2.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: прогнать несколько фамилий студентов группы через модели и проверить результат. Для каждой фамилии выводить 3 наиболее вероятных предсказания.

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset
import torch

In [None]:
df = pd.read_csv('surnames.csv')

In [None]:
df.head()

Unnamed: 0,surname,nationality
0,Woodford,English
1,Coté,French
2,Kore,English
3,Koury,Arabic
4,Lebzak,Russian


In [None]:
nationalities = df.nationality.unique()
nationality_to_idx = dict(zip(nationalities, range(len(nationalities))))
idx_to_nationalities = {v: u for u, v in nationality_to_idx.items()}
idx_to_nationalities

{0: 'English',
 1: 'French',
 2: 'Arabic',
 3: 'Russian',
 4: 'Japanese',
 5: 'Chinese',
 6: 'Italian',
 7: 'Czech',
 8: 'Irish',
 9: 'German',
 10: 'Greek',
 11: 'Spanish',
 12: 'Polish',
 13: 'Dutch',
 14: 'Vietnamese',
 15: 'Korean',
 16: 'Portuguese',
 17: 'Scottish'}

In [None]:
surnames_train, surnames_test = train_test_split(df, test_size=0.2)


In [None]:
class Vocab:
    def __init__(self, data: pd.DataFrame):
        data = data[['surname']].drop_duplicates()

        letters = data.surname.str.lower().str.split('').apply(lambda x: x[1:-1])
        letters = set(letters.sum())

        self.idx_to_token = dict(zip(range(1, len(letters) + 1), letters))
        self.idx_to_token[0] = ''

        self.token_to_idx = dict(zip(letters, range(1, len(letters) + 1)))
        self.token_to_idx[''] = 0

        self.vocab_len = len(self.idx_to_token)


In [None]:
vocab = Vocab(df)
vocab.vocab_len, vocab.token_to_idx['z'], vocab.idx_to_token[1]

(56, 46, 's')

In [None]:
MAX_SURNAME_LEN = df.surname.str.len().max()
MAX_SURNAME_LEN

17

In [None]:
class SurnamesDataset(Dataset):
    def __init__(self, x, y, _vocab: Vocab):
        self.x = x
        self.y = y
        self.vocab = _vocab

    def vectorize(self, _surname: str):
        output_vector = torch.zeros(MAX__SURNAME_LEN)
        _surname = _surname.lower()
        for l in range(len(_surname)):
            output_vector[MAX_SURNAME_LEN - len(_surname) + l] = self.vocab.token_to_idx[_surname[l]]

        return torch.FloatTensor(output_vector)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.vectorize(self.x.iloc[idx]).long(), torch.tensor(nationality_to_idx[self.y.iloc[idx]])


In [None]:
dataset_testing = SurnamesDataset(df.surname, df.nationality, Vocab(df))

print(dataset_testing.vectorize('Woodford')[-10:])

print(len(dataset_testing), dataset_testing.vocab.vocab_len)
print(dataset_testing[1][0], dataset_testing[1][1], dataset_testing.x.iloc[1])

tensor([ 0.,  0., 21., 16., 16.,  8., 22., 16., 33.,  8.])
10980 56
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 50, 16, 29, 24]) tensor(1) Coté


In [None]:
vocab_train_dataset = SurnamesDataset(surnames_train.surname, surnames_train.nationality, Vocab(df))
vocab_test_dataset = SurnamesDataset(surnames_test.surname, surnames_test.nationality, Vocab(df))

kernels = [MAX_SURNAME_LEN, 32, 32, 64, 64]
convs = torch.nn.Sequential()
for i in range(len(kernels) - 1):
    convs.add_module(f'conv {i}', torch.nn.Conv1d(kernels[i], kernels[i + 1], 3))
    convs.add_module(f'pooling {i}', torch.nn.MaxPool1d(2))
    convs.add_module(f'func {i}', torch.nn.LeakyReLU(0.1))

model = torch.nn.Sequential(
    torch.nn.Embedding(vocab.vocab_len, 128),
    convs,
    torch.nn.Dropout(0.5),
    torch.nn.Flatten(),
    torch.nn.Linear(384, 512),
    torch.nn.Dropout(0.5),
    torch.nn.LeakyReLU(0.1),
    torch.nn.Linear(512, len(nationality_to_idx))
)

model(next(iter(DataLoader(vocab_train_dataset)))[0]).shape

torch.Size([1, 18])

In [None]:
optimizer = torch.optim.Adam(model.parameters(), weight_decay=0.001)
loss = torch.nn.CrossEntropyLoss()
loss_log = []
loss_log_val = []
min_val_loss = 10 ** 10

for i in range(10):
    epoch_loss = 0
    epoch_loss_val = 0

    j, k = 1, 1

    model.train()
    for j, (batch_x, batch_y) in enumerate(DataLoader(vocab_train_dataset, batch_size=64, shuffle=True), 1):
        y_pred = model(batch_x)
        running_loss = loss(y_pred, batch_y)
        epoch_loss += running_loss.item()

        running_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    model.eval()
    for k, (batch_x, batch_y) in enumerate(DataLoader(vocab_test_dataset, batch_size=64), 1):
        y_pred = model(batch_x)
        running_loss = loss(y_pred, batch_y)
        epoch_loss_val += running_loss.item()

    epoch_loss /= j
    epoch_loss_val /= k

    if i % 5 == 0:
        print(f'EPOCH: {i + 1:3d} \t LOSS: {epoch_loss:0.4f} \t VAL LOSS: {epoch_loss_val:0.4f}')

    loss_log.append(epoch_loss)
    loss_log_val.append(epoch_loss_val)


EPOCH:   1 	 LOSS: 0.8534 	 VAL LOSS: 0.9299
EPOCH:   6 	 LOSS: 0.7461 	 VAL LOSS: 0.9129


In [None]:
model.eval()
right_answers = 0
answers = []
for batch_x, batch_y in DataLoader(vocab_train_dataset, batch_size=32):
    predictions = model(batch_x).argmax(dim=1)
    answers.extend(map(lambda x: idx_to_nationalities[int(x)], predictions))
    right_answers += (torch.eq(batch_y, predictions)).sum()

print(f'TRAIN ACCURACY: {right_answers / len(vocab_train_dataset):0.4f}')



TRAIN ACCURACY: 0.8204


In [None]:
right_answers = 0
answers = []
for batch_x, batch_y in DataLoader(vocab_test_dataset, batch_size=32):
    predictions = model(batch_x).argmax(dim=1)
    answers.extend(map(lambda x: idx_to_nationalities[int(x)], predictions))
    right_answers += (torch.eq(batch_y, predictions)).sum()

print(f'TEST ACCURACY: {right_answers / len(vocab_test_dataset):0.4f}')

TEST ACCURACY: 0.7281


In [None]:
nationalities_tests = [idx_to_nationalities[int(dataset_testing[i][1])] for i in range(len(dataset_testing))]

In [None]:
def tokenize_surname(_surname: str) -> torch.LongTensor:
    dataset = SurnamesDataset(None, None, vocab)
    return dataset.vectorize(_surname).reshape(1, MAX_SURNAME_LEN).long()

In [None]:
our_surnames = {
    'Arakelyan': 'Russian',
    'Plastinina': 'Russian',
    'Barbarich': 'Russian',
    'Akhmad': 'Arabic',
    'Tsoi': 'Korean',
    'Frolov': 'Russian',
    'Geraskina': 'Russian',
    'Phan': 'Vietnamese'
}

for surname, nationality in our_surnames.items():
    model_nat = idx_to_nationalities[int(model(tokenize_surname(surname)).argmax(dim=1))]
    if model_nat == nationality:
        flag = '+'
    else:
        flag = '-'

    print(f'{flag} {surname.capitalize():>10} \t REAL: {nationality:>10} \t MODEL: {model_nat}')

-  Barbarich 	 REAL:    Russian 	 MODEL: English
-     Akhmad 	 REAL:     Arabic 	 MODEL: English
-       Tsoi 	 REAL:     Korean 	 MODEL: Chinese
+  Arakelyan 	 REAL:    Russian 	 MODEL: Russian
- Plastinina 	 REAL:    Russian 	 MODEL: Italian
+     Frolov 	 REAL:    Russian 	 MODEL: Russian
-  Geraskina 	 REAL:    Russian 	 MODEL: Italian
-       Phan 	 REAL: Vietnamese 	 MODEL: Chinese


## 3. Классификация обзоров ресторанов

Датасет: https://disk.yandex.ru/d/nY1o70JtAuYa8g

3.1 Считать файл `yelp/raw_train.csv`. Оставить от исходного датасета 10% строчек.

3.2 Воспользоваться функцией `preprocess_text` из 1.1 для обработки текста отзыва. Закодировать рейтинг числами, начиная с 0.

3.3 Разбить датасет на обучающую и тестовую выборку

3.4 Реализовать класс `Vocab` (токен = слово)

3.5 Реализовать класс `ReviewDataset`

3.6 Обучить классификатор

3.7 Измерить точность на тестовой выборке. Проверить работоспособность модели: придумать небольшой отзыв, прогнать его через модель и вывести номер предсказанного класса (сделать это для явно позитивного и явно негативного отзыва)


In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
# 3.1

reviews_tr = pd.read_csv('raw_train.csv', names=['rating', 'review']).sample(frac=0.1, random_state=21)
reviews_tr.head()

Unnamed: 0,rating,review
402762,2,Stopped into eat while attending the 2011 iron...
68324,1,"Worst.\n\nTwice we called for a Yellow Cab, on..."
498872,1,Raw wings. Yummmm. When we reported it to the ...
528042,2,We were provided samples of several flavors of...
302874,1,This doesn't even warrant the time to write a ...


In [None]:
reviews_tr.rating.value_counts()

2    28119
1    27881
Name: rating, dtype: int64

In [None]:
# 3.2


reviews_tr = reviews_tr.reset_index(drop=True)
reviews_tr.rating += 1
reviews_tr.review = reviews_tr.review.apply(preprocess_tex)
reviews_tr.head()

Unnamed: 0,rating,review
0,1,stopped into eat while attending the iron...
1,0,"worst. n ntwice we called for a yellow cab, on..."
2,0,raw wings. yummmm. when we reported it to the ...
3,1,we were provided samples of several flavors of...
4,0,this doesn t even warrant the time to write a ...


In [None]:
# 3.3

rev_train, rev_valid = train_test_split(reviews_tr, test_size=0.2)

In [None]:
pip install pandas nltk tqdm




In [None]:

import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import pandas as pd
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
wnl = WordNetLemmatizer()

MAX_REVIEW_LEN = 64


class Vocab:
    def __init__(self, data: pd.Series):
        _word_bag = set()
        for sent in tqdm(data.review):
            sent = set(map(wnl.lemmatize, word_tokenize(sent)))
            _word_bag |= sent

        self.idx_to_token = dict(zip(range(1, len(_word_bag) + 1), _word_bag))

        self.token_to_idx = {v: u for u, v in self.idx_to_token.items()}
        self.vocab_len = len(self.idx_to_token)


vocab_review = Vocab(reviews_tr)


100%|██████████| 56000/56000 [01:57<00:00, 478.56it/s]


In [None]:
vocab_review.vocab_len, vocab_review.idx_to_token[1]

(66260, 'variant')

In [None]:
import numpy as np
class ReviewDataset(Dataset):
    def __init__(self, x: pd.Series, y: pd.Series, _vocab: Vocab):
        self.x: np.ndarray = x.values
        self.y: np.ndarray = y.values
        self.vocab = _vocab

    def vectorize(self, review: str):
        review = review.lower()
        output_tensor = torch.zeros(MAX_REVIEW_LEN, dtype=torch.long)

        for l, word in enumerate(word_tokenize(review)[:MAX_REVIEW_LEN], 1):

            word = wnl.lemmatize(word)
            if word in self.vocab.token_to_idx.keys():
                output_tensor[MAX_REVIEW_LEN - l] = self.vocab.token_to_idx[word]

        return output_tensor

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        x = self.vectorize(self.x[idx])
        return x, torch.tensor(self.y[idx]).long()

In [None]:
test = ReviewDataset(rev_valid.review, rev_valid.rating, vocab_review)
test[1]

(tensor([45037,   868, 52638,  8223,  1785, 27773, 17383, 58380, 18267, 53660,
         45037, 58380, 23621,  3351, 35248, 63697,  8654, 45037,  4451, 37868,
         27866,  1785, 64913, 14047,  8219, 31876, 58963, 54065, 16696, 64276,
         55351, 52638, 53736, 35248, 29703, 52902, 32011,  4451, 37868, 64801,
         55351, 16696, 35993, 38132, 47197, 64500, 32011, 50584, 11792, 32011,
         29278, 25143,  1108, 16988, 54065, 23621, 44704, 52153, 32008, 46469,
         44468, 27203, 64602, 16988]),
 tensor(0))

In [None]:
kernels = [MAX_REVIEW_LEN, 64, 128, 256]
convs = torch.nn.Sequential()
for i in range(len(kernels) - 1):
    convs.add_module(f'conv {i}', torch.nn.Conv1d(kernels[i], kernels[i + 1], 3))
    convs.add_module(f'pooling {i}', torch.nn.MaxPool1d(3))
    convs.add_module(f'func {i}', torch.nn.LeakyReLU(0.1))
    convs.add_module(f'dropout {i}', torch.nn.Dropout(0.1))

model = torch.nn.Sequential(
    torch.nn.Embedding(vocab_review.vocab_len + 1, 128),
    convs,
    torch.nn.Flatten(),
    torch.nn.Linear(768, 1024),
    torch.nn.ReLU(),
    torch.nn.Linear(1024, 2)
)

model(next(iter(DataLoader(test, batch_size=1)))[0])

tensor([[-0.0268, -0.0609]], grad_fn=<AddmmBackward0>)

In [None]:
review_train_ds = ReviewDataset(rev_train.review, rev_train.rating, vocab_review)
review_valid_ds = ReviewDataset(rev_valid.review, rev_valid.rating, vocab_review)

In [None]:
optimizer = torch.optim.Adam(model.parameters(), weight_decay=0.001)
loss = torch.nn.CrossEntropyLoss()
loss_log = []
loss_log_val = []
min_val_loss = 10 ** 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for i in range(15):
    epoch_loss = 0
    epoch_loss_val = 0
    j, k = 1, 1

    model.train()
    for j, (batch_x, batch_y) in enumerate(DataLoader(review_train_ds, batch_size=64, shuffle=True), 1):
        y_pred = model(batch_x)

        running_loss = loss(y_pred, batch_y)
        epoch_loss += running_loss.item()

        running_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    model.eval()
    for k, (batch_x, batch_y) in enumerate(DataLoader(review_valid_ds, batch_size=64, shuffle=True), 1):
        y_pred = model(batch_x)
        running_loss = loss(y_pred, batch_y)
        epoch_loss_val += running_loss.item()

    epoch_loss /= j
    epoch_loss_val /= k

    if epoch_loss_val < min_val_loss:
        torch.save(model.state_dict(), 'surname_model.pt')

    if i % 1 == 0:
        print(f'EPOCH: {i + 1:3d} \t LOSS: {epoch_loss:0.4f} \t VAL LOSS: {epoch_loss_val:0.4f}')

    loss_log.append(epoch_loss)
    loss_log_val.append(epoch_loss_val)

model.eval()
model.load_state_dict(torch.load('surname_model.pt'))


EPOCH:   1 	 LOSS: 0.6694 	 VAL LOSS: 0.6251
EPOCH:   2 	 LOSS: 0.5437 	 VAL LOSS: 0.4760
EPOCH:   3 	 LOSS: 0.4280 	 VAL LOSS: 0.4136
EPOCH:   4 	 LOSS: 0.3733 	 VAL LOSS: 0.3896
EPOCH:   5 	 LOSS: 0.3442 	 VAL LOSS: 0.3689


KeyboardInterrupt: ignored

In [None]:
test_reviews = pd.read_csv('raw_test.csv', names=['rating', 'review'])

In [None]:
test_reviews.rating -= 1
test_reviews.review = test_reviews.review.apply(preprocess_tex)
test_reviews.head()

Unnamed: 0,rating,review
0,0,ordered a large mango pineapple smoothie. stay...
1,1,quite a surprise! n nmy wife and i loved thi...
2,0,"first i will say, this is a nice atmosphere an..."
3,1,i was overall pretty impressed by this hotel. ...
4,0,video link at bottom review. worst service i h...


In [None]:
test_ds = ReviewDataset(test_reviews.review, test_reviews.rating, vocab_review)

In [None]:
right_answers = 0
review_answers = []
model.eval()

for batch_x, batch_y in tqdm(DataLoader(test_ds, batch_size=128)):
    predictions = model(batch_x).argmax(dim=1)
    right_answers += (torch.eq(batch_y, predictions)).sum()
    review_answers.extend(predictions)

print(f'TEST ACCURACY: {right_answers / len(test_ds):0.4f}')

100%|██████████| 297/297 [05:11<00:00,  1.05s/it]

TEST ACCURACY: 0.8222





In [None]:
def tokenize_review(review: str) -> torch.LongTensor:
    review = preprocess_tex(review)
    output_tensor = torch.zeros(1, MAX_REVIEW_LEN, dtype=torch.long)
    for k, word in enumerate(word_tokenize(review)):
        word = wnl.lemmatize(word)
        if word in vocab_review.token_to_idx.keys():
            output_tensor[0, k] = vocab_review.token_to_idx[word]

    return output_tensor

In [None]:
my_bad_comment = 'bad service, no comments'

model(tokenize_review(my_bad_comment)).argmax(dim=1)


tensor([0])

In [None]:
my_good_comment = "quite a surprise! n nmy wife and i loved th"

model(tokenize_review(my_good_comment)).argmax(dim=1)

tensor([0])