In [None]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m100.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.2


In [None]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd
from torch.optim import Adam
from tqdm import tqdm
import math

In [None]:
# Load the labelled reviews and carve out a reproducible 85% training sample.
df = pd.read_csv('/content/reviews_proc.csv')
df_train = df.sample(frac=0.85, random_state=25)
# Validation split: every row whose index label was not drawn into df_train.
df_val = df[~df.index.isin(df_train.index)]

# The test reviews come from a separate file.
df_test = pd.read_csv('/content/test_reviews_proc.csv')

In [None]:
# Preview the training split; the bare last expression renders the frame.
df_train

Unnamed: 0,review,score
23700,pro jury although lead actress strikingly beau...,8
10863,whats written poster birth given 6 years live ...,3
9226,movie absolutely pathetic pitiful screenplay l...,2
12689,complete contrast opinions review film actuall...,8
23045,fidois odd film many ways good one first thoug...,7
...,...,...
6688,wanted like film yes saw blah blah blah ripoff...,2
6743,time movie myopic desire present particular en...,3
5555,finally got look experimental lynch short wait...,3
7463,absolutely adore toxic avenger series weak off...,2


In [None]:
class Dataset(torch.utils.data.Dataset):
    '''Tokenized text dataset with integer labels.

    Every text in ``df[data_name]`` is tokenized eagerly at construction time
    (padded/truncated to ``max_length`` tokens) and kept in memory. Labels are
    read from ``df[label_name]`` and returned as ``np.int64`` arrays.

    Args:
        df: Frame holding both the text column and the label column.
        label_name: Name of the label column.
        data_name: Name of the text column.
        max_length: Length every sample is padded/truncated to (default 512).
        tokenizer: Optional ready-made tokenizer, i.e. any callable accepting
            the keyword arguments used in ``__init__``. When omitted, the
            ``bert-base-uncased`` tokenizer is loaded, exactly as before.
            Passing one in lets several splits share a single instance.
    '''

    def __init__(self, df, label_name, data_name, max_length=512, tokenizer=None):
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.labels = list(df[label_name])
        # str() coerces non-string cells so the tokenizer always receives text.
        self.data = [
            tokenizer(str(text),
                      padding='max_length', max_length=max_length,
                      truncation=True, return_tensors="pt")
            for text in df[data_name]
        ]

    def __len__(self):
        '''Number of samples.'''
        return len(self.labels)

    def __getitem__(self, idx):
        '''Return ``(encoded_text, label)`` for sample ``idx``.'''
        return self.data[idx], np.array(self.labels[idx], dtype=np.int64)

In [None]:
class BertClassifier(nn.Module):
    '''Pre-trained BERT encoder followed by dropout and a one-unit linear head.

    The head produces a single unbounded value per sample (no activation is
    applied), so the output has shape ``(batch, 1)``.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
    '''

    def __init__(self, dropout=0.5):
        # Zero-argument super(): `super(self.__class__, self)` recurses
        # forever as soon as this class is subclassed.
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's config rather than hard-coding 768.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        # Not applied in forward(); kept so the module's attributes are unchanged.
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        '''Return the head's output for a batch of token ids and attention mask.'''
        # With return_dict=False the encoder returns a tuple; only the second
        # element (the pooled output) is used.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        return self.linear(self.dropout(pooled_output))

In [None]:
def _regression_batch_loss(model, criterion, batch_input, batch_label, device):
    '''Forward one batch through `model` and return ``(loss, n_samples)``.'''
    target = batch_label.to(device).to(torch.float32).unsqueeze(1)
    mask = batch_input['attention_mask']
    input_id = batch_input['input_ids']
    # When each sample is shaped (1, seq_len) the loader stacks them into
    # (batch, 1, seq_len); drop that middle axis so the model gets 2-D inputs.
    if mask.dim() == 3:
        mask = mask.squeeze(1)
    if input_id.dim() == 3:
        input_id = input_id.squeeze(1)
    output = model(input_id.to(device), mask.to(device))
    return criterion(output, target), target.size(0)


def train(model, train_dataloader, val_dataloader, learning_rate, epochs, optimizer_state=None):
    '''Fine-tune `model` as a regressor with MSE loss and Adam.

    After every epoch the model is scored on `val_dataloader`. Whenever the
    mean validation loss improves, the model weights are written to
    ``best.pt`` and the optimizer state to ``opt_state_best.pt`` in the
    current working directory.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning one value per sample, shape ``(batch, 1)``.
        train_dataloader: Yields ``(inputs, labels)`` where ``inputs`` has the
            keys ``'input_ids'`` and ``'attention_mask'``.
        val_dataloader: Same structure, used for validation only.
        learning_rate: Learning rate for Adam. It is enforced even when
            `optimizer_state` is given.
        epochs: Number of passes over `train_dataloader`.
        optimizer_state: Optional path to a saved optimizer ``state_dict``.

    Returns:
        None. Progress is printed; checkpoints are written to disk. The model
        is left in eval mode after the last validation pass.
    '''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.MSELoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if optimizer_state is not None:
        optimizer.load_state_dict(torch.load(optimizer_state, map_location=device))
        # load_state_dict() also restores the saved param-group settings,
        # including the old learning rate; re-apply the requested one.
        for group in optimizer.param_groups:
            group['lr'] = learning_rate

    best_score = math.inf

    for epoch_num in range(epochs):
        # --- training pass (dropout enabled) ---
        model.train()
        total_loss_train, seen_train = 0.0, 0
        for train_input, train_label in tqdm(train_dataloader):
            batch_loss, n_samples = _regression_batch_loss(
                model, criterion, train_input, train_label, device)
            # MSELoss averages over the batch; weight by batch size so the
            # epoch figure is a true per-sample mean.
            total_loss_train += batch_loss.item() * n_samples
            seen_train += n_samples

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # --- validation pass (dropout disabled, no gradients) ---
        model.eval()
        total_loss_val, seen_val = 0.0, 0
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                batch_loss, n_samples = _regression_batch_loss(
                    model, criterion, val_input, val_label, device)
                total_loss_val += batch_loss.item() * n_samples
                seen_val += n_samples

        train_loss = total_loss_train / max(seen_train, 1)
        val_loss = total_loss_val / max(seen_val, 1)

        if val_loss < best_score:
            best_score = val_loss
            print(f'saving {best_score}')
            torch.save(model.state_dict(), 'best.pt')
            torch.save(optimizer.state_dict(), 'opt_state_best.pt')

        # Accuracy is not defined for this regression objective, so only the
        # losses are reported.
        print(f'Epochs: {epoch_num + 1} | Train Loss: {train_loss: .3f} | Val Loss: {val_loss: .3f}')

In [None]:
# Tokenize both splits up front: labels come from 'score', text from 'review'.
train_ds = Dataset(df_train, label_name='score', data_name='review')
val_ds = Dataset(df_val, label_name='score', data_name='review')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Hyper-parameters for the first training pass.
batch_size = 23  # samples per batch for the train/validation loaders
lr = 1e-5  # learning rate passed to train()
epochs = 1  # number of passes over the training data per train() call

In [None]:
# Only the training loader is shuffled; validation keeps the dataset order.
train_dl = torch.utils.data.DataLoader(
    dataset=train_ds,
    batch_size=batch_size,
    shuffle=True,
)
val_dl = torch.utils.data.DataLoader(
    dataset=val_ds,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Fresh model with the default dropout, then the first fine-tuning pass.
model = BertClassifier()
train(
    model,
    train_dataloader=train_dl,
    val_dataloader=val_dl,
    learning_rate=lr,
    epochs=epochs,
)

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 924/924 [30:54<00:00,  2.01s/it]


saving 0.15429803910255432
Epochs: 1 | Train Loss:  0.265                 | Train Accuracy:  0.000                 | Val Loss:  0.154                 | Val Accuracy:  0.000


Дообучаем модель с уменьшенным lr

In [None]:
# Smaller learning rate for the follow-up training pass below.
lr = 1e-7

In [None]:
# Resume from the files train() itself writes ('best.pt' and
# 'opt_state_best.pt' in the working directory). The previous Google Drive
# path is never mounted or written by this notebook, so a fresh
# Restart & Run All could not find it.
model.load_state_dict(torch.load('best.pt'))
train(model, train_dl, val_dl, lr, epochs, optimizer_state='opt_state_best.pt')

100%|██████████| 924/924 [31:01<00:00,  2.01s/it]


saving 0.1513309367497762
Epochs: 1 | Train Loss:  0.066                 | Train Accuracy:  0.000                 | Val Loss:  0.151                 | Val Accuracy:  0.000


In [None]:
def evaluate(model, test_dl):
    '''Run `model` over `test_dl` and collect its predictions.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning one value per sample, shape ``(batch, 1)``.
        test_dl: Iterable of ``(inputs, labels)`` batches where ``inputs`` has
            the keys ``'input_ids'`` and ``'attention_mask'``. The labels are
            ignored.

    Returns:
        A flat list with one prediction per sample, in iteration order.
    '''
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    model.eval()

    ans = []

    with torch.no_grad():
        for test_input, _ in tqdm(test_dl):
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            # Drop only the trailing unit axis. A bare squeeze() would also
            # drop the batch axis of a one-sample batch, turning tolist()
            # into a plain float that extend() cannot iterate.
            preds = output.squeeze(-1) if output.dim() > 1 else output
            ans.extend(preds.tolist())

    return ans

In [None]:
# Tokenize the test reviews with the same label/text columns as the training data.
test_ds = Dataset(df_test, 'score', 'review')

In [None]:
# No shuffle argument is passed, so batches follow the order of test_ds
# (assumes the DataLoader default of no shuffling).
test_dl = torch.utils.data.DataLoader(test_ds, batch_size=200)

In [None]:
# Collect the model's prediction for every test review as a flat list.
ans = evaluate(model, test_dl)

100%|██████████| 125/125 [14:05<00:00,  6.76s/it]


In [None]:
# Ground-truth scores, in df_test row order, to compare against `ans`.
targets = df_test['score'].tolist()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Regression quality on the test set: MSE first, then MAE.
for metric_fn in (mean_squared_error, mean_absolute_error):
    print(metric_fn(targets, ans))

3.3519418447370133
1.2760552244114876


In [None]:
# Binarise the scores: class 1 when the score is at least 7, otherwise class 0.
# Predictions are rounded to the nearest integer before the comparison.
ans_class = [1 if round(pred) >= 7 else 0 for pred in ans]
# Targets are indexed over len(ans), mirroring the pairing with predictions.
targets_class = [1 if targets[pos] >= 7 else 0 for pos in range(len(ans))]

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Binary classification quality: precision, recall, then F1.
for score_fn in (precision_score, recall_score, f1_score):
    print(score_fn(targets_class, ans_class))

0.9258318516637033
0.8748
0.8995927769322529


0.924643584521385
0.87168
0.897380991599407

0.8245144005358339
0.8453840871186108
0.8348188335593877

In [None]:
def predict(model, tokenizer, text):
    '''Print the model's raw output for a single piece of text.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        tokenizer: Callable that encodes the text and returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        text: Input to score; converted with ``str()`` before tokenizing.

    Returns:
        None. The output tensor is printed.
    '''
    # Use the model's own device: no global `device` is defined at notebook
    # level, so relying on one raises NameError on a fresh kernel.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    data = tokenizer(str(text), padding='max_length', max_length=512, truncation=True, return_tensors="pt")

    mask = data['attention_mask'].to(device)
    input_ids = data['input_ids'].squeeze(1).to(device)

    # Inference only: disable dropout and gradient tracking, then restore the
    # mode the caller left the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(input_ids, mask)
    finally:
        model.train(was_training)

    print(output)

In [None]:
# Smoke test on a single review. NOTE(review): the text passed below is raw,
# while the training rows shown earlier look lower-cased and stripped of
# punctuation; confirm whether the same preprocessing should be applied here.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
predict(model, tokenizer, 'David Bryces comments nearby are exceptionally well written and informative as almost say everything I feel about DARLING LILI. This massive musical is so peculiar and over blown, over produced and must have caused ruptures at Paramount in 1970. It cost 22 million dollars! That is simply irresponsible. DARLING LILI must have been greenlit from a board meeting that said "hey we got that Pink Panther guy and that Sound Of Music gal... lets get this too" and handed over a blank cheque. The result is a hybrid of GIGI, ZEPPELIN, HALF A SIXPENCE, some MGM 40s song and dance numbers of a style (daisies and boaters!) so hopelessly old fashioned as to be like musical porridge, and MATA HARI dramatics. The production is colossal, lush, breathtaking to view, but the rest: the ridiculous romance, Julie looking befuddled, Hudson already dead, the mistimed comedy, and the astoundingly boring songs deaden this spectacular film into being irritating. LILI is like a twee 1940s mega musical with some vulgar bits to spice it up. STAR! released the year before sadly crashed and now is being finally appreciated for the excellent film is genuinely is... and Andrews looks sublime, mature, especially in the last half hour......but LILI is POPPINS and DOLLY frilly and I believe really killed off the mega musical binge of the 60s..... and made Andrews look like Poppins again... which I believe was not Edwards intention. Paramount must have collectively fainted when they saw this: and with another $20 million festering in CATCH 22, and $12 million in ON A CLEAR DAY and $25 million in PAINT YOUR WAGON....they had a financial abyss of CLEOPATRA proportions with $77 million tied into 4 films with very uncertain futures. 
Maybe they should have asked seer Daisy Gamble from ON A CLEAR DAY ......LILI was very popular on immediate first release in Australia and ran in 70mm cinemas for months but it failed once out in the subs and the sticks and only ever surfaced after that on one night stands with ON A CLEAR DAY as a Sunday night double. Thank god Paramount had their simple $1million (yes, ONE MILLION DOLLAR) film LOVE STORY and that $4 million dollar gangster pic THE GODFATHER also ready to recover all the $77 million in just the next two years....for just $5m.... incredible!')

tensor([[5.1296]], device='cuda:0', grad_fn=<AddmmBackward0>)
