In [1]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove.6B.zip
# !ls -lat

In [1]:
import json
import pickle
import re
import pandas as pd
import numpy as np
import torch
from torch import nn
from tqdm.notebook import tqdm, trange
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from deep_translator import GoogleTranslator

import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
# nltk.download('wordnet')

from transformers import BertForSequenceClassification
from transformers import BertTokenizer, BertModel



In [2]:
if torch.cuda.is_available():
    print('CUDA is available. Training on GPU ...')
    device = torch.device("cuda") 
# elif torch.has_mps:
#     print('Apple ARM is available. Training on ARM')
#     device = torch.device("mps") 
else: 
    print('CUDA is not available. Training on CPU ...')
    device = torch.device("cpu") 

CUDA is not available. Training on CPU ...


In [3]:
count = 0
with open('embeddings/glove.6B.100d.txt','r') as f:
    for line in f:
        count +=1
count

400000

In [4]:
class Preparation():
    
    """
    Preparation dataset of news
    
    """
    
    def __init__(self, data: pd.DataFrame, target: pd.Series, mode: str, bert=False):
        
        pd.options.mode.chained_assignment = None
        data = data.reset_index(drop=True)
        target = target.reset_index(drop=True)
        
        data['target'] = target 
        self.max_length = 256
        self.data = data
        self.mode = mode
        self.bert = bert
        self.target = target
        self.len_ = len(data)
        
        DATA_MODES = ['train', 'test']
        if self.mode not in DATA_MODES:
            print(f"{self.mode} is not correct; correct modes: {DATA_MODES}")
            raise NameError

        self.stop_words = stopwords.words('english')
        self.lemmatizer = WordNetLemmatizer()
        
        if not(self.bert):
            self.glove_model = {}
            with open('embeddings/glove.6B.100d.txt','r') as f:
                for line in f:
                    split_line = line.split()
                    word = split_line[0]
                    embedding = np.array(split_line[1:], dtype=np.float64)
                    self.glove_model[word] = embedding
                
    def filter_(self, data: pd.DataFrame) -> np.ndarray:
        
        """
        Main data preparation, create from str dataframe numpy ndarray
        
        """
        
        data = data.to_numpy()
        print('Replacing data with embeddings...')
        
        if self.bert:
            tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
            data_null = np.zeros((data.shape[0], self.max_length))
            data_attention = np.zeros((data.shape[0], self.max_length))
        else:
            data_null = np.zeros((data.shape[0], self.max_length, 100))
        
        for row_index in trange(data.shape[0]):
            
            news_text = data[row_index][1] + data[row_index][2]
            news_text = self.str_transforms_(news_text)
            news_text = news_text[:self.max_length]
            
            if self.bert:
                news_text = ' '.join(news_text)
                bert_input = tokenizer(news_text, padding='max_length', max_length = self.max_length, 
                                truncation=True, return_tensors="pt")
                
                data_null[row_index] = bert_input['input_ids']
                data_attention[row_index] = bert_input['attention_mask']
                    
            else:
                if len(news_text) < self.max_length:
                    to_add = self.max_length - len(news_text)
                    padding = ['@None'] * to_add
                    news_text = news_text + padding
                
                for word_index in range(len(news_text)):
                    try: word_embedding = self.glove_model[news_text[word_index]]
                    except: word_embedding = np.zeros(100)
                    data_null[row_index][word_index] = word_embedding
                    
        if not(self.bert):
            data_null = np.swapaxes(data_null, 1, 2)
            return data_null, 0
        
        return data_null, data_attention
        
    def str_transforms_(self, news_text: str) -> list:
        
        """
        Cleaning str from .,!, stopwords and lemmatize it
        
        """
        
        news_text = news_text.lower()
        list_of_char = ['.', ',', '!', '?', '&']
        pattern = '[' + ''.join(list_of_char) + ']'
        news_text = re.sub(pattern, '', news_text)
        news_text = ' '.join([i for i in news_text.split( ) if i not in self.stop_words])
        word_list = nltk.word_tokenize(news_text)
        news_text = ' '.join([self.lemmatizer.lemmatize(news_text) for w in word_list])
        news_text = news_text.split()
        
        return news_text
        
    def create_vocab_(self, data: np.ndarray):
        
        """
        Create vocab of ints for embedding in bert
        
        """
        
        vocab = {}
        all_text_str = ''
        for row_index in range(data.shape[0]):
                news_text = data[row_index][1] + data[row_index][2]
                news_text = self.str_transforms_(news_text)
                news_text = ' '.join(news_text)
                all_text_str = all_text_str + ' ' + news_text
        
        index = 0
        for element in list(set(all_text_str.split())):
            vocab[element] = index
            index += 1
        vocab['@None'] = index
        
        return vocab
        
    def data_augmentation_(self, data):
        
        """
        Make data augmentation with synonyms replacement and translation to German and back
        
        """
        
        labels = self.target 
        count_to_add = labels.count() - labels.sum()
        
        data_to_augmentate = data.iloc[labels[labels == 1].index]
        
        try: data_to_augmentate = pd.concat([data_to_augmentate] * (count_to_add // labels.sum()), ignore_index=True)
        except: return data, data['target']
    
        aug = naw.SynonymAug(aug_src='wordnet')
        random_indexes = data_to_augmentate.sample(frac = 0.8).index
        
        print('Augmentate data with synonyms replace...')
        for index in tqdm(random_indexes):
            data_to_augmentate.iloc[index]['title']   = aug.augment(str(data_to_augmentate.iloc[random_indexes]['title']))
            data_to_augmentate.iloc[index]['summary'] = aug.augment(str(data_to_augmentate.iloc[random_indexes]['summary']))
        
        random_indexes = data_to_augmentate.sample(frac = 0.4).index
        trasns_ger = GoogleTranslator(source = 'auto', target = 'de')
        trasns_en  = GoogleTranslator(source = 'auto', target = 'en')
        
        print('Augmentate data with translation to deutch and back...')
        for index in tqdm(random_indexes):
            data_to_augmentate.iloc[index]['title']   = trasns_ger.translate(str(data_to_augmentate.iloc[random_indexes]['title']))
            data_to_augmentate.iloc[index]['summary'] = trasns_ger.translate(str(data_to_augmentate.iloc[random_indexes]['summary']))
            
            data_to_augmentate.iloc[index]['title']   = trasns_en.translate(str(data_to_augmentate.iloc[random_indexes]['title']))
            data_to_augmentate.iloc[index]['summary'] = trasns_en.translate(str(data_to_augmentate.iloc[random_indexes]['summary']))
            
        data = pd.concat([data, data_to_augmentate])
        print("One augmentated sentence:", data_to_augmentate.tail(1)['summary'])
        
        return data, data['target']
        
    def transform(self, save=False, save_name = "prepeared_data"):
 
        """
        Main transform data function
        
        """
    
        if self.mode == 'test':
            y = self.target
            x, attention = self.filter_(self.data)
        
        else:
            x, y = self.data_augmentation_(self.data)
            x, attention = self.filter_(x)
            
        if save:
            with open('data/' + save_name + '.npy', 'wb') as f:
                np.save(f, x)
                np.save(f, y)  
                np.save(f, attention) 
            
        return x, attention, y

In [5]:
class NewsDataset(Dataset):
    """
    Preparation dataset of news
    """
    def __init__(self, data, attention, target, mode):
        super().__init__()
        
        self.attention = attention
        self.data = data
        self.target = target.to_numpy()
        self.len_ = len(data)
        self.mode = mode

        DATA_MODES = ['train', 'val', 'test']
        if self.mode not in DATA_MODES:
            print(f"{self.mode} is not correct; correct modes: {DATA_MODES}")
            raise NameError
    
    def __len__(self):
        
        return self.len_
    
    def __getitem__(self, index): 
        
        x = self.data
        y = self.target
        
        try:
            attention = self.attention[index]
        except:
            attention = 0
            
        if self.mode == "test":
            return x[index], attention
        else:
            return x[index], attention, y[index]

In [6]:
train_df = pd.read_csv("data/data_augmentated_train.csv", index_col=0)
test_df = pd.read_csv("data/data_test.csv", index_col=0)

In [414]:
X_train = train_df.drop('target', axis=1)
y_train = train_df['target']

X_val, X_test, y_val, y_test = train_test_split(test_df.drop('target', axis=1), test_df['target'], 
                                                          test_size=0.5, random_state=42)

In [425]:
X_train, _,  y_train = Preparation(data=X_train, target=y_train, mode='train').transform()
X_val, _,  y_val     = Preparation(data=X_val, target=y_val, mode='train').transform()
X_test, _,  y_test   = Preparation(data=X_test, target=y_test, mode='test').transform()

Replacing data with embeddings...


  0%|          | 0/5206 [00:00<?, ?it/s]

Augmentate data with synonyms replace...


  0%|          | 0/343 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()


Augmentate data with translation to deutch and back...


  0%|          | 0/172 [00:00<?, ?it/s]

One augmentated sentence: 428    After nearly two years of unrest, the company ...
Name: summary, dtype: object
Replacing data with embeddings...


  0%|          | 0/871 [00:00<?, ?it/s]

Replacing data with embeddings...


  0%|          | 0/442 [00:00<?, ?it/s]

In [426]:
X_train.shape, X_val.shape, X_test.shape

((5206, 100, 256), (871, 100, 256), (442, 100, 256))

In [7]:
def fit_epoch(model, train_loader, criterion, optimizer):
    
    running_loss = 0.0
    processed_data = 0
    
    for inputs, attention, labels in train_loader:
        if device.type == 'mps':
            inputs = inputs.to(dtype=torch.float32).to(device)
            labels = labels.to(dtype=torch.float32).to(device)
        else:
            inputs = inputs.to(device)
            labels = labels.to(device)
            model.double()

        optimizer.zero_grad()
        
        if type(attention) == int:
            outputs = model(inputs)
        else:
            outputs = model(inputs.long(), mask=attention.to(device))
            
        labels = labels.unsqueeze(1).to(torch.float64)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        preds = torch.argmax(outputs, 1)
        running_loss += loss.item() * inputs.size(0)
        processed_data += inputs.size(0)
        print(preds)
        break
    train_loss = running_loss / processed_data
    train_f1 = f1_score(labels.cpu().data, preds.cpu())
    return train_loss, train_f1

def eval_epoch(model, val_loader, creterion, optimizer):
    
    model.eval()
    running_loss = 0.0
    processed_data = 0
    
    for inputs, attention, labels in val_loader:
        if device.type == 'mps':
            inputs = inputs.to(dtype=torch.float32).to(device)
            labels = labels.to(dtype=torch.float32).to(device)
        else:
            inputs = inputs.to(device)
            labels = labels.to(device)
            model.double()
        
        with torch.set_grad_enabled(False):
            labels = labels.unsqueeze(1).to(torch.float64)
            
            if type(attention) == int:
                outputs = model(inputs)
            else:
                outputs = model(inputs.long(), mask=attention.to(device))
            
            loss = criterion(outputs, labels)
            preds = torch.argmax(outputs, 1)
        
        running_loss += loss.item() * inputs.size(0)
        processed_data += inputs.size(0)
              
    val_loss = running_loss / processed_data
    val_f1 = f1_score(labels.cpu().data, preds.cpu())
    return val_loss, val_f1

def train(model, train_dataset, val_dataset, num_epochs, lr, batch_size):
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
    
    logs = []
    log_template = "\nEpoch {ep:03d} train_loss: {t_loss:0.4f} \
    val_loss {v_loss:0.4f} train_f1 {t_f1:0.4f} val_f1 {v_f1:0.4f}"
    
    optimizer = torch.optim.Adam(model.parameters(), lr)
    criterion = nn.BCEWithLogitsLoss()
    
    with tqdm(desc='epoch', total=num_epochs) as pbar_outer:
        
        for epoch in range(num_epochs):
            
            train_loss, train_f1 = fit_epoch(model, train_loader, criterion, optimizer)
            val_loss, val_f1 = fit_epoch(model, val_loader, criterion, optimizer)
            
            logs.append((train_loss, train_f1, val_loss, val_f1))
            
            pbar_outer.update(1)
            tqdm.write(log_template.format(ep=epoch+1, t_loss=train_loss,\
                                           v_loss=val_loss, t_f1=train_f1, v_f1=val_f1))
            
    return logs

In [8]:
class Convolutional(nn.Module):
    
    def __init__(self):
        super().__init__()
        
        self.conv_1 = nn.Conv1d(100, 200, 3, stride=1)
        self.conv_2 = nn.Conv1d(200, 400, 3, stride=1)
        self.conv_3 = nn.Conv1d(400, 800, 3, stride=1)
        
        self.fc1 = nn.Linear(6400, 3200)
        self.fc2 = nn.Linear(3200, 1024)
        self.fc3 = nn.Linear(1024, 64)
        self.fc4 = nn.Linear(64, 1)
        
        self.relu = nn.LeakyReLU()
        self.sigm = nn.Sigmoid()
        self.max_pool = nn.AvgPool1d(3)
        
    def forward(self, x):
        
        x = self.max_pool(self.relu(self.conv_1(x)))
        x = self.max_pool(self.relu(self.conv_2(x)))
        x = self.max_pool(self.relu(self.conv_3(x))).flatten(start_dim=1)

        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.sigm(self.fc4(x))

        return x

In [434]:
model_cnn = Convolutional()
model_cnn = model_cnn.to(device)

model_cnn

Convolutional(
  (conv_1): Conv1d(100, 200, kernel_size=(3,), stride=(1,))
  (conv_2): Conv1d(200, 400, kernel_size=(3,), stride=(1,))
  (conv_3): Conv1d(400, 800, kernel_size=(3,), stride=(1,))
  (fc1): Linear(in_features=6400, out_features=3200, bias=True)
  (fc2): Linear(in_features=3200, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=64, bias=True)
  (fc4): Linear(in_features=64, out_features=1, bias=True)
  (relu): LeakyReLU(negative_slope=0.01)
  (sigm): Sigmoid()
  (max_pool): AvgPool1d(kernel_size=(3,), stride=(3,), padding=(0,))
)

In [436]:
val_dataset = NewsDataset(X_val, 0, y_val, 'val')
train_dataset= NewsDataset(X_train, 0, y_train, 'train')

In [437]:
history_cnn = train(model_cnn, train_dataset, val_dataset, 5, 0.0001, 64)

epoch:   0%|          | 0/5 [00:00<?, ?it/s]


Epoch 001 train_loss: 0.6962     val_loss 0.6931 train_f1 0.0000 val_f1 0.0000

Epoch 002 train_loss: 0.6931     val_loss 0.6931 train_f1 0.0000 val_f1 0.0000

Epoch 003 train_loss: 0.6931     val_loss 0.6931 train_f1 0.0000 val_f1 0.0000

Epoch 004 train_loss: 0.6931     val_loss 0.6931 train_f1 0.0000 val_f1 0.0000

Epoch 005 train_loss: 0.6931     val_loss 0.6931 train_f1 0.0000 val_f1 0.0000


In [438]:
class SentimentRNN(nn.Module):
    def __init__(self):
        super(SentimentRNN,self).__init__()
        
        self.lstm = nn.LSTM(256, 100, 20)
        self.linear = nn.Linear(10000, 1)
        self.sigm = nn.Sigmoid()
        
    def forward(self, x):
        
        x, hidden = self.lstm(x)
        x = torch.flatten(x, start_dim=1)
        
        x = self.linear(x)
        x = self.sigm(x)
    
        return x

In [439]:
model_lstm = SentimentRNN()
model_lstm = model_lstm.to(device)

model_lstm

SentimentRNN(
  (lstm): LSTM(256, 100, num_layers=20)
  (linear): Linear(in_features=10000, out_features=1, bias=True)
  (sigm): Sigmoid()
)

In [440]:
history_lstm = train(model_lstm, train_dataset, val_dataset, 5, 0.0001, 64)

epoch:   0%|          | 0/5 [00:00<?, ?it/s]


Epoch 001 train_loss: 0.7007     val_loss 0.6933 train_f1 0.0000 val_f1 0.0000

Epoch 002 train_loss: 0.6934     val_loss 0.6923 train_f1 0.0000 val_f1 0.0000

Epoch 003 train_loss: 0.6933     val_loss 0.6931 train_f1 0.0000 val_f1 0.0000

Epoch 004 train_loss: 0.6938     val_loss 0.6928 train_f1 0.0000 val_f1 0.0000

Epoch 005 train_loss: 0.6933     val_loss 0.6922 train_f1 0.0000 val_f1 0.0000


In [9]:
X_train_bert = train_df.drop('target', axis=1)
y_train_bert = train_df['target']

X_val_bert, X_test_bert, y_val_bert, y_test_bert = train_test_split(test_df.drop('target', axis=1), test_df['target'], 
                                                                    test_size=0.5, random_state=42)

In [10]:
X_train_bert, X_train_attention_bert, y_train_bert = Preparation(data=X_train_bert, target=y_train_bert, 
                                                    mode='train', bert=True).transform()
X_val_bert, X_val_attention_bert, y_val_bert = Preparation(data=X_val_bert, target=y_val_bert, 
                                                    mode='train', bert=True).transform()
X_test_bert, X_test_attention_bert, y_test_bert = Preparation(data=X_test_bert, target=y_test_bert, 
                                                    mode='test', bert=True).transform()

Replacing data with embeddings...


  0%|          | 0/5206 [00:00<?, ?it/s]

Augmentate data with synonyms replace...


  0%|          | 0/343 [00:00<?, ?it/s]

Augmentate data with translation to deutch and back...


  0%|          | 0/172 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
X_test_bert.shape, X_test_attention_bert.shape

In [None]:
class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.relu = nn.Sigmoid()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer
    
model_bert = BertClassifier()

model_bert.to(device)

In [None]:
train_dataset_bert = NewsDataset(X_train_bert, X_train_attention_bert, y_train_bert, 'train')
val_dataset_bert = NewsDataset(X_val_bert, X_val_attention_bert, y_val_bert, 'val')


In [None]:
history_bert = train(model_bert, train_dataset_bert, val_dataset_bert, 5, 0.0001, 64)