In [7]:
import torch
from torch import nn
import random
import os
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
import texthero as hero
import pandas as pd
import torch.optim as optim


def calc_acc(tensor_pred, tensor_label: torch.tensor) -> float:
    y_te_pred = torch.argmax(tensor_pred, dim=1)
    y_label = torch.argmax(tensor_label, dim=1)
    acc = (y_te_pred == y_label).sum().item() / y_label.shape[0]

    assert acc >= 0 and acc <= 1
    return acc


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def words_to_ids(df: pd.DataFrame):
    dictionary = {}

    df["clean-title"] = hero.clean(df['title'])
    titles = df['clean-title'].tolist()
    for title in titles:
        for word in title.split(' '):
            if word in dictionary:
                dictionary[word] += 1
            else:
                dictionary[word] = 1
    idx = 1
    # print(Counter(dictionary))
    # print((sorted(dictionary, key=dictionary.get, reverse=True)))
    for word in sorted(dictionary, key=dictionary.get, reverse=True):
        if dictionary[word] == 1:
            dictionary[word] = 0
        else:
            dictionary[word] = idx
            idx += 1
    sorted_dict = dict(sorted(dictionary.items(), key=lambda x: x[1]))
    return sorted_dict


class FeatureExtractor:
    def __init__(self, filepath):
        pass

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        return hero.clean(df)

    def make_feature(self, titles: list, dic: dict) -> torch.tensor:
        X = []
        maxLen = 0
        for i, title in enumerate(titles):
            words = title.split(' ')
            if (len(words) >= maxLen):
                maxLen = len(words)
            l = []
            for word in words:
                if word in dic:
                    l.append(dic[word])
                else:
                    l.append(0)
            # n_samples x seq_len x  (10672 x variable_titlelen: max 303)
            X.append(torch.tensor(l, dtype=torch.int))
        # 303 x 10672, first row correspond to every first words of the articles
        X = nn.utils.rnn.pad_sequence(X)
        print(X)
        print(X[0])
        return X

    def make_feature_pipeline(self, df: pd.DataFrame,
                              dic: dict) -> torch.tensor:
        df['clean_title'] = self.preprocess(df)
        titles = df['clean_title'].tolist()
        return self.make_feature(titles=titles, dic=dic)


class RNN(nn.Module): 
    def __init__(self, input_size: int,  # 300
                 hidden_size: int,  # 50
                 output_size: int,  # 4
                 n_vocab: int):  # 12455
        super().__init__()
        self.embedding = nn.Embedding(
            n_vocab, input_size)  # n_vocab = vocab_size
        self.rnn = nn.RNN(input_size=input_size,  # 300
                          hidden_size=hidden_size,  # 50
                          num_layers= 1,
                          nonlinearity='tanh',  # activation function
                          bias=True,
                          bidirectional=False)
        self.fc = nn.Linear(in_features=hidden_size,  # 50
                            out_features=output_size,  # 4
                            bias=True)
        self.softmax = nn.Softmax(dim=2)
    def forward(self, x: torch.tensor, h_0: torch.tensor):
        x = self.embedding(x)  # seq_len x n_samples x n_dim(embed) (303,10672,300)
        # x = x.permute(1, 0, 2)
        x, h_T = self.rnn(x, h_0) #x dim = (303,10672,50)
        x = self.fc(x) #(303,10672,4)
        x = self.softmax(x)
        return x, h_T    
    
    
def train(config: dict):
    seed_everything()
    filedir_in_6 = '../Data/Output/Chapter6/'
    filedir_in_8 = '../Data/Output/Chapter8/'
    filepath_bin = '../Data/Chapter7/GoogleNews-vectors-negative300.bin'
    train_path = os.path.join(filedir_in_6, 'ex50-train.txt')
    val_path = os.path.join(filedir_in_6, 'ex50-val.txt')

    train_label_path = os.path.join(filedir_in_8, 'ex70-train_label.pt')
    y_tr_label = torch.load(train_label_path)
    y_tr_label = torch.nn.functional.one_hot(y_tr_label.long()).to(torch.float)

    val_label_path = os.path.join(filedir_in_8, 'ex70-val_label.pt')
    y_val_label = torch.load(val_label_path)
    y_val_label = torch.nn.functional.one_hot(y_val_label.long()).to(torch.float)

    hidden_size = 50
    input_size = 300
    output_size = 4

    fe = FeatureExtractor(filepath=filepath_bin)
    df_train = pd.read_csv(train_path, sep='\t')
    dic = words_to_ids(df_train)
    x_train = fe.make_feature_pipeline(df=df_train['title'],
                                       dic=dic)
    df_val = pd.read_csv(val_path, sep='\t')
    x_val = fe.make_feature_pipeline(df=df_val['title'],
                                     dic=dic)

    batch_size = x_train.shape[1]
    batch_size_val = x_val.shape[1]

    net = RNN(input_size=input_size,
              hidden_size=hidden_size,
              output_size=output_size,
              n_vocab=len(dic))

    criterion = nn.BCELoss()
    optimizer = optim.SGD(net.parameters(),
                          lr=0.001,
                          momentum=0.9)

    for epoch in range(config['epoch']):
        optimizer.zero_grad()
            #reset gradient for each batch
        output, h_T = net(x=x_train, h_0=torch.zeros(
            1, batch_size, hidden_size))
        print(output.shape) #(303,10672,300)
        y_pred = output[-1, :, :] #(10672,300)
        #prediction for all articles
        loss = criterion(y_pred, y_tr_label)
        tr_loss = loss.item()
        tr_acc = calc_acc(y_pred, y_tr_label)
        loss.backward()
        optimizer.step()

        #for val data
        output, h_T = net(x=x_val, h_0=torch.zeros(
            1, batch_size_val, hidden_size))
        y_pred = output[-1, :, :]
        loss = criterion(y_pred, y_val_label)
        val_loss = loss.item()
        val_acc = calc_acc(y_pred, y_val_label)

        print('epoch: {}, tr_loss: {:.4f}, tr_acc: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}'.format(
            epoch+1,
            tr_loss,
            tr_acc,
            val_loss,
            val_acc
        )
        )


if __name__ == '__main__':
    config = {
        'epoch': 10,
    }
    train(config=config)

  return s.str.replace(pattern, symbols)
  return s.str.replace(pattern, "")
  return s.str.replace(rf"([{string.punctuation}])+", symbol)
  return s.str.replace(pattern, symbols)
  return s.str.replace(pattern, "")
  return s.str.replace(rf"([{string.punctuation}])+", symbol)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_title'] = self.preprocess(df)
  return s.str.replace(pattern, symbols)
  return s.str.replace(pattern, "")
  return s.str.replace(rf"([{string.punctuation}])+", symbol)


tensor([[ 132, 1548,    0,  ..., 5332,  395,  188],
        [5400, 1413, 2670,  ...,   22,    5,  860],
        [   9,    0, 1085,  ...,  166,   98, 2034],
        ...,
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]], dtype=torch.int32)
tensor([ 132, 1548,    0,  ..., 5332,  395,  188], dtype=torch.int32)
tensor([[ 814,   25,    0,  ...,  113,  279,    0],
        [ 815,   28, 1832,  ...,    1, 1491, 2151],
        [ 973,   25,    0,  ..., 6128,  908, 1157],
        ...,
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]], dtype=torch.int32)
tensor([814,  25,   0,  ..., 113, 279,   0], dtype=torch.int32)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_title'] = self.preprocess(df)


torch.Size([303, 10672, 4])
epoch: 1, tr_loss: 0.5336, tr_acc: 0.3961, val_loss: 0.5339, val_acc: 0.4018
torch.Size([303, 10672, 4])
epoch: 2, tr_loss: 0.5332, tr_acc: 0.3961, val_loss: 0.5331, val_acc: 0.4018
torch.Size([303, 10672, 4])
epoch: 3, tr_loss: 0.5323, tr_acc: 0.3961, val_loss: 0.5320, val_acc: 0.4018
torch.Size([303, 10672, 4])
epoch: 4, tr_loss: 0.5311, tr_acc: 0.3961, val_loss: 0.5306, val_acc: 0.4018
torch.Size([303, 10672, 4])
epoch: 5, tr_loss: 0.5296, tr_acc: 0.3961, val_loss: 0.5290, val_acc: 0.4018
torch.Size([303, 10672, 4])
epoch: 6, tr_loss: 0.5279, tr_acc: 0.3961, val_loss: 0.5272, val_acc: 0.4018
torch.Size([303, 10672, 4])
epoch: 7, tr_loss: 0.5259, tr_acc: 0.3961, val_loss: 0.5253, val_acc: 0.4018
torch.Size([303, 10672, 4])
epoch: 8, tr_loss: 0.5237, tr_acc: 0.3961, val_loss: 0.5232, val_acc: 0.4018
torch.Size([303, 10672, 4])
epoch: 9, tr_loss: 0.5215, tr_acc: 0.3961, val_loss: 0.5211, val_acc: 0.4018
torch.Size([303, 10672, 4])
epoch: 10, tr_loss: 0.5192,