In [15]:
import torch
from torch import nn
import random
import os
import numpy as np
from gensim.models.keyedvectors import KeyedVectors
import texthero as hero
import pandas as pd
# from map_to_id_80 import IDMapping


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


class FeatureExtractor:
    def __init__(self, filepath):
        pass

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        return hero.clean(df)

    def make_feature(self, titles: list, dic: dict) -> torch.tensor:
        X = []
        maxLen = 0
        for i, title in enumerate(titles):
            words = title.split(' ')
            if (len(words) >= maxLen):
                maxLen = len(words)
            l = []
            for word in words:
                if word in dic:
                    l.append(dic[word])
                else:
                    l.append(0)
            X.append(torch.tensor(l, dtype=torch.int))  # n_samples x seq_len x  (10672 x variable_titlelen: max 303)
        print(maxLen)
        print(len(dic))
        X = nn.utils.rnn.pad_sequence(X) #303 x 10672, first row correspond to every first words of the articles 
        print(X)
        print(X[0])
        return X

    def make_feature_pipeline(self, df: pd.DataFrame,dic: dict) -> torch.tensor:
        df['clean_title'] = self.preprocess(df)
        titles = df['clean_title'].tolist()
        return self.make_feature(titles=titles, dic=dic)


class RNN(nn.Module):
    def __init__(self, input_size: int,#300
                 hidden_size: int, #50
                 output_size: int,#4
                 n_vocab: int):#12455
        super().__init__()
        self.embedding = nn.Embedding(n_vocab, input_size)#n_vocab = vocab_size
        self.rnn = nn.RNN(input_size=input_size,#300
                          hidden_size=hidden_size,#50
                          num_layers=1,
                          nonlinearity='tanh',##activation function
                          bias=True,
                          bidirectional=False)
        self.fc = nn.Linear(in_features=hidden_size, #50
                            out_features=output_size, #4
                            bias=True)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, x: torch.tensor, h_0: torch.tensor):
        x = self.embedding(x)  # seq_len x n_samples x n_dim(embed) (303,10672,300)
        # x = x.permute(1, 0, 2)
        x, h_T = self.rnn(x, h_0) #x dim = (303,10672,50)
        x = self.fc(x) #(303,10672,4)
        x = self.softmax(x)
        return x, h_T


def words_to_ids(df: pd.DataFrame):
    dictionary = {}

    df["clean-title"] = hero.clean(df['title'])
    titles = df['clean-title'].tolist()
    for title in titles:
        for word in title.split(' '):
            if word in dictionary:
                dictionary[word] += 1
            else:
                dictionary[word] = 1
    idx = 1
    # print(Counter(dictionary))
    # print((sorted(dictionary, key=dictionary.get, reverse=True)))
    for word in sorted(dictionary, key=dictionary.get, reverse=True):
        if dictionary[word] == 1:
            dictionary[word] = 0
        else:
            dictionary[word] = idx
            idx += 1
    sorted_dict = dict(sorted(dictionary.items(), key=lambda x: x[1]))
    return sorted_dict


seed_everything()
filepath_bin = '../Data/Chapter7/GoogleNews-vectors-negative300.bin'
filedir_text = '../Data/Output/Chapter6/'
train_path = os.path.join(filedir_text, 'Ex50-train.txt')

hidden_size = 50
input_size = 300
output_size = 4 #The number of categories

fe = FeatureExtractor(filepath=filepath_bin)
df_train = pd.read_csv(train_path, sep='\t')

dic = words_to_ids(df_train)
x_train = fe.make_feature_pipeline(df=df_train['title'],
                                    dic=dic) #303 x 10672
#each row of size 10672 represents the i-th index word of all the titles,  and 303 is the length of the longest title
batch_size = x_train.shape[1] ##10672

net = RNN(input_size=input_size,
            hidden_size=hidden_size,
            output_size=output_size,
            n_vocab=len(dic))
output, h_T = net(x_train, h_0=torch.zeros(1, batch_size, hidden_size))
print(output.shape) #(303,10672,4)  each of (10672,4) matrix represents for the i-th character of the articles in all the articles, which goes as a sequence. Output of i-th matrix is fed into the next (i+1)th matrix

print(output[-1, 0, :]) #the output as for category distribution for the first article

  return s.str.replace(pattern, symbols)
  return s.str.replace(pattern, "")
  return s.str.replace(rf"([{string.punctuation}])+", symbol)
  return s.str.replace(pattern, symbols)
  return s.str.replace(pattern, "")
  return s.str.replace(rf"([{string.punctuation}])+", symbol)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['clean_title'] = self.preprocess(df)


303
12455
tensor([[ 132, 1548,    0,  ..., 5332,  395,  188],
        [5400, 1413, 2670,  ...,   22,    5,  860],
        [   9,    0, 1085,  ...,  166,   98, 2034],
        ...,
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0],
        [   0,    0,    0,  ...,    0,    0,    0]], dtype=torch.int32)
tensor([ 132, 1548,    0,  ..., 5332,  395,  188], dtype=torch.int32)
torch.Size([303, 10672, 4])
tensor([0.2189, 0.4139, 0.0998, 0.2673], grad_fn=<SliceBackward0>)
