In [1]:
import ast
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.manifold import TSNE
from torch.utils.data import Dataset


In [2]:
class GloveDataset:
    """Vocabulary and sparse co-occurrence statistics for GloVe training.

    Word ids are assigned by descending frequency (the most common token gets
    id 0). Co-occurrences are accumulated over a symmetric context window and
    each one is weighted by the inverse of the distance between the two tokens.

    Args:
        text: Sequence of tokens making up the corpus.
        window_size: Number of tokens looked at on each side of the centre word.
        device: Device on which the index/value tensors are stored. ``None``
            (the default) selects CUDA when it is available and the CPU otherwise.
    """

    def __init__(self, text, window_size=5, device=None):
        self._window_size = window_size
        self._tokens = text

        # Falling back to the CPU keeps the class usable on machines without a GPU.
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self._device = torch.device(device)

        word_counter = Counter(self._tokens)
        self._word2id = {w: i for i, (w, _) in enumerate(word_counter.most_common())}
        self._id2word = {i: w for w, i in self._word2id.items()}
        self._vocab_len = len(self._word2id)

        self._id_tokens = [self._word2id[w] for w in self._tokens]

        self._create_coocurrence_matrix()

        print("# of words: {}".format(len(self._tokens)))
        print("Vocabulary length: {}".format(self._vocab_len))

    def _create_coocurrence_matrix(self):
        """Build the (i, j, x_ij) triplets of the distance-weighted co-occurrence matrix."""
        cooc_mat = defaultdict(Counter)
        n_tokens = len(self._id_tokens)

        for i, w in enumerate(self._id_tokens):
            start_i = max(i - self._window_size, 0)
            end_i = min(i + self._window_size + 1, n_tokens)

            for j in range(start_i, end_i):
                if i != j:
                    c = self._id_tokens[j]
                    # Closer context words contribute more: weight is 1 / distance.
                    cooc_mat[w][c] += 1 / abs(j - i)

        i_idx = []
        j_idx = []
        xij = []

        # Flatten the nested counters into three parallel lists.
        for w, cnt in cooc_mat.items():
            for c, v in cnt.items():
                i_idx.append(w)
                j_idx.append(c)
                xij.append(v)

        self._i_idx = torch.tensor(i_idx, dtype=torch.long, device=self._device)
        self._j_idx = torch.tensor(j_idx, dtype=torch.long, device=self._device)
        self._xij = torch.tensor(xij, dtype=torch.float32, device=self._device)

    def get_batches(self, batch_size):
        """Yield ``(x_ij, i_idx, j_idx)`` mini-batches in a fresh random order.

        Every co-occurrence entry is visited exactly once per call; the final
        batch may hold fewer than ``batch_size`` entries.
        """
        n_pairs = len(self._xij)
        # Shuffle with NumPy's global RNG, then index on the same device as the data.
        rand_ids = torch.as_tensor(np.random.permutation(n_pairs), dtype=torch.long,
                                   device=self._xij.device)

        for p in range(0, n_pairs, batch_size):
            batch_ids = rand_ids[p:p + batch_size]
            yield self._xij[batch_ids], self._i_idx[batch_ids], self._j_idx[batch_ids]


In [3]:
# Dimensionality of each word-embedding vector; passed to GloveModel as embedding_dim.
EMBED_DIM = 300


class GloveModel(nn.Module):
    """GloVe scoring model with separate centre (``wi``/``bi``) and context (``wj``/``bj``) tables.

    For a pair of word ids the model returns the dot product of the two
    embedding vectors plus one scalar bias per word.
    """

    def __init__(self, num_embeddings, embedding_dim):
        super().__init__()
        self.wi = nn.Embedding(num_embeddings, embedding_dim)
        self.wj = nn.Embedding(num_embeddings, embedding_dim)
        self.bi = nn.Embedding(num_embeddings, 1)
        self.bj = nn.Embedding(num_embeddings, 1)

        # Vectors start uniformly in [-1, 1]; biases start at zero.
        for vectors in (self.wi, self.wj):
            nn.init.uniform_(vectors.weight, -1, 1)
        for bias in (self.bi, self.bj):
            nn.init.zeros_(bias.weight)

    def forward(self, i_indices, j_indices):
        """Return one score per (i, j) index pair."""
        dot = (self.wi(i_indices) * self.wj(j_indices)).sum(dim=1)
        return dot + self.bi(i_indices).squeeze() + self.bj(j_indices).squeeze()

In [4]:
# Flatten the per-row token lists of final2.csv into one corpus-wide token stream.
data = []
with open("final2.csv", "r", encoding="utf-8") as file:
    file.readline()  # discard the first line (presumably the CSV header)
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines, e.g. at the end of the file
        # strip() instead of [:-1]: a final line without a trailing newline
        # would otherwise lose its last real character and fail to parse.
        res = ast.literal_eval(line.split("#")[1].strip())
        data.extend(res)

dataset = GloveDataset(data)
glove = GloveModel(dataset._vocab_len, EMBED_DIM)
# Put the model wherever the dataset tensors live instead of forcing CUDA.
glove.to(dataset._xij.device)

In [5]:
def weight_func(x, x_max, alpha):
    """GloVe weighting function ``min(1, (x / x_max) ** alpha)``, element-wise.

    Args:
        x: Tensor of co-occurrence values.
        x_max: Value at and above which the weight saturates at 1.
        alpha: Exponent applied to the ratio ``x / x_max``.

    Returns:
        Tensor of weights with the same shape and on the same device as ``x``.
    """
    # The result stays on x's device; forcing .cuda() would fail without a GPU.
    return torch.clamp((x / x_max) ** alpha, max=1.0)

def wmse_loss(weights, inputs, targets):
    """Mean of the element-wise squared error scaled by ``weights``.

    Args:
        weights: Per-element weights, broadcastable against the squared error.
        inputs: Model predictions.
        targets: Regression targets with the same shape as ``inputs``.

    Returns:
        Scalar tensor on the same device as the operands.
    """
    loss = weights * F.mse_loss(inputs, targets, reduction='none')
    # No explicit .cuda(): the mean already lives on the operands' device.
    return torch.mean(loss)

In [6]:
optimizer = optim.Adagrad(glove.parameters(), lr=0.05)

N_EPOCHS = 100
BATCH_SIZE = 2048
# X_MAX and ALPHA are the saturation point and exponent handed to weight_func.
X_MAX = 100
ALPHA = 0.75
# Ceiling division: get_batches also yields the final, partially filled batch,
# so flooring would under-count the batches shown in the progress output.
n_batches = (len(dataset._xij) + BATCH_SIZE - 1) // BATCH_SIZE
loss_values = list()

In [7]:
for e in range(1, N_EPOCHS + 1):
    # batch_i is the 1-based position of the batch inside the current epoch.
    for batch_i, (x_ij, i_idx, j_idx) in enumerate(dataset.get_batches(BATCH_SIZE), start=1):
        optimizer.zero_grad()

        # Regress the model score onto log(x_ij), weighting each pair by weight_func.
        outputs = glove(i_idx, j_idx)
        weights_x = weight_func(x_ij, X_MAX, ALPHA)
        loss = wmse_loss(weights_x, outputs, torch.log(x_ij))
        loss.backward()
        optimizer.step()

        loss_values.append(loss.item())

        if batch_i % 100 != 0:
            continue
        # Report the mean of the 20 most recent batch losses every 100 batches.
        recent_loss = np.mean(loss_values[-20:])
        print("Epoch: {}/{} \t Batch: {}/{} \t Loss: {}".format(e, N_EPOCHS, batch_i, n_batches,
                                                                recent_loss))

    # Checkpoint after every epoch, overwriting the previous file.
    print("Saving model...")
    torch.save(glove.state_dict(), "glove.pt")

In [8]:
import torch
class Articles(Dataset):
    """Dataset of fixed-length embedded article texts with one topic label each.

    Args:
        filename: Path of a comma-separated, UTF-8 CSV file with a "text" column
            (the string form of a Python list of tokens) and a "topic" column
            (expected to hold an integer class id).
        classes: Number of topic classes; stored but not used by this class.
        data: Object exposing a ``_word2id`` mapping from token to embedding row.
        emb: 2-D NumPy array of embeddings, indexed by word id along axis 0.
        seq_len: Number of leading tokens embedded per article.
    """

    def __init__(self, filename, classes, data, emb, seq_len=49):
        self.classes = classes
        self.df = pandas.read_csv(filename, delimiter=",", encoding="utf-8")
        self.data = data
        self.emb = emb
        self.seq_len = seq_len

    def __getitem__(self, index):
        """Return ``(text, topic)``: a (seq_len, embedding_dim) tensor and a 1-element LongTensor."""
        tokens = ast.literal_eval(self.df["text"][index])
        # Index token by token so that an article shorter than seq_len still
        # raises IndexError instead of silently producing a shorter tensor.
        word_ids = [self.data._word2id[tokens[i]] for i in range(self.seq_len)]
        text = torch.as_tensor(self.emb[word_ids], dtype=torch.get_default_dtype())
        # Each article carries a single topic, so the label is one class id.
        topic = torch.tensor([int(self.df["topic"][index])], dtype=torch.long)
        return text, topic

    def __len__(self):
        return len(self.df)

def init():
    """Rebuild the vocabulary from final2.csv and load the trained GloVe embeddings.

    Returns:
        tuple: ``(data, emb)`` where ``data`` is the GloveDataset built from the
        corpus and ``emb`` is a NumPy array holding the element-wise sum of the
        ``wi`` and ``wj`` embedding matrices restored from glove.pt.
    """
    tokens = []
    with open("final2.csv", "r", encoding="utf-8") as file:
        file.readline()  # discard the first line (presumably the CSV header)
        for line in file:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of the file
            # strip() instead of [:-1]: a final line without a trailing newline
            # would otherwise lose its last real character and fail to parse.
            res = ast.literal_eval(line.split("#")[1].strip())
            tokens.extend(res)
    data = GloveDataset(tokens)
    EMBED_DIM = 300  # must match the dimension stored in glove.pt
    glove = GloveModel(data._vocab_len, EMBED_DIM)
    # map_location lets a checkpoint saved from CUDA tensors load on a CPU-only host.
    glove.load_state_dict(torch.load("glove.pt", map_location="cpu"))
    emb = (glove.wi.weight + glove.wj.weight).detach().cpu().numpy()
    return data, emb

In [9]:
class TopicClassifier(torch.nn.Module):
    """LSTM classifier mapping a sequence of word embeddings to topic logits.

    Args:
        tokens: Unused; kept so existing call sites keep working.
        classes: Number of output classes.
        embed_dim: Size of each input embedding vector.
        hidden_size: Size of the LSTM hidden state.
    """

    def __init__(self, tokens, classes, embed_dim=300, hidden_size=1024):
        super().__init__()
        self.lstm = torch.nn.LSTM(embed_dim, hidden_size, batch_first=True)
        self.fc = torch.nn.Linear(hidden_size, classes)

    # In the dataset each article belongs to exactly one topic, so the model
    # returns raw logits (no sigmoid) for single-label classification.
    def forward(self, x):
        """Return logits of shape (batch, classes) for ``x`` of shape (batch, seq, embed_dim)."""
        # No initial state is passed, so the LSTM starts from zeros sized to the
        # incoming batch. Keeping the previous call's state on the module would
        # pin the batch size, leak context between unrelated articles and make
        # a second backward() walk an already-freed graph.
        sequence_out, _ = self.lstm(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)