In [1]:
import numpy as np
import os
import torch.nn as nn
import model
import torch.optim as optim
import torch as T
from gensim.models.keyedvectors import KeyedVectors
from keras.preprocessing.text import Tokenizer
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
TRAIN_FILE = 'data/StackOverflow.txt'
EMB_FILE = 'GoogleNews-vectors-negative300.bin'
OUT_FILE = 'data/StackOverflow_gnd.txt'
EPOCHS_NUM = 50

In [3]:
def binarize(target):
    median = np.median(target, axis=1)[:, None]
    binary = np.zeros(shape=np.shape(target))
    binary[target > median] = 1
    return binary

In [4]:
with open(TRAIN_FILE, 'r') as f:
    data = [text.strip() for text in f]

trn_data, test_data = train_test_split(data, test_size = 0.15)
tokenizer = Tokenizer(char_level= False)
tokenizer.fit_on_texts(data)
full_seq = tokenizer.texts_to_sequences(trn_data)
word_index = tokenizer.word_index
max_words = len(word_index)
print (f'Found {max_words} words in the dataset')

Found 11365 words in the dataset


In [5]:
seq_lens = [len(s) for s in full_seq]
max_seq_len = max(seq_lens)

inp_data = pad_sequences(full_seq, maxlen= max_seq_len)

## Preparing Embedding Matrix
w2v = KeyedVectors.load_word2vec_format(EMB_FILE, binary= True)
Embedding_dim = 300
nb_words = min(max_words, len(word_index)) + 1
embedding_matrix = np.zeros((nb_words, Embedding_dim))

In [6]:
for word, i in word_index.items():
    if word in w2v.vocab:
        embedding_matrix[i] = w2v.word_vec(word)
    else:
        #print (word)
        pass
#target using Average Embeddings

y = {}
tfidf = tokenizer.sequences_to_matrix(full_seq, mode = 'tfidf')
denom = 1 + np.sum(tfidf, axis =1)[:, None]
normed_tfidf = tfidf/denom
average_embeddings = np.dot(normed_tfidf, embedding_matrix)
y['ae'] = average_embeddings
print (f"Shape of the Average Embeddings: {y['ae'].shape}")

Shape of the Average Embeddings: (17000, 300)


In [7]:
B = binarize(y["ae"])
target_dim = B.shape[1]

print (f'The Shape of Binarized Average Embeddings is {B.shape}')
print (f'The shape of the train_inp_data is {inp_data.shape}')

The Shape of Binarized Average Embeddings is (17000, 300)
The shape of the train_inp_data is (17000, 34)


In [64]:
from importlib import reload
reload(model)

<module 'model' from '/Users/murugeshvadivel/DEV/text_clustering/model.py'>

In [83]:
train_inp_data = model.TextData(inp_data, B)
train_dataloader = DataLoader(train_inp_data, shuffle = True, batch_size = 100)
MODEL = model.TextCluster(embedding_matrix, target_dim)

crit = nn.MSELoss()
optimizer = optim.Adam(MODEL.parameters(), lr = 1e-3, betas = [0.9, 0.999], eps = 1e-08)

In [85]:
for epoch in range(1, EPOCHS_NUM +  1):
    print (f'EPOCH-{epoch}')
    for batch in train_dataloader:
        txt_inp, bout = batch
        txt_inp = txt_inp.type(T.LongTensor)
        bout = bout.type(T.FloatTensor)
        optimizer.zero_grad()
        pred_bout = MODEL(txt_inp)
        print (pred_bout.shape)
        print (bout.shape)
        loss = crit(pred_bout, bout)
        loss.backward()
        optimizer.step()
    print(f"The loss for Epoch - {epoch} is {loss/ 100}")

EPOCH-1
torch.Size([100, 300])
torch.Size([100, 300])
torch.Size([100, 300])
torch.Size([100, 300])
torch.Size([100, 300])
torch.Size([100, 300])


KeyboardInterrupt: 

In [74]:
from torch.autograd import Variable
output = Variable(T.randn(10, 120).float())

In [79]:
target = Variable(T.FloatTensor(10).uniform_(0, 120).long())

In [82]:
target

tensor([109,  19,   7,  13, 109,  87,  85,  29,  92,  47])