In [None]:
import json
import torch
import numpy as np
from torch import tensor
from torch import nn
from torch import optim
import re
import random
import torch.nn.utils.prune as prune

######################

In [None]:
# Shared word -> integer-id mapping. load_data() grows it in place; id 0 is
# reserved for words that are not in the vocabulary.
vocab = {"<UNK>": 0}


def load_data(file_name):
    """Read a JSON Lines file and register its words in the global ``vocab``.

    Each non-blank line is parsed as a JSON object that must provide the
    string fields ``sentence1`` and ``sentence2``. Every token of both
    sentences (as produced by ``sen2list``) that is not yet in ``vocab`` is
    assigned the next free integer id.

    Args:
        file_name: path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON object (dict) per non-blank line, in
        file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``sentence1`` or ``sentence2``.
    """
    data = []
    # JSON text is UTF-8; be explicit so the platform default codec is not used.
    with open(file_name, 'r', encoding='utf-8') as file:
        # Iterate lazily instead of materialising every line with readlines().
        for raw_line in file:
            # Blank lines (e.g. a trailing newline) are not records; skip them
            # instead of letting json.loads fail.
            if not raw_line.strip():
                continue
            record = json.loads(raw_line)
            sentence = record['sentence1'] + " " + record['sentence2']
            for word in sen2list(sentence):
                if word not in vocab:
                    # Next free id == current vocabulary size.
                    vocab[word] = len(vocab)
            data.append(record)
    return data

def sen2list(s):
    """Tokenise a sentence into lower-cased words and punctuation marks.

    The text is lower-cased first, then a possessive/contracted ``'s`` at the
    end of a word is dropped (``John's`` -> ``john``). Only a trailing ``'s``
    is removed, so an opening quote before a word starting with "s"
    (``'sure'``) is left intact, and upper-case ``'S`` is handled as well.

    Args:
        s: the sentence to tokenise.

    Returns:
        A list of tokens: runs of word characters/apostrophes, or one of the
        single punctuation characters ``. , ! ? ;``.
    """
    # \b requires the "s" to end the word, so "'sure" is not touched.
    s = re.sub(r"'s\b", "", s.lower())
    return re.findall(r"[\w']+|[.,!?;]", s)

def sen2vec(s):
    """Convert a sentence into a row tensor of vocabulary ids.

    Tokens come from ``sen2list``; a token missing from the global ``vocab``
    is mapped to the id of ``"<UNK>"``.

    Args:
        s: the sentence to encode.

    Returns:
        An int64 tensor of shape ``(1, n_tokens)``. For a sentence with no
        tokens the shape is ``(1, 0)``.
    """
    unk_id = vocab["<UNK>"]
    # dict.get replaces the former bare `except`, which hid every error type.
    ids = [vocab.get(word, unk_id) for word in sen2list(s)]
    # Pin the dtype: an empty list would otherwise produce a float tensor,
    # which an embedding lookup cannot index with.
    return tensor(ids, dtype=torch.long).unsqueeze(0)



############################

def get_pretrained(vocab,filename,dim):
	"""Build an embedding matrix for ``vocab`` from a text embedding file.

	Each usable line of the file holds a word followed by ``dim``
	whitespace-separated numbers. Rows of words that appear in ``vocab`` are
	filled with their vector; rows of words absent from the file stay zero.
	Row 0 is filled with uniform random values in [-1, 1) (in this file's
	``vocab`` id 0 is the ``"<UNK>"`` entry).

	Lines with fewer than ``dim + 1`` fields (blank lines, count headers) are
	skipped. If a line has more than ``dim + 1`` fields, the last ``dim`` are
	the vector and everything before is treated as the (multi-token) word.

	Args:
		vocab: mapping word -> row index.
		filename: path of the embedding text file.
		dim: dimensionality of the vectors in the file.

	Returns:
		A float32 tensor of shape ``(len(vocab), dim)``.

	Raises:
		OSError: if the file cannot be opened.
		ValueError: if the vector of a vocabulary word is not numeric.
	"""
	M = np.zeros((len(vocab),dim))
	M[0] = 2*np.random.rand(dim)-1
	# Explicit UTF-8: embedding files contain non-ASCII tokens and the
	# platform default codec may not decode them.
	with open(filename, encoding="utf-8") as file:
		# Stream the file; embedding files are large and readlines() would
		# hold every line in memory at once.
		for line in file:
			fields = line.split()
			if len(fields) < dim + 1:
				continue
			split_at = len(fields) - dim
			word = " ".join(fields[:split_at])
			if word in vocab:
				# Convert explicitly instead of relying on implicit
				# string -> float casting during assignment.
				M[vocab[word]] = np.asarray(fields[split_at:], dtype=np.float64)
	return torch.from_numpy(M).float()

In [None]:
class NeuralNet(nn.Module):
	"""Sentence-pair classifier: summed embeddings -> LSTM -> MLP -> score.

	Each sentence is reduced to the sum of its word embeddings, passed through
	a shared two-layer LSTM as a length-1 sequence, and the final hidden
	states of both sentences are combined and fed to a small feed-forward
	head that emits one raw (un-squashed) score per output unit.
	"""

	def __init__(self, vocab_size, embedding_dim, lstm_dim, hidden_dim, output_dim): # output = number tags
		"""Create the layers.

		Args:
			vocab_size: currently unused; the embedding table is sized by the
				module-level ``vocab`` passed to ``get_pretrained``.
			embedding_dim: width of the word vectors; also selects the file
				``"glove<embedding_dim>.txt"`` that is read from the working
				directory.
			lstm_dim: hidden size of the LSTM.
			hidden_dim: width of the feed-forward hidden layer.
			output_dim: number of output scores.

		Raises:
			FileNotFoundError: if the embedding file does not exist.
		"""
		super().__init__()

		# Pretrained vectors; from_pretrained is called with its defaults
		# (per the PyTorch docs that means the weights are frozen).
		self.embedding = nn.Embedding.from_pretrained(get_pretrained(vocab,"glove"+str(embedding_dim)+".txt",embedding_dim))

		self.lstm = nn.LSTM(embedding_dim, lstm_dim, num_layers=2, bias=False) # two layers

		# Input is the concatenation of both sentences' LSTM states.
		self.hidden_layer = nn.Linear(2*lstm_dim,hidden_dim)

		self.dropout = nn.Dropout(p=0.3)
		self.output_layer = nn.Linear(hidden_dim, output_dim)

		self.relu = nn.ReLU()
		# Not used in forward(); kept so existing references keep working.
		self.sigmoid = nn.Sigmoid()

	def _encode(self, s):
		"""Encode one sentence of token ids into the LSTM's final hidden states."""
		# Sum over dim 1 (the token axis for sen2vec-style (1, n_tokens) input)
		# -> one bag-of-words vector per row.
		summed = torch.sum(self.embedding(s),dim=1)
		# The summed vector is fed as a single-step sequence; keep only h_n.
		_,(h_n,_) = self.lstm(summed.unsqueeze(0))
		# Drop the size-1 batch axis: one row per LSTM layer.
		return h_n.squeeze(1)

	def forward(self, s1, s2):
		"""Score a pair of sentences.

		Args:
			s1: integer tensor of token ids for the first sentence; the
				callers in this file pass shape ``(1, n_tokens)``.
			s2: same, for the second sentence.

		Returns:
			The raw output of the last linear layer with leading size-1 axes
			squeezed away (a 0-dim tensor when ``output_dim`` is 1). No
			sigmoid is applied here.
		"""
		rep1 = self._encode(s1)
		rep2 = self._encode(s2)

		# Side-by-side per layer, then add the layers' rows together.
		cat_rep = torch.cat((rep1, rep2),1)
		cat_rep = torch.sum(cat_rep,dim=0)

		# Apply the non-linearity: without it the two Linear layers compose
		# into a single affine map and the hidden layer adds no capacity.
		hidden_rep = self.relu(self.hidden_layer(cat_rep))

		drop = self.dropout(hidden_rep)

		output = self.output_layer(drop)

		return output.squeeze(0).squeeze(0)

################

In [None]:
# Read the three dataset splits in a fixed order (train, test, val).
# load_data also registers each split's words in the shared vocab, so the
# order of these calls determines the ids the words receive.
train_data, test_data, val_data = map(
    load_data,
    ('train.jsonl', 'test.jsonl', 'val.jsonl'),
)

# Randomise the order of the training examples in place.
random.shuffle(train_data)

In [None]:
#################

# Instantiate the classifier: 300-d embeddings, LSTM size 50, hidden layer
# of 64 units, and a single output score.
our_wic = NeuralNet(
    vocab_size=len(vocab),
    embedding_dim=300,
    lstm_dim=50,
    hidden_dim=64,
    output_dim=1,
)

# (module, parameter name) pairs handed to the pruning call later on:
# the weight matrices of the two linear layers.
parameters_to_prune = tuple(
    (layer, 'weight')
    for layer in (our_wic.hidden_layer, our_wic.output_layer)
)

In [1]:
##################

# Binary cross-entropy on probabilities; `sig` turns the model's raw score
# into a probability before the loss is computed.
ce = nn.BCELoss()
softmax = nn.Softmax(dim=0)  # not used below; kept so the name stays defined
optimizer = optim.SGD(our_wic.parameters(), lr=0.02)
sig = nn.Sigmoid()


def _accuracy(model, data):
	"""Return the fraction of `data` whose label equals (raw model score >= 0)."""
	correct = 0
	# Evaluation needs no gradients; skip building autograd graphs.
	with torch.no_grad():
		for point in data:
			s1 = sen2vec(point['sentence1'])
			s2 = sen2vec(point['sentence2'])
			output = model(s1,s2)
			# A raw score >= 0 corresponds to sigmoid(score) >= 0.5.
			if bool(output >= 0) == point["label"]:
				correct += 1
	return correct/len(data)


epochs = 1000
for epoch in range(epochs):

	our_wic.train()

	total_loss = 0.0
	for point in train_data:
		optimizer.zero_grad()

		# a) calculate probs / get an output
		s1 = sen2vec(point["sentence1"])
		s2 = sen2vec(point["sentence2"])
		y_raw = our_wic(s1,s2)

		y = tensor(float(point["label"]))
		# b) compute loss
		y_raw = sig(y_raw)
		loss = ce(y_raw,y)
		# Accumulate a plain float: adding the tensor itself would keep every
		# sample's autograd history reachable for the whole epoch.
		total_loss += loss.item()

		# c) get the gradient
		loss.backward()

		# d) update the weights
		optimizer.step()

	# Mean training loss of this epoch.
	print(total_loss/len(train_data))

	# Runs once per epoch.
	# NOTE(review): repeated pruning calls appear to compound (each acting on
	# the weights still unpruned) - confirm that pruning every epoch for all
	# `epochs` iterations is intended.
	prune.global_unstructured(
		parameters_to_prune,
		pruning_method=prune.L1Unstructured,
		amount=0.2,
	)

	our_wic.eval()

	print("epoch:",epoch," train data ",_accuracy(our_wic, train_data))

	print("epoch:",epoch," val data ",_accuracy(our_wic, val_data))

FileNotFoundError: [Errno 2] No such file or directory: 'glove300.txt'