In [1]:
import string
valid_chars = string.ascii_letters + "ÄÖÜäöüß-" + string.punctuation + \
    string.digits + string.whitespace + '–'

def contains_valid_chars(words):
  for word in words:
    for c in word:
      if c not in valid_chars:
        return False
  return True


def filterData(path):
  
  result = []
  
  
  with open(path, "r", encoding="utf-8") as file1:
      data = file1.readlines()
      
      for line in data:
        words = line.split(';',1)
        if not contains_valid_chars(words):
          continue
        if words[0] == 'Wirtschaft' or words[0] == 'Sport':
          result.append(words)
  return result
filtered_test_data = filterData("data/test.csv")
filtered_train_data = filterData("data/train.csv")

print(len(filtered_test_data))
print(len(filtered_train_data))

244
2154


In [2]:
import spacy
import de_core_news_md

In [3]:
nlp = spacy.load('de_core_news_md')
nlp.max_length = 6000000
dataForTokens = ""


        
def tokenize(data):
    counts = dict()

    for row in data:
        doc = nlp(row[1]) 
        for token in doc:
            counts[token.text] = counts.get(token.text, 0) + 1
    newCounts = dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))
    return list(newCounts.keys())[:5000]

tokenizedTrainData = tokenize(filtered_train_data)

In [4]:
word_to_idx = {}
for tag in tokenizedTrainData:
    if tag not in word_to_idx:
        word_to_idx[tag] = len(word_to_idx)   

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [6]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 256

In [7]:
tag_to_idx = {"Wirtschaft":0, "Sport":1}

In [8]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(tag_to_idx))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [9]:
def prepare_sequence(seq, to_ix):
    tokens = nlp(seq)
    idxs = []
    for token in tokens:
        if token.text in to_ix:
            idxs.append(to_ix[token.text])
   
    return torch.tensor(idxs, dtype=torch.long)

def calcAccuracy():
    accuracy = 0
    with torch.no_grad():
        for tag, sentence in filtered_test_data:
            sentence_in = prepare_sequence(sentence, word_to_idx)
            tag_scores = model(sentence_in)
            idx = torch.argmax(tag_scores[-1]).item()
            if list(tag_to_idx.keys())[idx] == tag:
                accuracy+=1
    print("test accuracy: " + str(accuracy/len(filtered_test_data)))

In [10]:
calcAccuracy()

test accuracy: 0.46311475409836067


In [11]:
import time

for epoch in range(5):
    accuracy = 0
    start_time = time.time()
    i = 0
    meanLoss = 0
    for tag, sentence in filtered_train_data:
        model.zero_grad()
        sentence_in = prepare_sequence(sentence, word_to_idx)
        tag_scores = model(sentence_in)
        idx = torch.argmax(tag_scores[-1]).item()
        target = tag_to_idx[tag]
        if target == idx:
            accuracy+=1
        score = tag_scores[-1].view(1,2)
        loss = loss_function(score,torch.tensor([target], dtype = torch.long))
        meanLoss += loss
        loss.backward()
        optimizer.step()
        sys.stdout.write("\r%d/%d" %(i, len(filtered_train_data)))
        sys.stdout.flush()
        i+=1
    print()
    print("epoch: " + str(epoch))
    print("duration: " + str(time.time() - start_time))
    print("train accuracy: " + str(accuracy/len(filtered_train_data)))
    print("train loss: "+ str(meanLoss/len(filtered_train_data)))
    calcAccuracy()     

2153/2154
epoch: 0
duration: 911.1764719486237
train accuracy: 0.6736304549675023
train loss: tensor(0.5962, grad_fn=<DivBackward0>)
test accuracy: 0.6967213114754098
2153/2154
epoch: 1
duration: 813.5034081935883
train accuracy: 0.7581244196843082
train loss: tensor(0.4770, grad_fn=<DivBackward0>)
test accuracy: 0.7172131147540983
2153/2154
epoch: 2
duration: 826.5074400901794
train accuracy: 0.8064066852367688
train loss: tensor(0.4168, grad_fn=<DivBackward0>)
test accuracy: 0.7213114754098361
2153/2154
epoch: 3
duration: 760.9545559883118
train accuracy: 0.8495821727019499
train loss: tensor(0.3403, grad_fn=<DivBackward0>)
test accuracy: 0.7786885245901639
2153/2154
epoch: 4
duration: 657.5441792011261
train accuracy: 0.8593314763231198
train loss: tensor(0.3232, grad_fn=<DivBackward0>)
test accuracy: 0.7377049180327869


5000