In [74]:
import string
def contains_valid_chars(words):
  for word in words:
    for c in word:
      if c not in valid_chars:
        return False
  return True


def filterData(path):
  
  result = []
  valid_chars = string.ascii_letters + "ÄÖÜäöüß-" + string.punctuation + \
    string.digits + string.whitespace + '–'
  
  with open(path, "r", encoding="utf-8") as file1:
      data = file1.readlines()
      
      for line in data:
        words = line.split(';',1)
        if not contains_valid_chars(words):
          continue;
        if words[0] == 'Wirtschaft' or words[0] == 'Sport':
          result.append(words)
  return result;
filtered_test_data = filterData("data/test.csv")
filtered_train_data = filterData("data/train.csv")

print(len(filtered_test_data))
print(len(filtered_train_data))

244
2154


In [69]:
print(filtered_train_data[0])

['Sport', '21-Jähriger fällt wohl bis Saisonende aus. Wien – Rapid muss wohl bis Saisonende auf Offensivspieler Thomas Murg verzichten. Der im Winter aus Ried gekommene 21-Jährige erlitt beim 0:4-Heimdebakel gegen Admira Wacker Mödling am Samstag einen Teilriss des Innenbandes im linken Knie, wie eine Magnetresonanz-Untersuchung am Donnerstag ergab. Murg erhielt eine Schiene, muss aber nicht operiert werden. Dennoch steht ihm eine mehrwöchige Pause bevor.\n']


In [75]:
import spacy
import de_core_news_md

In [92]:
nlp = spacy.load('de_core_news_md')
nlp.max_length = 6000000
dataForTokens = ""


        
def tokenize(data):
    counts = dict()

    for row in data:
        doc = nlp(row[1]) 
        for token in doc:
            counts[token.text] = counts.get(token.text, 0) + 1
    newCounts = dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))
    return list(newCounts.keys())[:5000]

tokenizedTestData = tokenize(filtered_test_data)
tokenizedTrainData = tokenize(filtered_train_data)

In [93]:
word_to_idx = {}
for tag in tokenizedTrainData:
    if tag not in word_to_idx:
        word_to_idx[tag] = len(word_to_idx)   

In [107]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [108]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 256

In [109]:
tag_to_idx = {"Wirtschaft":0, "Sport":1}

In [110]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_idx), len(tag_to_idx))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [111]:
def prepare_sequence(seq, to_ix):
    filtered_seq = list(filter(lambda x:x in to_ix.keys(), seq))
    idxs = [to_ix[w] for w in filtered_seq]
    return torch.tensor(idxs, dtype=torch.long)

articleListTrain = list(map(lambda x:prepare_sequence(x[1], word_to_idx), filtered_train_data))
tagListTrain = list(map(lambda x:x[0], filtered_train_data))

articleListTest = list(map(lambda x:prepare_sequence(x[1], word_to_idx), filtered_test_data))
tagListTest = list(map(lambda x:x[0], filtered_test_data))


def calcAccuracy(articleList, tagList):
    accuracy = 0
    with torch.no_grad():
        for i in range(len(articleList)):
            tag_scores = model(articleList[i])
            idx = torch.argmax(tag_scores[-1]).item()
            if list(tag_to_idx.keys())[idx] == tagList[i]:
                accuracy+=1
    print("accuracy: " + str(accuracy/len(articleList)))

In [112]:
calcAccuracy(articleListTest, tagListTest)

accuracy: 0.5368852459016393


In [113]:
def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 100, fill = '█', printEnd = "\r"):
    """
    Call in a loop to create terminal progress bar
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character (e.g. "\r", "\r\n") (Str)
    """
    percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd)
    # Print New Line on Complete
    if iteration == total: 
        print()

In [114]:
import time

print(len(articleListTrain))
for epoch in range(5):
    accuracy = 0
    start_time = time.time()
    i = 0
    for tag, sentence in filtered_train_data:
        model.zero_grad()
        sentence_in = prepare_sequence(sentence, word_to_idx)
        #target = torch.tensor([tag_to_idx[tag]], dtype=torch.long)
        tag_scores = model(sentence_in)
        idx = torch.argmax(tag_scores[-1]).item()
        if list(tag_to_idx.keys())[idx] == tag:
            accuracy+=1
        loss = loss_function(tag_scores.T, torch.tensor(list(tag_to_idx.values()), dtype = torch.long))
        loss.backward()
        optimizer.step()
        sys.stdout.write("\r%d" % i)
        sys.stdout.flush()
        i+=1
    print("epoch: " + str(epoch))
    print("duration: " + str(time.time() - start_time))
    print("trainAccuracy: " + str(accuracy/len(filtered_train_data)))
    print("testAccuracy: ")
    calcAccuracy(articleListTest, tagListTest)     

2154
2153epoch: 0
duration: 1248.9330337047577
trainAccuracy: 0.45496750232126276
testAccuracy: 
accuracy: 0.4672131147540984
2153epoch: 1
duration: 1316.8404638767242
trainAccuracy: 0.4558960074280409
testAccuracy: 
accuracy: 0.4672131147540984
40

KeyboardInterrupt: 