In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import string

from util import randomChoice, lineToTensor, readFile

import math
import random
import glob
import os

import codecs

import time

# import matplotlib for plotting 
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

all_letters = string.ascii_letters + " .,;'" + "äÄüÜöÖß"
n_letters = len(all_letters)

device = torch.device("cuda")

In [15]:
class KCN(nn.Module):
    def __init__(self, output_size, time_steps=20):
        super(KCN, self).__init__()
    
        self.time_steps = time_steps
        self.lstm = nn.LSTM(1,128, bidirectional=True, num_layers=2)
        self.linear = nn.Linear(128*2*time_steps, output_size)
        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, x):        
        lstm_out, _ = self.lstm(x)
        tag_space = self.linear(lstm_out.view(-1,128*2*self.time_steps))
        return tag_space

In [18]:
inputs = []
for filename in glob.glob('../Produktlisten/*.txt'):
    lines = readFile(filename) 
    inputs.append(lines)
inputs = [_ for i in range(len(inputs)) for _ in inputs[i]]

In [19]:
def pad_sequence(name, wordList, max_len=20):
    x = np.zeros((len(name), max_len))
    for i, name in enumerate(wordList):
        for j, c in enumerate(name):
            if j >= max_len:
                break
            x[i,j] = ord(c)
    return torch.FloatTensor(x)

In [20]:
german_word_list = open("../Produktlisten/german/german.dic", 'r', encoding='utf-8',
                 errors='ignore').read().strip().upper().split('\n')
for word in inputs:
    try:
        german_word_list.remove(word)
    except:
          continue
print(len(german_word_list))

2055185


In [21]:
x = pad_sequence(inputs + german_word_list[:1000],inputs + german_word_list[:1000]).to(device)
x = torch.unsqueeze(x, dim=2)
y = torch.LongTensor([1 for x in range(len(inputs))] + [0 for x in range(1000)]).to(device)

In [22]:
print(x.shape)
print(y.shape)

torch.Size([1308, 20, 1])
torch.Size([1308])


In [23]:
model = KCN(2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), 0.01)

for epoch in range(100):
    model.train()
    output = model(x)
    loss = criterion(output, y)
    print (f"[{epoch}] Train Loss: {loss.item()}", end="\r")
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

[99] Train Loss: 9.757062798598781e-058

In [24]:
model.eval()

KCN(
  (lstm): LSTM(1, 128, num_layers=2, bidirectional=True)
  (linear): Linear(in_features=5120, out_features=2, bias=True)
  (softmax): LogSoftmax(dim=1)
)

In [36]:
def predict(word):
    probs = model(pad_sequence([word.upper()],[word.upper()]).view(1,20,1).to(device))
    print(probs)
    if probs[0][0] > probs[0][1]:
        return 0
    else:
        return 1

In [37]:
predict("fjheöLO sBDJK")

tensor([[ 0.1017, -0.0932]], device='cuda:0', grad_fn=<AddmmBackward>)


0

In [38]:
torch.save(model.state_dict(), "kcn.pth")