In [None]:
# Seed constant for the notebook; presumably meant for RNG seeding — verify where it is consumed.
seed = 34

In [None]:
pip install num2words

In [None]:
import json
import pandas as pd
import string
from num2words import num2words
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Download the Romanian spaCy pipeline package that spacy.load("ro_core_news_sm") loads.
! python -m spacy download ro_core_news_sm

In [None]:
import spacy
# Romanian spaCy pipeline. NOTE(review): `nlp` is not referenced in the visible cells — confirm it is still needed.
nlp = spacy.load("ro_core_news_sm")
# Snowball stemmer for Romanian; preproccess_data() applies it to every token.
stemmer = SnowballStemmer(language='romanian')

In [None]:
def tags_from_json(path="../input/nitro-lang-processing-1/tag_to_id.json"):
    """Load the tag mapping stored as JSON.

    Args:
        path: Location of the JSON file. Defaults to the competition's
            ``tag_to_id.json`` so existing zero-argument calls keep working.

    Returns:
        The parsed JSON content (whatever object the file encodes).
    """
    # The context manager guarantees the handle is closed even if parsing fails.
    with open(path, 'r', encoding='utf8') as handle:
        return json.load(handle)

In [None]:
def train_data_from_json(path='../input/nitro-lang-processing-1/train.json'):
    """Read the training split and keep only the columns the pipeline uses.

    Args:
        path: Location of the JSON file. Defaults to the competition's
            ``train.json`` so existing zero-argument calls keep working.

    Returns:
        A new DataFrame with the columns ``ner_ids`` and ``tokens``.

    Raises:
        KeyError: if either column is missing from the file.
    """
    raw = pd.read_json(path)
    # Return an independent copy: later cells assign new columns to this frame,
    # which would otherwise be flagged as writing to a slice of `raw`.
    return raw[['ner_ids', 'tokens']].copy()

In [None]:
def test_data_from_json():
    df = pd.read_json('../input/nitro-lang-processing-1/test.json')
    df = df[['tokens']]
    return df

In [None]:
def token_vocab(df):
    """Build the token -> integer id mapping from the ``tokens`` column of ``df``.

    Ids 0 and 1 are reserved for ``'UNK'`` and ``'PAD'``; every other distinct
    token receives an id starting at 2.

    Ids are assigned in sorted token order, so the mapping is identical on every
    run. Iterating a raw ``set`` of strings depends on hash randomisation and
    can change between kernel restarts. Tokens must therefore be mutually
    comparable (e.g. all strings).

    Args:
        df: DataFrame whose ``tokens`` column holds an iterable of tokens per row.

    Returns:
        dict mapping each token (plus ``'UNK'`` and ``'PAD'``) to its id.
    """
    reserved = {'UNK': 0, 'PAD': 1}

    vocab = {token for sentence in df['tokens'] for token in sentence}
    # A corpus token spelled like a reserved marker keeps the reserved id and
    # must not consume a regular id, otherwise the id range would have a gap.
    vocab.difference_update(reserved)

    tokendict = {tok: idx for idx, tok in enumerate(sorted(vocab), start=len(reserved))}
    tokendict.update(reserved)
    return tokendict

In [None]:
def transf_data(df, tokendict):
    """Add a ``token_ids`` column holding the integer encoding of ``tokens``.

    Every token is replaced by its id from ``tokendict``; tokens missing from
    the mapping get the id stored under ``'UNK'``. The frame is modified in
    place and also returned.
    """
    def encode(sentence):
        ids = []
        for word in sentence:
            try:
                ids.append(tokendict[word])
            except KeyError:
                # Out-of-vocabulary token: fall back to the unknown marker.
                ids.append(tokendict['UNK'])
        return ids

    df['token_ids'] = df['tokens'].apply(encode)
    return df

In [None]:
def preproccess_data(df):
    """Normalise every token in the ``tokens`` column of ``df``.

    Each token is lower-cased, spelled out in Romanian words when it is a plain
    decimal number, and finally stemmed with the module-level ``stemmer``.
    The frame is modified in place and also returned.

    Args:
        df: DataFrame whose ``tokens`` column holds a list of strings per row.

    Returns:
        The same DataFrame with its ``tokens`` column replaced.
    """
    def normalise(token):
        token = token.lower()
        # str.isdigit() is also true for characters such as superscripts
        # ('²') that cannot be parsed as a number; isdecimal() only accepts
        # characters that form base-10 numbers, so num2words gets valid input.
        if token.isdecimal():
            token = num2words(token, lang="ro")
        return stemmer.stem(token)

    # Single pass over the column instead of three separate apply() calls.
    df['tokens'] = df['tokens'].apply(lambda sentence: [normalise(token) for token in sentence])
    return df

In [None]:
# Load each split and run it straight through the shared token normalisation.
train_df = preproccess_data(train_data_from_json())
test_df = preproccess_data(test_data_from_json())

train_df

In [None]:
# Tag mapping read from disk; independent of the token vocabulary.
tags = tags_from_json()

# The vocabulary comes from the training split only, and the same mapping is
# then used to encode both splits.
tokendict = token_vocab(train_df)
train_df = transf_data(train_df, tokendict)
test_df = transf_data(test_df, tokendict)
train_df

In [None]:
# Longest sentence (in tokens) of the train and test split respectively.
max1, max2 = (max(map(len, frame['tokens'])) for frame in (train_df, test_df))
print(max1, " ", max2, " -> ", max(max1, max2))
# From here on max1 is the padding length shared by both splits.
max1 = max(max1, max2)

In [None]:
def add_padding(df, maxlen, isTrain=False, pad_token_id=1, pad_tag_id=0):
    """Right-pad the id sequences of ``df`` to ``maxlen`` entries.

    ``token_ids`` is always padded with ``pad_token_id``; when ``isTrain`` is
    truthy, ``ner_ids`` is padded with ``pad_tag_id`` as well. Sequences that
    already have ``maxlen`` or more entries keep their length (no truncation).

    The columns are replaced with freshly built lists, so list objects that
    were stored in the frame before the call are never mutated. The frame
    itself is updated in place and also returned.

    Args:
        df: DataFrame with a ``token_ids`` column (and ``ner_ids`` if training).
        maxlen: Target sequence length.
        isTrain: Whether the label column should be padded too.
        pad_token_id: Filler value for ``token_ids`` (default 1).
        pad_tag_id: Filler value for ``ner_ids`` (default 0).

    Returns:
        The same DataFrame with padded columns.
    """
    maxlen = int(maxlen)  # accept NumPy integers as well as plain ints

    def pad(seq, fill):
        # A non-positive repeat count yields [], leaving long sequences as they are.
        return list(seq) + [fill] * (maxlen - len(seq))

    # Column-wise apply avoids building a row Series for every single sentence.
    df['token_ids'] = df['token_ids'].apply(lambda ids: pad(ids, pad_token_id))
    if isTrain:
        df['ner_ids'] = df['ner_ids'].apply(lambda ids: pad(ids, pad_tag_id))
    return df

In [None]:
# Pad token ids and NER labels of the training split to the shared length.
train_df = add_padding(train_df, max1, isTrain=True)
print(train_df)

In [None]:
import tensorflow as tf
import numpy
#from sklearn.preprocessing import StandardScaler
# import torch

print(type(train_df["token_ids"]))
# Padded token-id lists of the training split as one tensor (one row per sentence).
pad_statements = tf.convert_to_tensor(list(train_df["token_ids"]))

# One-hot encode the padded labels. num_classes is pinned to the size of the tag
# map so the label depth always equals the model's output width (len(tags)),
# even if the highest tag id never occurs in the training data.
# Assumes tag ids span 0..len(tags)-1, as the output layer already does.
pad_tags = tf.keras.utils.to_categorical(list(train_df["ner_ids"]), num_classes=len(tags))
pad_tags = tf.convert_to_tensor(pad_tags)
print(pad_statements)
print(pad_tags)

In [None]:
# Sizes for the network: embedding vocabulary size and width,
# sequence length, and number of output classes.
input_dim, output_dim = len(tokendict) + 1, 64
input_length, output_units = max1, len(tags)
print(input_length, output_units, input_dim, sep="\n")

In [None]:
import tensorflow
import numpy as np
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Dropout
from tensorflow.keras.utils import plot_model



# Apply the notebook's seed before any layer is created so that weight
# initialisation is repeatable across runs (best effort: some GPU kernels
# may still be non-deterministic).
tensorflow.random.set_seed(seed)

# Token ids -> embeddings -> two stacked LSTMs -> per-timestep dense head.
input_layer = Input(shape=(max1,))
embeddings = Embedding(input_dim=input_dim, output_dim=output_dim)(input_layer)
lstm1 = LSTM(units=output_dim, return_sequences=True)(embeddings)
lstm2 = LSTM(units=output_dim, return_sequences=True)(lstm1)
output = TimeDistributed(Dense(units=32, activation='relu'))(lstm2)
output1 = TimeDistributed(Dense(units=32, activation='relu'))(output)
output2 = TimeDistributed(Dense(units=64, activation='relu'))(output1)
# Softmax over the tag set for every token position.
output3 = TimeDistributed(Dense(units=output_units, activation='softmax'))(output2)
model = Model(inputs=input_layer, outputs=output3)

In [None]:
# RMSprop with a 1e-3 learning rate; the loss matches the one-hot encoded labels.
model.compile(
    optimizer=tf.keras.optimizers.RMSprop(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=['acc'],
)
plot_model(model, show_shapes=True)

In [None]:
# Train on the padded training tensors.
model.fit(
    x=pad_statements,
    y=pad_tags,
    batch_size=4,
    epochs=8,
    verbose=1,
)

In [None]:
import tensorflow as tf
import numpy

# Preprocess the test data: pad every sentence to the length the model expects.
test_df = add_padding(test_df, max1)
# Inspect the test frame (the original print looked at train_df by mistake).
print(type(test_df["token_ids"]))
test_pad_statements = tf.convert_to_tensor(list(test_df["token_ids"]))

print(test_pad_statements)


In [None]:
# Run the trained model on the padded test tensor and show the raw output.
result = model.predict(x=test_pad_statements)
print(result)

In [None]:
def format_result(result, test_pad_statements, pad_id=1, path='sub.csv'):
    """Turn per-token class scores into a flat submission table and save it.

    For every sentence the highest-scoring class of each position is taken, up
    to (not including) the first position whose token id equals ``pad_id``.
    The labels of all sentences are concatenated and numbered from 0.

    Args:
        result: Array-like of scores; the class axis is the last one.
        test_pad_statements: Array-like of padded token ids, one row per sentence.
        pad_id: Token id that marks padding (default 1).
        path: Destination of the CSV file (default ``'sub.csv'``).

    Returns:
        DataFrame with the columns ``Id`` and ``ner_label``; the same table is
        written to ``path`` without the index.
    """
    # Convert once to NumPy: indexing a tensor element by element in Python
    # loops is far slower than slicing plain arrays.
    token_ids = np.asarray(test_pad_statements)
    labels = np.asarray(result).argmax(axis=-1)

    kept = []
    for ids_row, label_row in zip(token_ids, labels):
        pad_positions = np.flatnonzero(ids_row == pad_id)
        # Everything from the first padding token onwards is discarded.
        length = pad_positions[0] if pad_positions.size else len(ids_row)
        kept.append(label_row[:length])

    classes = np.concatenate(kept) if kept else np.empty(0, dtype=np.int64)

    df = pd.DataFrame({'Id': np.arange(len(classes)), 'ner_label': classes})
    df.to_csv(path, index=False)

    return df


In [None]:
# Build the submission table (format_result also writes it to disk) and show it.
result2 = format_result(result=result, test_pad_statements=test_pad_statements)
print(result2)