<a href="https://colab.research.google.com/github/MahimaGaikwad/POS-Tagger-for-Urdu-language/blob/master/UrduPOS2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the corpus file and the saved model
# are read from / written to paths under this mount point.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
import codecs

# Read the tagged corpus: one sentence per line, each line being the Python
# literal of a list of (word, tag) tuples (parsed later with ast.literal_eval).
# The `with` block guarantees the file handle is closed once the lines are read.
with codecs.open("/content/gdrive/My Drive/Colab Notebooks/data_lstm.txt", encoding="utf-8") as corpus_file:
    tagged_sentences = corpus_file.readlines()
print(tagged_sentences[0])

import ast
import numpy as np
from keras.layers import Dense, InputLayer, Embedding, Activation
from keras.models import Sequential
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

[('’', 'PN'), ('میرے', 'G'), ('بھائی', 'NN'), ('کا', 'P'), ('ای', 'PN'), ('میل', 'U'), ('آیاہے', 'VB'), ('۔', 'SM')]



Using TensorFlow backend.


In [0]:
def logits_to_tokens(sequences, index):
    """Convert per-position score vectors into tag tokens.

    Args:
        sequences: iterable of sequences; every element of a sequence is a
            vector of scores (one score per class).
        index: mapping from class index to token.

    Returns:
        A list with one list of tokens per input sequence, where each token is
        ``index`` looked up at the position of the highest score.
    """
    return [
        [index[np.argmax(scores)] for scores in score_sequence]
        for score_sequence in sequences
    ]

In [0]:
def to_categorical(sequences, categories):
    """One-hot encode sequences of integer class indices.

    Args:
        sequences: iterable of sequences of integer class indices.
        categories: length of every one-hot vector (number of classes).

    Returns:
        ``np.array`` built from the nested list of one-hot vectors; for
        equal-length sequences this has shape
        ``(len(sequences), sequence_length, categories)``.
    """
    def one_hot(class_index):
        # A zero vector with a single 1.0 at the class position.
        vector = np.zeros(categories)
        vector[class_index] = 1.0
        return vector

    return np.array([[one_hot(item) for item in s] for s in sequences])


In [0]:
# Split every corpus line into parallel arrays: the words of the sentence and
# the POS tag of each word.
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    # Blank lines cannot be parsed by ast.literal_eval; skip them instead of
    # aborting the whole load.
    if not tagged_sentence.strip():
        continue
    word_tag_pairs = ast.literal_eval(tagged_sentence)
    # An empty list would make the two-way unpacking of zip(*...) fail.
    if not word_tag_pairs:
        continue
    sentence, tags = zip(*word_tag_pairs)
    sentences.append(np.array(sentence))
    sentence_tags.append(np.array(tags))

In [0]:
# Hold out 20% of the sentences for testing. A fixed random_state makes the
# split (and therefore the vocabulary and the reported scores built from it)
# reproducible across kernel restarts.
(train_sentences,
 test_sentences,
 train_tags,
 test_tags) = train_test_split(sentences, sentence_tags, test_size=0.2, random_state=42)

In [0]:
# Vocabulary (lower-cased words) and tag set, collected from the training
# split only.
words = set()
tags = set()

for sentence_tokens in train_sentences:
    words.update(token.lower() for token in sentence_tokens)

for tag_sequence in train_tags:
    tags.update(tag_sequence)


In [0]:
# Integer ids for words and tags. Iterating a set of strings has no stable
# order between Python sessions, so the ids are assigned from the *sorted*
# vocabulary: the same vocabulary always yields the same mapping, which is
# required to reuse a model saved in an earlier session.
word2index = {w: i + 2 for i, w in enumerate(sorted(words))}
word2index['-PAD-'] = 0  # The special value used for padding
word2index['-OOV-'] = 1  # The special value used for OOVs

tag2index = {t: i + 1 for i, t in enumerate(sorted(tags))}
tag2index['-PAD-'] = 0  # The special value used for padding


In [0]:
# Containers for the integer-encoded sentences (X) and tag sequences (y).
train_sentences_X = []
test_sentences_X = []
train_tags_y = []
test_tags_y = []

In [0]:
def encode_sentences(sentences, word2index):
    """Map every word of every sentence to its integer id.

    Words are lower-cased before the lookup; words missing from ``word2index``
    are mapped to the id stored under ``'-OOV-'``.

    Args:
        sentences: iterable of sentences, each an iterable of word strings.
        word2index: mapping from word to integer id, containing ``'-OOV-'``.

    Returns:
        A list with one list of integer ids per sentence.
    """
    oov_index = word2index['-OOV-']
    return [[word2index.get(w.lower(), oov_index) for w in s] for s in sentences]


# Rebinding (instead of appending to lists created in another cell) keeps this
# cell idempotent: re-running it no longer duplicates the encoded sentences.
train_sentences_X = encode_sentences(train_sentences, word2index)
test_sentences_X = encode_sentences(test_sentences, word2index)


In [0]:
# Encode the tag sequences as integer ids. The lists are rebuilt from scratch
# so re-running this cell does not append a second copy of the data.
# NOTE: tag2index is built from the training tags only, so a tag that occurs
# solely in the test split raises KeyError here.
train_tags_y = [[tag2index[t] for t in s] for s in train_tags]
test_tags_y = [[tag2index[t] for t in s] for s in test_tags]

# Length of the longest training sentence; used as the padded sequence length.
MAX_LENGTH = max(len(s) for s in train_sentences_X)

In [0]:
# Bring every encoded sentence and tag sequence to MAX_LENGTH, adding the
# padding values after the real tokens ('post').
train_sentences_X, test_sentences_X, train_tags_y, test_tags_y = (
    pad_sequences(encoded, maxlen=MAX_LENGTH, padding='post')
    for encoded in (train_sentences_X, test_sentences_X, train_tags_y, test_tags_y)
)

In [14]:
# Per-token tagger: word ids -> 128-d embeddings -> Dense(128) -> one softmax
# score per tag at every position of the padded sentence.
model = Sequential([
    InputLayer(input_shape=(MAX_LENGTH,)),
    Embedding(len(word2index), 128),
    Dense(128),
    Dense(len(tag2index)),
    Activation('softmax'),
])

model.compile(optimizer=Adam(0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# The one-hot targets are built inline so the large array is not kept alive
# after training. 20% of the training sentences are used for validation.
# NOTE(review): the targets are the padded tag sequences, so '-PAD-' positions
# appear to count toward the reported accuracy — confirm.
model.fit(
    train_sentences_X,
    to_categorical(train_tags_y, len(tag2index)),
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

model.summary()


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 23240 samples, validate on 5811 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 281, 128)          5315200   
_________________________________________________________________
dense_1 (Dense)              (None, 281, 128)          16512     
_________________________________________________________________
dense_2 (Dense)              (None, 281, 41)           5289      
_________________________________________________________________
activation_1 (Activation)    (None, 281, 41)           0         
Total params: 5,337,001
Trainable params: 5,337,001
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Show the first held-out sentence together with its gold tags.
print("Original data from test samples", test_sentences[0], test_tags[0], sep="\n")

Original data from test samples
['اس' 'کی' 'قبر' 'کہاں' 'ہے' 'وہ' 'کب' 'اور' 'کیسے' 'دفن' 'ہوا' 'اس' 'کے'
 'متعلق' 'صغیر' 'کے' 'جاننے' 'والوں' 'نے' 'کچھ' 'نہ' 'بتایا' '،' 'یہ' 'ان'
 'کے' 'علم' 'میں' 'نہیں' 'تھا' '،' 'امتیاز' 'کو' 'یقین' 'ہو' 'گیا' 'کہ'
 'اس' 'کے' 'خاوند' 'نے' 'خودکشی' 'کر' 'لی' 'ہے' '،' 'اس' 'کو' 'شادی' 'اس'
 'کا' 'سبب' 'معلوم' 'تھا' '،' 'مگر' 'اس' 'کا' 'باپ' 'یہ' 'ماننے' 'سے'
 'یکسر' 'منکر' 'تھا' '،' 'چنانچہ' 'اس' 'نے' 'ایک' 'بار' 'اپنی' 'بیٹی' 'سے'
 'کہا' 'میرا' 'دل' 'کہتا' 'ہے' 'وہ' 'زندہ' 'ہے' '،' 'وہ' 'تمہاری' 'محبت'
 'کی' 'خاطر' 'اس' 'وقت' 'تک' 'زندہ' 'رہے' 'گا' 'جب' 'تک' 'خدا' 'اس' 'کو'
 'موت' 'کے' 'فرشتے' 'کے' 'حوالے' 'نہ' 'کر' 'دے' '،' 'میں' 'اس' 'کو' 'اچھی'
 'طرح' 'سمجھتا' 'تمہاری' 'جگہ' 'اگر' 'وہ' 'میرا' 'بیٹا' 'ہوتا' 'تو' 'میں'
 'خود' 'کو' 'دنیا' 'کا' 'سب' 'سے' 'خوش' 'نصیب' 'انسان' 'سمجھتا' '،' 'یہ'
 'سن' 'کر' 'امتیاز' 'خاموش' 'رہی' '۔']
['PP' 'P' 'NN' 'AKP' 'VB' 'PP' 'AKP' 'CC' 'AKP' 'VB' 'VB' 'PP' 'P' 'NN'
 'NN' 'P' 'VB' 'WALA' 'P' 'Q' 'NEG' 'VB' 'PM' 'PP

In [16]:
# Evaluate on the held-out split; scores[1] is the metric named by
# model.metrics_names[1], printed as a percentage.
# NOTE(review): the labels are the padded tag sequences, so '-PAD-' positions
# appear to be scored as well — confirm before quoting this number.
scores = model.evaluate(test_sentences_X, to_categorical(test_tags_y, len(tag2index)))
metric_name = model.metrics_names[1]
metric_percent = scores[1] * 100
print(f"{metric_name}: {metric_percent}")

accuracy: 98.69886040687561


In [17]:
from keras.models import load_model
import json

# Persist the architecture (JSON) and the full model (HDF5) to Drive.
model_json = model.to_json()
with open("/content/gdrive/My Drive/Colab Notebooks/urduPOS.json", "w") as json_file:
    json_file.write(model_json)
model.save("/content/gdrive/My Drive/Colab Notebooks/urduPOS.h5")

# The network only understands the integer ids defined by word2index /
# tag2index and the padded length MAX_LENGTH. Save them next to the model so
# it can still be used for prediction in a fresh session.
with open("/content/gdrive/My Drive/Colab Notebooks/urduPOS_vocab.json", "w", encoding="utf-8") as vocab_file:
    json.dump(
        {"word2index": word2index, "tag2index": tag2index, "max_length": MAX_LENGTH},
        vocab_file,
        ensure_ascii=False,
    )
print("Savedmodel to drive")

Savedmodel to drive


In [0]:
# Sentences to tag as a demonstration (here: the first held-out sentence).
test_samples = [test_sentences[0]]

# Encode the words as ids (lower-cased lookup, '-OOV-' id for unknown words),
# then pad to the length the model was built with.
test_samples_X = [
    [word2index.get(w.lower(), word2index['-OOV-']) for w in s]
    for s in test_samples
]
test_samples_X = pad_sequences(test_samples_X, maxlen=MAX_LENGTH, padding='post')

In [19]:
# Invert tag2index so that predicted class indices can be turned back into
# tag names, then print the predicted tags for every padded position.
index2tag = {tag_id: tag for tag, tag_id in tag2index.items()}
predictions = model.predict(test_samples_X)
print(logits_to_tokens(predictions, index2tag))

[['PP', 'P', 'NN', 'AKP', 'TA', 'PP', 'AKP', 'CC', 'AKP', 'NN', 'VB', 'PP', 'P', 'NN', 'ADJ', 'P', 'VB', 'WALA', 'P', 'Q', 'NEG', 'VB', 'PM', 'PP', 'PP', 'P', 'NN', 'P', 'NEG', 'TA', 'PM', 'NN', 'P', 'NN', 'VB', 'AA', 'SC', 'PP', 'P', 'NN', 'P', 'NN', 'VB', 'AA', 'TA', 'PM', 'PP', 'P', 'NN', 'PP', 'P', 'NN', 'ADJ', 'TA', 'PM', 'SC', 'PP', 'P', 'NN', 'PP', 'VB', 'SE', 'ADJ', 'NN', 'TA', 'PM', 'SC', 'PP', 'P', 'CA', 'NN', 'GR', 'NN', 'SE', 'VB', 'G', 'NN', 'VB', 'TA', 'PP', 'ADJ', 'TA', 'PM', 'PP', 'G', 'NN', 'P', 'NN', 'PP', 'NN', 'P', 'ADJ', 'AA', 'TA', 'AP', 'P', 'NN', 'PP', 'P', 'NN', 'P', 'VB', 'P', 'NN', 'NEG', 'VB', 'VB', 'PM', 'P', 'PP', 'P', 'ADJ', 'NN', 'VB', 'G', 'NN', 'SC', 'PP', 'G', 'NN', 'VB', 'SC', 'P', 'RP', 'P', 'NN', 'P', 'Q', 'SE', 'ADJ', 'NN', 'NN', 'VB', 'PM', 'PP', 'VB', 'VB', 'NN', 'ADJ', 'AA', 'SM', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PAD-', '-PA