In [18]:
import re
import numpy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import _pickle as cPickle
import nltk

In [19]:

# Paths of the POS-tagged corpus files to load, relative to the working directory.
filename = [
    'Urdu Tagged Corpus/00ur_pos.txt',
    'Urdu Tagged Corpus/01ur_pos.txt',
    'Urdu Tagged Corpus/02ur_pos.txt',
]

# Collect every line of every corpus file into one flat list.
# Lines are stored as read, i.e. still carrying their trailing newline.
data = []
for f in filename:
    # `with` guarantees the handle is closed, also when reading fails part-way.
    # NOTE(review): the encoding is pinned so decoding no longer depends on the
    # platform default; this assumes the corpus files are UTF-8 -- confirm.
    with open(f, "r", encoding="utf-8") as file:
        data.extend(file)

In [20]:
# Whitespace-split every raw corpus line into a tuple of tokens (one tuple per line).
a = [tuple(raw_line.split()) for raw_line in data]

In [21]:
# Convert each token tuple in `a` into a list of (word, tag) pairs, one list per line.
# The regex extracts whatever sits between `<` and `>` inside a token as its tag(s);
# the word is what remains of the token once those `<...>` markers are removed.
tagged_sentences = []
for tokens in a:
    sentence_pairs = []
    for token in tokens:
        # Raw string: the pattern is unchanged, but no longer relies on the
        # invalid `\<` / `\>` string escapes.
        tags = re.findall(r'(?<=\<).*?(?=\>)', token)
        if not tags:
            # Untagged token: skip it explicitly instead of testing a `word`
            # that is unbound (first token) or left over from a previous token.
            continue
        # Strip every tag marker cumulatively; replacing from the original token
        # each time would keep all but the last `<...>` marker in the word.
        word = token
        for tag in tags:
            word = word.replace(f'<{tag}>', '')
        if word:
            # Several tags on one token are joined with a space into one label.
            sentence_pairs.append((word, ' '.join(tags)))
    tagged_sentences.append(sentence_pairs)

In [22]:
def untag(tagged_sentence):
    """Return the words of a tagged sentence, in order, with the tags dropped.

    `tagged_sentence` is an iterable of 2-item (word, tag) entries.
    """
    words = []
    for word, _tag in tagged_sentence:
        words.append(word)
    return words

In [23]:
def features(sentence, index):
    """Build the feature dict for the word at `index` in `sentence`.

    The dict holds the word itself, whether it is the first / last item of the
    sentence, and its immediate neighbours ('' when there is no neighbour on
    that side).
    """
    current = sentence[index]
    at_start = index == 0
    at_end = index == len(sentence) - 1

    if at_start:
        previous = ''
    else:
        previous = sentence[index - 1]

    if at_end:
        following = ''
    else:
        following = sentence[index + 1]

    return {
        'word': current,
        'is_first': at_start,
        'is_last': at_end,
        'prev_word': previous,
        'next_word': following,
    }

In [24]:
def transform_to_dataset(tagged_sentences):
    """Turn tagged sentences into parallel feature and label lists.

    Args:
        tagged_sentences: iterable of sentences, each a sequence of
            (word, tag) pairs.

    Returns:
        A tuple (X, y): X holds one `features(...)` dict per word and y holds
        the tag of that same word, both in corpus order.
    """
    X, y = [], []

    for tagged in tagged_sentences:
        # Untag once per sentence; doing it inside the inner loop rebuilt the
        # same word list for every single word of the sentence.
        words = untag(tagged)
        for index in range(len(tagged)):
            X.append(features(words, index))
            y.append(tagged[index][1])

    return X, y
 
# Build the feature dicts (X) and matching tags (y) for the whole parsed corpus.
X, y = transform_to_dataset(tagged_sentences)

In [25]:
# Hold out a quarter of the samples for evaluation; the fixed random_state
# keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [26]:
# Two-step pipeline: vectorize the feature dicts, then classify them with a
# 20-tree random forest (seeded so that training is repeatable).
clf = Pipeline(steps=[
    ('vectorizer', DictVectorizer()),
    ('classifier', RandomForestClassifier(n_estimators=20, random_state=43)),
])

# Train on the training split and report the score on the held-out split.
clf.fit(X_train, y_train)
print("Accuracy:", clf.score(X_test, y_test))

Accuracy: 0.8391043323056953


In [None]:
# Persist the fitted pipeline to disk.
# The `with` block closes the file even if pickling raises part-way through.
with open("RandomForest.pickle", "wb") as pickle_out:
    cPickle.dump(clf, pickle_out)

In [None]:
# Load the pipeline back from disk into `clf1`.
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source (such as the file written by this notebook).
# The `with` block closes the file even if loading raises.
with open("RandomForest.pickle", "rb") as pickle_in:
    clf1 = cPickle.load(pickle_in)

In [None]:
# Score the reloaded pipeline on the same held-out split that was used for `clf`.
print("Accuracy:", clf1.score(X_test, y_test))

In [None]:
def tagger(sentence):
    """Tag every token of `sentence` with the reloaded pipeline `clf1`.

    `sentence` is a sequence of tokens. Returns a zip object pairing each
    token with its predicted tag; being a zip, it can be iterated only once.
    """
    feature_rows = []
    for position in range(len(sentence)):
        feature_rows.append(features(sentence, position))
    predicted_tags = clf1.predict(feature_rows)
    return zip(sentence, predicted_tags)

In [16]:
# Tokenize a sample sentence with NLTK and tag it.
# NOTE(review): this rebinds `a`, which earlier held the per-line token tuples
# read by the corpus-parsing cell; here it becomes the zip object returned by
# `tagger`, so re-running that parsing cell afterwards would fail on `len(a)`.
a = tagger(nltk.word_tokenize('نے کل مجلس قانون‌ساز کی بحث کے دوران کہا ۔'))

In [17]:
# Print each (token, predicted tag) pair on its own line.
for token_tag in a:
    print(token_tag)

('نے', 'CM')
('کل', 'NN')
('مجلس', 'NN')
('قانون\u200cساز', 'NN')
('کی', 'CM')
('بحث', 'NN')
('کے', 'CM')
('دوران', 'NNCM')
('کہا', 'VB')
('۔', 'SM')
