In [1]:
import os
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
import xml.etree.ElementTree as ET

In [2]:
class CleanAfsana(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, afsanas):
        stopwords = ["آئی","آئے","آج","آخر","آخرکبر","آدهی","آًب","آٹھ","آیب","اة","اخبزت","اختتبم","ادھر","ارد","اردگرد","ارکبى","اش","اضتعوبل","اضتعوبلات","اضطرذ","اضکب","اضکی","اضکے","اطراف","اغیب","افراد","الگ","اور","اوًچب","اوًچبئی","اوًچی","اوًچے","اى","اً","اًذر","اًہیں","اٹھبًب","اپٌب","اپٌے","اچھب","اچھی","اچھے","اکثر","اکٹھب","اکٹھی","اکٹھے","اکیلا","اکیلی","اکیلے","اگرچہ","اہن","ایطے","ایک","ب","ت","تبزٍ","تت","تر","ترتیت","تریي","تعذاد","تن","تو","توبم","توہی","توہیں","تٌہب","تک","تھب","تھوڑا","تھوڑی","تھوڑے","تھی","تھے","تیي","ثب","ثبئیں","ثبترتیت","ثبری","ثبرے","ثبعث","ثبلا","ثبلترتیت","ثبہر","ثدبئے","ثرآں","ثراں","ثرش","ثعذ","ثغیر","ثلٌذ","ثلٌذوثبلا","ثلکہ","ثي","ثٌب","ثٌبرہب","ثٌبرہی","ثٌبرہے","ثٌبًب","ثٌذ","ثٌذکرو","ثٌذکرًب","ثٌذی","ثڑا","ثڑوں","ثڑی","ثڑے","ثھر","ثھرا","ثھراہوا","ثھرپور","ثھی","ثہت","ثہتر","ثہتری","ثہتریي","ثیچ","ج","خب","خبرہب","خبرہی","خبرہے","خبهوظ","خبًب","خبًتب","خبًتی","خبًتے","خبًٌب","خت","ختن","خجکہ","خص","خططرذ","خلذی","خو","خواى","خوًہی","خوکہ","خٌبة","خگہ","خگہوں","خگہیں","خیطب","خیطبکہ","در","درخبت","درخہ","درخے","درزقیقت","درضت","دش","دفعہ","دلچطپ","دلچطپی","دلچطپیبں","دو","دور","دوراى","دوضرا","دوضروں","دوضری","دوضرے","دوًوں","دکھبئیں","دکھبتب","دکھبتی","دکھبتے","دکھبو","دکھبًب","دکھبیب","دی","دیب","دیتب","دیتی","دیتے","دیر","دیٌب","دیکھو","دیکھٌب","دیکھی","دیکھیں","دے","ر","راضتوں","راضتہ","راضتے","رریعہ","رریعے","رکي","رکھ","رکھب","رکھتب","رکھتبہوں","رکھتی","رکھتے","رکھی","رکھے","رہب","رہی","رہے","ز","زبصل","زبضر","زبل","زبلات","زبلیہ","زصوں","زصہ","زصے","زقبئق","زقیتیں","زقیقت","زکن","زکویہ","زیبدٍ","صبف","صسیر","صفر","صورت","صورتسبل","صورتوں","صورتیں","ض","ضبت","ضبتھ","ضبدٍ","ضبرا","ضبرے","ضبل","ضبلوں","ضت","ضرور","ضرورت","ضروری","ضلطلہ","ضوچ","ضوچب","ضوچتب","ضوچتی","ضوچتے","ضوچو","ضوچٌب","ضوچی","ضوچیں","ضکب","ضکتب","ضکتی","ضکتے","ضکٌب","ضکی","ضکے","ضیذھب","ضیذھی","ضیذھے","ضیکٌڈ","ضے","طرف","طریق","طریقوں","طریقہ","طریقے","طور","طورپر","ظبہر","ع","عذد","عظین","علاقوں","علاقہ","علاقے","علاوٍ","عووهی","غبیذ","غخص","غذ","غروع","غروعبت","غے","فرد","فی","ق","قجل","قجیلہ","قطن","لئے","لا","لازهی","لو","لوجب","لوجی","لوجے","لوسبت","لوسہ","لوگ","لوگوں","لڑکپي","لگتب","لگتی","لگتے","لگٌب","لگی","لگیں","لگے","لی","لیب","لیٌب","لیں","لے","ه","هتعلق","هختلف","هسترم","هسترهہ","هسطوش","هسیذ","هطئلہ","هطئلے","هطبئل","هطتعول","هطلق","هعلوم","هػتول","هلا","هوکي","هوکٌبت","هوکٌہ","هٌبضت","هڑا","هڑًب","هڑے","هکول","هگر","هہرثبى","هیرا","هیری","هیرے","هیں","و","وار","والے","وٍ","ًئی","ًئے","ًب","ًبپطٌذ","ًبگسیر","ًطجت","ًقطہ","ًو","ًوخواى","ًکبلٌب","ًکتہ","ًہ","ًہیں","ًیب","ًے","ٓ آش","ٹھیک","پبئے","پبش","پبًب","پبًچ","پر","پراًب","پطٌذ","پل","پورا","پوچھب","پوچھتب","پوچھتی","پوچھتے","پوچھو","پوچھوں","پوچھٌب","پوچھیں","پچھلا","پھر","پہلا","پہلی","پہلےضی","پہلےضے","پہلےضےہی","پیع","چبر","چبہب","چبہٌب","چبہے","چلا","چلو","چلیں","چلے","چکب","چکی","چکیں","چکے","چھوٹب","چھوٹوں","چھوٹی","چھوٹے","چھہ","چیسیں","ڈھوًڈا","ڈھوًڈلیب","ڈھوًڈو","ڈھوًڈًب","ڈھوًڈی","ڈھوًڈیں","ک","کئی","کئے","کب","کبفی","کبم","کت","کجھی","کرا","کرتب","کرتبہوں","کرتی","کرتے","کرتےہو","کررہب","کررہی","کررہے","کرو","کرًب","کریں","کرے","کطی","کل","کن","کوئی","کوتر","کورا","کوروں","کورٍ","کورے","کوطي","کوى","کوًطب","کوًطی","کوًطے","کھولا","کھولو","کھولٌب","کھولی","کھولیں","کھولے","کہ","کہب","کہتب","کہتی","کہتے","کہو","کہوں","کہٌب","کہی","کہیں","کہے","کی","کیب","کیطب","کیطرف","کیطے","کیلئے","کیوًکہ","کیوں","کیے","کے","کےثعذ","کےرریعے","گئی","گئے","گب","گرد","گروٍ","گروپ","گروہوں","گٌتی","گی","گیب","گے","ہر","ہن","ہو","ہوئی","ہوئے","ہوا","ہوبرا","ہوبری","ہوبرے","ہوتب","ہوتی","ہوتے","ہورہب","ہورہی","ہورہے","ہوضکتب","ہوضکتی","ہوضکتے","ہوًب","ہوًی","ہوًے","ہوچکب","ہوچکی","ہوچکے","ہوگئی","ہوگئے","ہوگیب","ہوں","ہی","ہیں","ہے","ی","یقیٌی","یہ","یہبں"]
        afsana_list_clean = []
        for afsana in afsanas: 
            s_list = [word for word in afsana.split() if word not in stopwords]
            str_ = ' '.join(s_list)
            afsana_list_clean.append(str_)
        return afsana_list_clean

In [3]:
class POSCountVectorizer(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, afsanas):
        pos_dict = {}
        files = os.listdir('ur_ds\\pos')
        for file in files:
            tree = ET.parse('ur_ds\\pos\\' + file)
            root = tree.getroot()
            for sent in root:
                for word in sent:
                    pos_dict[word.text] = word.attrib['pos']
        
        afsana_pos_features = []
        for afsana in afsanas:
            tag_indexes = {'NN': 0, 'VB': 1, 'ADJ': 2, 'ADV': 3, 'PN': 4, 'PP': 5}
            pos_count_arr = [0] * len(tag_indexes)
            afsana_words = afsana.split()
            for word in afsana_words:
                if word in pos_dict:
                    pos_tag = pos_dict[word]
                    if pos_tag and pos_tag in tag_indexes :
                        pos_count_arr[tag_indexes[pos_tag]] += 1
            afsana_pos_features.append(pos_count_arr)
        return afsana_pos_features

In [4]:
class POSTransformer(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, afsanas):
        pos_dict = {}
        files = os.listdir('ur_ds\\pos')
        for file in files:
            tree = ET.parse('ur_ds\\pos\\' + file)
            root = tree.getroot()
            for sent in root:
                for word in sent:
                    pos_dict[word.text] = word.attrib['pos']
        afsana_pos = []
        for afsana in afsanas:
            pos_tag_str = ''
            afsana_words = afsana.split()
            for word in afsana_words:
                if word in pos_dict:
                    pos_tag = pos_dict[word]
                    if pos_tag:
                        pos_tag_str += ' ' + pos_tag
            afsana_pos.append(pos_tag_str.strip())
        return afsana_pos

In [5]:
pipeline = Pipeline([
    ('cleanafsana', CleanAfsana()),
    ('features', FeatureUnion([
        ('tfidfpipe',  Pipeline([
            ('tfidf', TfidfVectorizer(analyzer='word', max_df=0.9, min_df=0.2, ngram_range=(2,3),max_features=6000))
        ])),
        ('pos_tfidf', Pipeline([
            ('pos', POSTransformer()),
            ('tfidf', TfidfVectorizer(analyzer='word', max_df=0.9, min_df=0.2, ngram_range=(2,3),max_features=6000))
          ]))
    ]))
])

In [6]:
folders = os.listdir('ur_ds\\clean')
labels = []
afsana_list = []
for folder in folders:
    files = os.listdir('ur_ds\\clean\\' + folder)
    for file in files:
        labels.append(folder)
        f = open('ur_ds\\clean\\' + folder + '\\' + file, "r", encoding="utf8")
        for line in f:
            if (len(line) > 2):
                afsana_list.append(line)
        f.close()

In [7]:
features = pipeline.fit_transform(afsana_list)


In [8]:
features_nd = features.toarray()

In [9]:
features_nd.shape

(98, 1664)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(features_nd, labels, train_size=0.75,random_state=1234)



In [11]:
clf3 = XGBClassifier()
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
accuracy_score(y_test, y_pred)

0.76

In [12]:
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
accuracy_score(y_test, y_pred)

0.84

In [13]:
clf2 = svm.SVC(kernel='linear')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
accuracy_score(y_test, y_pred)

0.96