In [2]:
import os
import operator
import re
import codecs
import json

def getWords(text):
    '''
    Split text into its word tokens.

    A token is a maximal run of regular-expression word characters; every
    other character (spaces, punctuation, ...) acts as a separator.
    Params:
    text: the string to tokenize
    Return:
    list of tokens in order of appearance (empty list when there are none)
    '''
    # Raw string: '\w' inside a plain literal is an invalid escape sequence.
    return re.findall(r'\w+', text)

def _phraseNGrams(wordList):
    '''
    Yield (word, biGram, triGram) for every word of one tokenized phrase.

    "" is used as padding before the first and after the last word.
    triGram is None when no tri-gram is produced for that position.
    NOTE(review): the N-grams that start at the first word itself,
    (w0, w1) and (w0, w1, w2), are never produced. getLabel extracts
    N-grams the same way, so the behaviour is kept here on purpose.
    '''
    last = len(wordList) - 1
    for i, word in enumerate(wordList):
        if i == 0:
            biGram = ("", word)
            triGram = ("", word, wordList[1]) if last >= 1 else None
        elif i == last:
            biGram = (word, "")
            triGram = (wordList[i - 1], word, "")
        else:
            biGram = (word, wordList[i + 1])
            if i + 2 <= last:
                triGram = (word, wordList[i + 1], wordList[i + 2])
            else:
                triGram = None
        yield word, biGram, triGram


def _topNGrams(counts, topN):
    '''
    Return at most topN keys of counts, most frequent first.

    The sort is stable, so N-grams with equal counts keep the order in which
    they were first seen.
    '''
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    # Slicing (instead of indexing 0..topN-1) tolerates small corpora.
    return [gram for gram, _ in ranked[:topN]]


def buildModel(path='./Desktop/NLP/txt/', topN=50):
    '''
    Build the per-language N-gram profiles used to classify documents.

    This model uses Zipf's law to help decide the category of a document:
    for each language only the most frequent uni-, bi- and tri-grams are
    kept, ordered from most to least frequent.
    Every sub-directory of path whose name is two characters long is treated
    as one language. Each file inside it is read as UTF-8 (undecodable bytes
    are ignored); lines starting with '<' are skipped and the remaining
    lines are split into phrases on '.'.
    Params:
    path: directory holding one sub-directory per language. The default is
          the Jupyter location used originally; modify it when running in
          your own environment.
    topN: maximum number of N-grams kept per language and N-gram order.
    Return:
    uniGramList, biGramList, triGramList, langList
    The first three map a language code to its list of N-grams (strings for
    uni-grams, tuples for bi- and tri-grams); a list is shorter than topN
    when the corpus has fewer distinct N-grams. langList holds the language
    codes in directory-listing order.
    '''
    # Only directories qualify: a two-character plain file is not a corpus.
    langList = [name for name in os.listdir(path)
                if len(name) == 2 and os.path.isdir(os.path.join(path, name))]
    uniGramList = {}
    biGramList = {}
    triGramList = {}
    for lang in langList:
        print("Now processing " + lang+"...")
        uniCount = {}
        biCount = {}
        triCount = {}
        langDir = os.path.join(path, lang)
        for filename in os.listdir(langDir):
            with codecs.open(os.path.join(langDir, filename), "r",
                             encoding='utf-8', errors='ignore') as fdata:
                for line in fdata:
                    if line.startswith('<'):
                        continue
                    for phra in line.split('.'):
                        for word, biGram, triGram in _phraseNGrams(getWords(phra)):
                            uniCount[word] = uniCount.get(word, 0) + 1
                            biCount[biGram] = biCount.get(biGram, 0) + 1
                            if triGram is not None:
                                triCount[triGram] = triCount.get(triGram, 0) + 1
        uniGramList[lang] = _topNGrams(uniCount, topN)
        biGramList[lang] = _topNGrams(biCount, topN)
        triGramList[lang] = _topNGrams(triCount, topN)
    print("N-Gram Model building finshied")

    return uniGramList,biGramList,triGramList,langList

#uni,bi,tri,lang = buildModel()
def _rankByFrequency(counts):
    '''
    Return the keys of counts ordered from most to least frequent.

    The sort is stable, so keys with equal counts keep first-seen order.
    '''
    ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
    return [gram for gram, _ in ranked]


def _rankDistance(docGrams, modelGrams, missPenalty):
    '''
    Sum, over the document N-grams, how far each one is from its model rank.

    An N-gram found in modelGrams contributes the absolute difference between
    its position in docGrams and its first position in modelGrams; one that
    is missing contributes missPenalty.
    '''
    # One lookup table instead of an `in` scan plus an `index` scan per
    # N-gram; setdefault keeps the first position, like list.index does.
    modelRank = {}
    for rank, gram in enumerate(modelGrams):
        modelRank.setdefault(gram, rank)
    total = 0
    for docRank, gram in enumerate(docGrams):
        if gram in modelRank:
            total += abs(modelRank[gram] - docRank)
        else:
            total += missPenalty
    return total


def getLabel(uniGramList, biGramList, triGramList, document,langList, missPenalty=50):
    '''
    This function calculates the distance from document to every language in
    langList and labels the document with the closest one.

    The document is split into phrases on '.', each phrase is tokenized with
    getWords, and its uni-, bi- and tri-grams are counted and ranked by
    frequency. For each language the distance is the sum of the rank
    distances of the three N-gram orders.
    NOTE(review): the N-grams that start at the first word of a phrase,
    (w0, w1) and (w0, w1, w2), are never counted. buildModel extracts
    N-grams the same way, so the behaviour is kept here on purpose.
    Params:
    uniGramList, biGramList, triGramList: per-language ranked N-gram lists,
        as returned by buildModel (N-grams must be hashable)
    document: the text to classify
    langList: the language codes to score
    missPenalty: distance added for a document N-gram that is absent from a
        language's list
    Return:
    document language label; on a tie the language that comes first in
    langList wins
    '''
    docUni = {}
    docBi = {}
    docTri = {}
    for phra in document.split('.'):
        wordList = getWords(phra)
        last = len(wordList) - 1
        for i, word in enumerate(wordList):
            # "" pads the phrase start and end.
            if i == 0:
                biGram = ("", word)
                triGram = ("", word, wordList[1]) if last >= 1 else None
            elif i == last:
                biGram = (word, "")
                triGram = (wordList[i - 1], word, "")
            else:
                biGram = (word, wordList[i + 1])
                if i + 2 <= last:
                    triGram = (word, wordList[i + 1], wordList[i + 2])
                else:
                    triGram = None
            docUni[word] = docUni.get(word, 0) + 1
            docBi[biGram] = docBi.get(biGram, 0) + 1
            if triGram is not None:
                docTri[triGram] = docTri.get(triGram, 0) + 1
    uniRanked = _rankByFrequency(docUni)
    biRanked = _rankByFrequency(docBi)
    triRanked = _rankByFrequency(docTri)
    score = {}
    for lang in langList:
        score[lang] = (_rankDistance(uniRanked, uniGramList[lang], missPenalty)
                       + _rankDistance(biRanked, biGramList[lang], missPenalty)
                       + _rankDistance(triRanked, triGramList[lang], missPenalty))
    # Stable ascending sort: the smallest distance comes first.
    ranking = sorted(score.items(), key = operator.itemgetter(1))
    return ranking[0][0]


def main(testPath="./PycharmProjects/LanguageCat/europarl.test"):
    '''
    Build the N-gram model, label every line of the test file and print the
    accuracy.

    Each line of the test file is read as: the expected label in its first
    two characters, one character that is skipped, then the document text.
    Params:
    testPath: path of the labelled test file (read as UTF-8, undecodable
              bytes are ignored)
    '''
    uni,bi,tri,lang = buildModel()
    correct = 0
    count = 0
    with codecs.open(testPath, "r", encoding='utf-8', errors='ignore') as fdata:
        for line in fdata:
            if getLabel(uni,bi,tri,line[3:],lang) == line[0:2]:
                correct += 1
            count += 1
    if count == 0:
        # An empty test file would otherwise raise ZeroDivisionError below.
        print("The accuracy is: n/a (no test documents found)")
        return
    print("The accuracy is: "+str(correct/count))

# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

Now processing sl...


Now processing sk...


Now processing pl...


Now processing sv...


Now processing da...


Now processing el...


Now processing lv...


Now processing it...


Now processing cs...


Now processing ro...


Now processing pt...


Now processing hu...


Now processing nl...


Now processing bg...


Now processing de...


Now processing fi...


Now processing fr...


Now processing es...


Now processing et...


Now processing en...


Now processing lt...
