In [1]:
from itertools import chain

In [2]:
# Build the two vocabularies: every whitespace-separated token that consists
# only of alphabetic characters, lower-cased and de-duplicated into a set.
# NOTE(review): brown.txt is read with the platform default encoding while the
# second file is read as UTF-8 - confirm that this difference is intended.
with open('brown.txt', 'r') as ang, open('lalka-tom-pierwszy.txt', 'r', encoding='utf8') as pol:
    ANG = set(map(str.lower, filter(str.isalpha, ang.read().split())))
    POL = set(map(str.lower, filter(str.isalpha, pol.read().split())))

In [3]:
class LanguageGuesser:
    """Base class that labels a word as Polish ('pol') or English ('ang').

    The constructor collects every character n-gram (n = 1..N) occurring in
    the two vocabularies and stores, for each of them, a
    ``(polish_frequency, english_frequency)`` pair in
    ``languages_freq_compare``.  The decision rule itself is provided by
    subclasses through ``word_score``.
    """

    def __init__(self, pol_words: set = POL, eng_words: set = ANG, N: int = 2):
        """Pre-compute the n-gram statistics of both vocabularies.

        Args:
            pol_words: set of Polish words.
            eng_words: set of English words.
            N: largest n-gram length to consider (lengths 1..N are used).
        """
        self.max_gram = N
        self.all_words = pol_words | eng_words
        # Despite the name this is a set: every n-gram found in either vocabulary.
        self.every_n_gram_list = self.buildable_n_gram()
        self.pol_freq = self.lang_gram_freq(pol_words)
        self.eng_freq = self.lang_gram_freq(eng_words)
        # gram -> (frequency in Polish, frequency in English); 0 when unseen.
        self.languages_freq_compare = {
            gram: (self.freq(self.pol_freq, gram), self.freq(self.eng_freq, gram))
            for gram in self.every_n_gram_list
        }

    def N_gram(self, word: str, n: int = 1):
        """Return the list of consecutive length-``n`` substrings of ``word``.

        The result is empty when ``n`` is not positive or exceeds ``len(word)``.
        """
        tokens = list(word)
        # zip over the word shifted by 0..n-1 positions yields each window of n characters.
        shifted = [tokens[start:] for start in range(n)]
        return [''.join(window) for window in zip(*shifted)]

    def every_N_gram(self, word: str):
        """Return all n-grams of ``word`` for n = 1..``max_gram``, shortest first."""
        return list(chain.from_iterable(
            self.N_gram(word, size) for size in range(1, self.max_gram + 1)
        ))

    def buildable_n_gram(self):
        """Return the set of every n-gram that occurs in any word of ``all_words``."""
        grams = set()
        for word in self.all_words:
            grams.update(self.every_N_gram(word))
        return grams

    def lang_gram_freq(self, lang_set: set):
        """Return ``{gram: occurrences / number_of_words}`` for one vocabulary.

        Every occurrence is counted, so a gram that repeats inside words can
        get a value above 1.  An empty vocabulary yields an empty dict.
        """
        counts = {}
        for word in lang_set:
            for gram in self.every_N_gram(word):
                counts[gram] = counts.get(gram, 0) + 1
        total_words = len(lang_set)
        return {gram: count / total_words for gram, count in counts.items()}

    def freq(self, dct: dict, gram: str):
        """Return the value stored for ``gram`` in ``dct``, or 0 if it is absent."""
        return dct.get(gram, 0)

    def word_score(self, word: str) -> float:
        """Score ``word``: a value >= 0 means Polish, a negative value English.

        Subclasses must override this.  The base implementation is
        deliberately a no-op returning None (instead of raising) so that an
        overriding method may safely call ``super().word_score(word)``.
        """
        return None

    def test_set_acc(self, test_set: dict) -> float:
        """Return the fraction of ``test_set`` entries that are guessed correctly.

        Args:
            test_set: mapping ``word -> expected label`` ('pol' or 'ang').

        Returns:
            Accuracy in [0, 1]; 0.0 for an empty test set (previously this
            raised ZeroDivisionError).
        """
        if not test_set:
            return 0.0
        hits = sum(
            1 for word, expected in test_set.items()
            if self.guess_lang(word=word) == expected
        )
        return hits / len(test_set)

    def guess_lang_all(self, words_to_guess: list) -> dict[str, str]:
        """Return ``{word: guessed label}`` for every word in ``words_to_guess``."""
        return {word: self.guess_lang(word=word) for word in words_to_guess}

    def guess_lang(self, *, word: str = None, words: list = None) -> 'str | dict[str, str]':
        """Guess the language of a single word or of a batch of words.

        Args:
            word: a single word to classify.
            words: a collection of words; when non-empty it takes precedence
                over ``word``.

        Returns:
            'pol' or 'ang' for a single word, or a ``{word: label}`` dict
            when ``words`` is used.

        Raises:
            TypeError: if neither ``word`` nor ``words`` is supplied.
        """
        if words:
            return self.guess_lang_all(words)
        if word is None:
            if words is not None:
                # An explicitly passed empty batch has an empty answer.
                return {}
            raise TypeError("guess_lang() requires either word= or words=")
        return 'pol' if self.word_score(word) >= 0 else 'ang'
    

In [4]:
class BinaryWordScore(LanguageGuesser):
    """Guesser whose score is a vote count over the n-grams of a word.

    Each known n-gram votes +1 when it is more frequent in Polish and -1 when
    it is more frequent in English; an n-gram seen in only one language
    decides the result immediately.
    """

    def __init__(self, pol_words:set = POL, eng_words:set = ANG, N:int = 2):
        """Forward the vocabularies and the maximum n-gram length to the base class."""
        super().__init__(pol_words, eng_words, N)

    def word_score(self,word:str)->float:
        """Return the vote balance for ``word`` (>= 0 means Polish, < 0 English).

        N-grams are visited in the order given by ``every_N_gram``.  The first
        one with zero Polish frequency returns -1 at once, the first one with
        zero English frequency returns 1 at once.  N-grams absent from
        ``languages_freq_compare`` and ties do not affect the balance.
        """
        balance = 0
        for gram in self.every_N_gram(word):
            pair = self.languages_freq_compare.get(gram)
            if pair is None:
                # n-gram never seen in either vocabulary
                continue
            pol_share, eng_share = pair
            if pol_share == 0:
                return -1
            if eng_share == 0:
                return 1
            if pol_share > eng_share:
                balance += 1
            elif pol_share < eng_share:
                balance -= 1
        return balance
    

In [5]:
# Read the labelled test file; the first whitespace-separated field of each
# line becomes the value and the second field the key (word -> label).
# presumably the labels are 'pol' / 'ang' - verify against the file contents
with open('do_wprawki.txt', 'r', encoding='utf8') as handle:
    data = handle.read().splitlines()
test_set = {fields[1]: fields[0] for fields in map(str.split, data)}
# Model using n-grams of length 1 to 5.
first_guess = BinaryWordScore(N=5)

In [6]:
# Fraction of test_set words whose guessed label equals the expected label.
first_guess.test_set_acc(test_set)
# Disabled example of classifying several words in one call:
#first_guess.guess_lang(words = ['inaudiable','extinguish','operacja'])

0.9314086417360976