In [77]:
from os import linesep
import string
from collections import Counter
import numpy as np

dataset = {'de-en-de': 'europarl-v7.de-en.lc.de',
          'de-en-en': 'europarl-v7.de-en.lc.en',
          'fr-en-fr': 'europarl-v7.fr-en.lc.fr',
          'fr-en-en': 'europarl-v7.fr-en.lc.en',
          'sv-en-sv': 'europarl-v7.sv-en.lc.sv',
          'sv-en-en': 'europarl-v7.sv-en.lc.en',}

class text_parser:
    def __init__(self, data = dataset):
        self.data = data
        self.sentences = []
        self.words = {}
        self.keys = []

    def parse(self, keys):
        self.keys = keys
        for key in keys:
            with open(self.data[key], 'r') as file:
                s = file.read()
                sents = s.split(" .\n")
                for sent in sents:
                    self.sentences.append(sent)
                words =  s.split(" ")
                self.words[key] = [word for word in words if word not in string.punctuation]
        
    def count(self, n = 10):
        for key in self.keys:
            most_common = Counter(self.words[key]).most_common(n)
            print('\nFor dataset: '+str(key) +', the most common words are:')
            print(*most_common, sep = "\n")
    
    def get_words(self, all_words = True, unique = False):
        if all_words: 
            all_words = [word for words in self.words.values() for word in words]
            if unique: return np.unique(all_words)
            else: return all_words
        else: return self.words
    
    def get_sent(self, n = -1):
        if n == -1: return self.sentences
        else: return self.sentences[:n]
  
    def prob_words(self,word_list):
        all_words = self.get_words(all_words = True)
        C = Counter(all_words)
        for word in word_list:
            if C[word] == 0: print(word+" is not used in: "+ str(self.keys))
            else: print("Probability of: "+ word +" in "+str(self.keys)+ " is: "+str(C[word]/C.total()))

   

In [66]:
parser = text_parser()
parser.parse(['de-en-de', 'de-en-en'])
parser.count()


For dataset: de-en-de, the most common words are:
('die', 9649)
('der', 9139)
('und', 6920)
('in', 3934)
('zu', 3136)
('den', 2955)
('daß', 2725)
('von', 2448)
('für', 2432)
('ist', 2259)

For dataset: de-en-en, the most common words are:
('the', 18696)
('of', 9553)
('to', 9029)
('and', 7230)
('in', 5762)
('is', 4441)
('a', 4337)
('that', 4272)
('for', 2939)
('this', 2832)


In [78]:
parser2 = text_parser()
parser2.parse(['fr-en-en', 'de-en-en','sv-en-en'])
parser2.prob_words(['speaker','zebra'])

Probability of: speaker in ['fr-en-en', 'de-en-en', 'sv-en-en']is: 4.2331408750800126e-05
zebra is not used in: ['fr-en-en', 'de-en-en', 'sv-en-en']


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=27527141-1c2d-41eb-a7d6-3699f108d4e9' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>