
Given a file containing text. Complete using only default collections:

    1) Find 10 longest words consisting from largest amount of unique symbols
    
    2) Find rarest symbol for document
    
    3) Count every punctuation char
    
    4) Count every non ascii char
    
    5) Find most common non ascii char for document

In [84]:
from typing import List
from collections import Counter
import re

def get_longest_diverse_words(file_path: str) -> List[str]:
    text = open(file_path)
    longest_words_counter = Counter()
    return_list = []
    for line in text:
        longest_words_counter.update(re.split(r'[— .,!?:;-]', line))
    return sorted(longest_words_counter, key=len, reverse = True)[:10]

def get_rarest_char(file_path: str) -> str:
    text = open(file_path)
    letter_by_counter = Counter()
    
    for line in text:
        for character in line:
            letter_by_counter.update(character)
    return min(letter_by_counter, key=letter_by_counter.get)

def count_punctuation_chars(file_path: str) -> int:
    text = open(file_path)
    punctuation_by_counter = 0
    for line in text:
        for character in line:
            if character in ['.', ',', '!', '?', ':', ';', '-']:
                punctuation_by_counter += 1
    return punctuation_by_counter


def count_non_ascii_chars(file_path: str) -> int:
    text = open(file_path)
    non_ascii_chars_by_counter = 0
    for line in text:
        non_ascii_chars_by_counter += len(re.findall(r'\\u00..', line))
    return non_ascii_chars_by_counter

def get_most_common_non_ascii_char(file_path: str) -> str:
    text = open(file_path)
    non_ascii_char_by_counter = Counter()
    for line in text:
        non_ascii_char_by_counter.update(re.findall(r'\\u00..', line))
    return max(non_ascii_char_by_counter, key=non_ascii_char_by_counter.get)
    

In [5]:
text_str = "hw1_data.txt"

In [87]:
print("10 longest words are: ")
print(get_longest_diverse_words(text_str))

print("\nRarest char is: " + get_rarest_char(text_str))
print("Amount of punctuational chars is: " + str(count_punctuation_chars(text_str)))
print("Amount of non ascii chars is: " + str(count_non_ascii_chars(text_str)))
print("The most common non ascii char is: " + str(get_most_common_non_ascii_char(text_str)))

10 longest words are: 
['unmi\\u00dfverst\\u00e4ndliche', 'Wiederbelebungs\\u00fcbungen', '\\u00bbWaldg\\u00e4nger\\u00ab', 'Meinungs\\u00e4u\\u00dferung', 'Werkst\\u00e4ttenlandschaft', 'unver\\u00e4u\\u00dferlichen', '\\u00bbFreiheitswahl\\u00ab', 'vernunftgem\\u00e4\\u00dfe\n', 'Souver\\u00e4nit\\u00e4tsan', 'D\\u00e4monenschw\\u00e4rme']

Rarest char is: Y
Amount of punctuational chars is: 5300
Amount of non ascii chars is: 2888
The most common non ascii char is: \u00e4
