# Text Analyzer

## Import libraries

In [1]:
import time
import spacy
import sqlite3
import lemminflect

In [2]:
# Load the "en_core_web_sm" spaCy pipeline once at module level; the 'parser'
# and 'ner' components are excluded from loading.
NLP = spacy.load("en_core_web_sm", exclude = ['parser', 'ner'])

## Connect to database

In [3]:
# SQLite file holding the word-level data. The path is relative, so it is
# resolved against the current working directory.
# NOTE(review): sqlite3.connect() creates an empty database when the file is
# missing, so a wrong path would only surface later as a "no such table" error.
DATABASE_FILENAME = 'word_cefr_minified.db'

# Module-level connection and cursor; `cursor` is used by the query code in
# this notebook and `conn` is closed in the final cell.
conn = sqlite3.connect(DATABASE_FILENAME)
cursor = conn.cursor()

## Calculation functions and variables

In [4]:
# Contraction fragments (as lower-cased token text) and the full word that
# replaces each of them during tokenization.
ABBREVIATION_MAPPING = dict((
    ("'m", "am"),
    ("'s", "is"),
    ("'re", "are"),
    ("'ve", "have"),
    ("'d", "had"),
    ("n't", "not"),
    ("'ll", "will"),
))

# Numeric difficulty level (1-6) -> CEFR label, in ascending order.
DIFFICULTY_MAPPING_REVERSE = dict(
    enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2'), start=1)
)


def is_punctuation(word: str) -> bool:
    """Return True when *word* contains no alphabetic character.

    Empty strings, whitespace, digits-only tokens and punctuation marks all
    yield True; any token with at least one letter yields False.

    The previous condition (``not word and not any(...)``) was only true for
    the empty string, so real punctuation such as "," was never detected.
    """
    # any() over an empty string is False, so the empty token is covered too.
    return not any(char.isalpha() for char in word)


def custom_tokenize_text(text: str) -> list[tuple[str, str, str]]:
    """Tokenize *text* with the module-level ``NLP`` pipeline.

    Args:
        text: Raw input text.

    Returns:
        One ``(word, lemma, pos_tag)`` tuple per pipeline token, in document
        order. ``word`` is the lower-cased, stripped token text (or its
        expansion from ``ABBREVIATION_MAPPING``), ``lemma`` is the lower-cased
        result of ``token._.lemma()`` and ``pos_tag`` is ``token.tag_``.
    """
    # Typographic apostrophes would not match the ASCII keys of
    # ABBREVIATION_MAPPING, so normalise them first.
    text = text.replace("’", "'")

    tokens = []
    for token in NLP(text):
        word = token.text.lower().strip()
        word_pos = token.tag_

        abbreviation_form = ABBREVIATION_MAPPING.get(word)
        if abbreviation_form:
            # Contraction fragment: the expansion is used as word and lemma.
            word = abbreviation_form
            lemma = word
        else:
            # `token._.lemma` is a custom token extension (presumably the one
            # registered by the lemminflect import - confirm). Check for None
            # *before* calling .lower(); the original lower-cased first, which
            # made its fallback branch unreachable.
            proposed_lemma = token._.lemma()
            lemma = word if proposed_lemma is None else proposed_lemma.lower()

        tokens.append((word, lemma, word_pos))

    return tokens


def fetch_word_pos_level_tokens(word_pos_tokens_set: set[tuple[str, str]]) -> dict[tuple[str, str], float]:
    """Look up the average level of each (word, POS tag) pair in the database.

    For every requested pair the query averages ``word_pos.level`` over the
    rows whose tag equals the requested POS tag; when there is no such row it
    falls back to the average over all rows of that word. Words that are not
    present in the ``words`` table produce no entry in the result.

    Args:
        word_pos_tokens_set: Distinct ``(word, pos_tag)`` pairs to look up.

    Returns:
        Mapping ``(word, pos_tag) -> average level`` for the pairs that were
        found. An empty input yields an empty dict without touching the
        database (an empty ``VALUES`` list would be an SQL syntax error).
    """
    if not word_pos_tokens_set:
        return {}

    query_template = '''
        WITH word_pos_tags(word, pos_tag) AS (
            VALUES {}
        )
        SELECT
            word_pos_tags.word,
            word_pos_tags.pos_tag,
            COALESCE(
                AVG(CASE WHEN pt.tag = word_pos_tags.pos_tag THEN wp.level END),
                AVG(wp.level)
            ) AS avg_level
        FROM word_pos_tags
        JOIN words w ON word_pos_tags.word = w.word
        JOIN word_pos wp ON w.word_id = wp.word_id
        JOIN pos_tags pt ON wp.pos_tag_id = pt.tag_id
        GROUP BY word_pos_tags.word, word_pos_tags.pos_tag
    '''

    # Each pair binds two parameters. 400 pairs = 800 parameters, which stays
    # below the 999-parameter default of older SQLite builds. The pairs are
    # distinct and the query groups per pair, so batching cannot change the
    # result.
    batch_size = 400
    pairs = list(word_pos_tokens_set)

    levels = {}
    for start in range(0, len(pairs), batch_size):
        batch = pairs[start:start + batch_size]
        # Only "(?, ?)" groups are formatted into the SQL; the values
        # themselves are always bound as parameters.
        placeholders = ','.join(['(?, ?)'] * len(batch))
        parameters = [item for pair in batch for item in pair]

        cursor.execute(query_template.format(placeholders), parameters)
        for word, pos_tag, avg_level in cursor.fetchall():
            # A NULL average (all levels NULL) cannot be converted to float;
            # treat it like a word that was not found.
            if avg_level is not None:
                levels[(word, pos_tag)] = float(avg_level)

    return levels


def get_word_pos_tokens_set(tokens: list[tuple[str, str, str]]) -> set[tuple[str, str]]:
    """Collect the distinct (word, POS tag) pairs of the given tokens.

    A token is skipped when ``is_punctuation`` returns True for its lemma
    (index 1); the pair itself is built from the word (index 0) and the POS
    tag (index 2).
    """
    unique_pairs = set()
    for entry in tokens:
        if is_punctuation(entry[1]):
            continue
        unique_pairs.add((entry[0], entry[2]))
    return unique_pairs


def get_levels_tokens(tokens: list[tuple[str, str, str]]) -> list[tuple[str, str, str, float]]:
    """Attach a level to every ``(word, lemma, pos_tag)`` token.

    The distinct (word, POS tag) pairs are looked up once through
    ``fetch_word_pos_level_tokens``; tokens without a result get level 0.
    The order and length of *tokens* are preserved.
    """
    known_levels = fetch_word_pos_level_tokens(get_word_pos_tokens_set(tokens))

    annotated_tokens = []
    for word, lemma, word_pos in tokens:
        found_level = known_levels.get((word, word_pos))
        # 0 marks "no level available" for this token.
        annotated_tokens.append(
            (word, lemma, word_pos, 0 if found_level is None else found_level)
        )

    return annotated_tokens


def get_word_level_count_statistic(level_tokens: list[tuple[str, str, str, float]]) -> list[int]:
    """Count tokens per rounded level.

    Returns a six-element list where index ``k - 1`` holds the number of
    tokens whose level (index 3) rounds to ``k``. Tokens whose level rounds
    to 0 are not counted.
    """
    buckets = [0, 0, 0, 0, 0, 0]
    rounded_levels = (round(entry[3]) for entry in level_tokens)
    # filter(None, ...) drops the zero levels.
    for bucket in filter(None, rounded_levels):
        buckets[bucket - 1] += 1
    return buckets


def get_word_level_count_statistic_unique(level_tokens: list[tuple[str, str, str, float]]) -> list[int]:
    """Count distinct (word, POS tag) pairs per rounded level.

    Works like the per-token count, except that each (word, POS tag) pair
    contributes only once, using the level of its first occurrence that
    rounds to a non-zero value.
    """
    seen_pairs = set()
    buckets = [0] * 6
    for entry in level_tokens:
        rounded = round(entry[3])
        pair = (entry[0], entry[2])
        if not rounded or pair in seen_pairs:
            continue
        seen_pairs.add(pair)
        buckets[rounded - 1] += 1

    return buckets


def get_not_found_words(level_tokens: list[tuple[str, str, str, float]]) -> set[str]:
    """Return the purely alphabetic words that have no level.

    A token qualifies when its level (index 3) is falsy and its word
    (index 0) is non-empty and consists of alphabetic characters only.
    """
    # str.isalpha() is False for the empty string, which covers the
    # "non-empty" requirement as well.
    return {
        entry[0]
        for entry in level_tokens
        if not entry[3] and entry[0].isalpha()
    }


def filter_for_desired_level(level_tokens: list[tuple[str, str, str, float]],
                            min_level: float, max_level: float = 6) -> set[tuple[str, str, str, float]]:
    """Return the distinct tokens whose level lies in [min_level, max_level].

    Both bounds are inclusive; the level is read from index 3 of each token.
    """
    return {
        entry
        for entry in level_tokens
        if min_level <= entry[3] <= max_level
    }


In [5]:
# Sample English passage used as input for the analysis cells.
# Source of the text: ChatGPT 3.5
input_text = """
In the heart of every forest, a hidden world thrives among the towering trees. Trees, 
those silent giants, are more than just passive observers of nature's drama; they are 
active participants in an intricate dance of life.

Did you know that trees communicate with each other? It's not through words or gestures 
like ours, but rather through a complex network of fungi that connect their roots 
underground. This network, often called the "wood wide web," allows trees to share 
nutrients, water, and even warnings about potential threats.

But trees are not just generous benefactors; they are also masters of adaptation. Take 
the mighty sequoias, for example, towering giants that have stood the test of time for 
thousands of years. These giants have evolved thick, fire-resistant bark to withstand 
the frequent wildfires of their native California.

And speaking of longevity, did you know that some trees have been around for centuries, 
witnessing history unfold? The ancient bristlecone pines of the American West, for 
instance, can live for over 5,000 years, making them some of the oldest living organisms 
on Earth.

So the next time you find yourself wandering through a forest, take a moment to appreciate 
the remarkable world of trees. They may seem like silent spectators, but their lives are 
full of fascinating stories waiting to be discovered.
"""

In [6]:
# Time the two stages separately. time.perf_counter() is used instead of
# time.time() because it is a monotonic, high-resolution clock intended for
# measuring elapsed intervals; the wall clock can jump while a stage runs.
start_time_nlp = time.perf_counter()
tokens = custom_tokenize_text(input_text)

print("NLP:", round((time.perf_counter() - start_time_nlp) * 1000), "ms")

start_time_cefr = time.perf_counter()
level_tokens = get_levels_tokens(tokens)
print("CEFR levels:", round((time.perf_counter() - start_time_cefr) * 1000), "ms")

print('-' * 30)

# Length is in characters of the raw text; the token count includes every
# pipeline token.
print("Text length:", len(input_text))
print("Total tokens:", len(tokens))

NLP: 318 ms
CEFR levels: 3 ms
------------------------------
Text length: 1370
Total tokens: 275


In [7]:
# Print a tab-separated table of the first 200 tokens, skipping tokens whose
# POS tag is '_SP'.
counter = 0
print('\t'.join(("WORD".ljust(26), "LEMMA".ljust(26), "POS", "LEVEL", "CEFR")))
print('-' * 85)
for word, lemma, pos, level in level_tokens:
    if pos == '_SP':
        continue

    # Prints None when the rounded level has no CEFR label (e.g. level 0).
    cefr_label = DIFFICULTY_MAPPING_REVERSE.get(round(level))
    print(f'{word:<26}\t{lemma:<26}\t{pos}\t{level:.2f}\t{cefr_label}')

    counter += 1
    if counter >= 200:
        break

WORD                      	LEMMA                     	POS	LEVEL	CEFR
-------------------------------------------------------------------------------------
in                        	in                        	IN	1.00	A1
the                       	the                       	DT	1.00	A1
heart                     	heart                     	NN	1.00	A1
of                        	of                        	IN	1.00	A1
every                     	every                     	DT	1.00	A1
forest                    	forest                    	NN	2.00	A2
,                         	,                         	,	0.00	None
a                         	a                         	DT	1.00	A1
hidden                    	hidden                    	JJ	3.00	B1
world                     	world                     	NN	1.00	A1
thrives                   	thrive                    	VBZ	5.86	C2
among                     	among                     	IN	2.00	A2
the                       	the                       	DT	1.00	A

In [8]:
# Per-level token counts; every occurrence of a word is counted.
difficulty_levels_count = get_word_level_count_statistic(level_tokens)

print('CEFR statistic (total words):')
for i, words_at_level in enumerate(difficulty_levels_count, start=1):
    print(f'{DIFFICULTY_MAPPING_REVERSE.get(i)}: {words_at_level}')

CEFR statistic (total words):
A1: 136
A2: 37
B1: 27
B2: 11
C1: 2
C2: 7


In [9]:
# Per-level counts where each (word, POS tag) pair is counted once.
difficulty_levels_count_unique = get_word_level_count_statistic_unique(level_tokens)

print('CEFR statistic (unique words):')
for i, words_at_level in enumerate(difficulty_levels_count_unique, start=1):
    print(f'{DIFFICULTY_MAPPING_REVERSE.get(i)}: {words_at_level}')

CEFR statistic (unique words):
A1: 69
A2: 34
B1: 23
B2: 11
C1: 2
C2: 7


In [10]:
# Alphabetic words without a level, listed in sorted order.
not_found_words_set = get_not_found_words(level_tokens)
not_found_words_list = sorted(not_found_words_set)

print('Not found words:', len(not_found_words_list))

if not_found_words_list:
    print('\n'.join(not_found_words_list))

Not found words: 0


In [11]:
# Distinct tokens with level >= 4, ordered by POS tag and then by word.
desired_level_words_set = filter_for_desired_level(level_tokens, 4)
desired_level_words_list = sorted(
    desired_level_words_set,
    key=lambda entry: (entry[2], entry[0]),
)

print('\tWords with level B2 and higher:', len(desired_level_words_list))
for word, _, pos, level in desired_level_words_list:
    cefr_label = DIFFICULTY_MAPPING_REVERSE.get(round(level))
    print(word.lower().ljust(26), pos.ljust(6), f"{level:.2f}".ljust(6), cefr_label)

	Words with level B2 and higher: 17
mighty                     JJ     4.00   B2
potential                  JJ     4.00   B2
bristlecone                NN     6.00   C2
living                     NN     4.00   B2
longevity                  NN     5.97   C2
california                 NNP    6.00   C2
benefactors                NNS    6.00   C2
fungi                      NNS    5.19   C1
masters                    NNS    4.00   B2
observers                  NNS    4.00   B2
pines                      NNS    4.00   B2
sequoias                   NNS    6.00   C2
wildfires                  NNS    6.00   C2
underground                RB     4.00   B2
withstand                  VB     5.12   C1
evolved                    VBN    4.00   B2
thrives                    VBZ    5.86   C2


In [12]:
# Close the module-level SQLite connection now that all queries are done.
conn.close()