Import the necessary libraries.

In [48]:
# Import libraries for keywords
from itertools import combinations as _combinations
from queue import Queue

from summa.pagerank_weighted import pagerank_weighted_scipy as _pagerank
from summa.preprocessing.textcleaner import clean_text_by_word as _clean_text_by_word
from summa.preprocessing.textcleaner import tokenize_by_word as _tokenize_by_word
from summa.commons import build_graph as _build_graph
from summa.commons import remove_unreachable_nodes as _remove_unreachable_nodes

from create_input_lists import lemmatize
from collections import Counter

from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from collections import defaultdict

import re

# Added on 5/19/2021
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from collections import defaultdict
from nltk.stem import WordNetLemmatizer

Define the necessary functions to support TextRank.

In [61]:
# Number of consecutive words that form one co-occurrence window: the first
# WINDOW_SIZE words are paired with each other, and the sliding queue used
# afterwards always holds WINDOW_SIZE - 1 words.
WINDOW_SIZE = 2

"""Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
Example: filter for nouns and adjectives:
INCLUDING_FILTER = ['NN', 'JJ']"""
# Words whose tag is in this list (or that carry no tag) become graph vertices.
INCLUDING_FILTER = ['NN', 'JJ']
# Words whose tag is in this list are rejected. _get_words_for_graph raises
# ValueError when both filter lists are non-empty, so keep one of them empty.
EXCLUDING_FILTER = []

# helpers that select the words (graph vertices) by part-of-speech tag

def _get_pos_filters():
    """Return the include and exclude POS filters as a pair of frozensets."""
    included = frozenset(INCLUDING_FILTER)
    excluded = frozenset(EXCLUDING_FILTER)
    return included, excluded


def _get_words_for_graph(tokens):
    """Collect the unique lemmas that become vertices of the keyword graph.

    :param tokens: dict of word -> unit; each unit exposes ``tag`` and ``token``
    :return: list of unique ``unit.token`` values, in first-seen order
    :raises ValueError: if both the include and the exclude filter are set
    """
    include_filters, exclude_filters = _get_pos_filters()
    if include_filters and exclude_filters:
        raise ValueError("Can't use both include and exclude filters, should use only one")

    result = []
    seen = set()  # constant-time duplicate check; `result` preserves the order
    for unit in tokens.values():
        if exclude_filters and unit.tag in exclude_filters:
            continue
        # Keep the word when no include filter is configured, when its tag is
        # allowed, or when it carries no tag at all.
        if not include_filters or unit.tag in include_filters or not unit.tag:
            if unit.token not in seen:
                seen.add(unit.token)
                result.append(unit.token)
    print('Vertices in Implementation:', result, '\n')
    return result


def _get_first_window(split_text):
    """Return the leading ``WINDOW_SIZE`` words of ``split_text``."""
    window_end = WINDOW_SIZE
    return split_text[0:window_end]


def _set_graph_edge(graph, tokens, word_a, word_b):
    # print('Word A:', word_a, '\n')
    # print('Word B:', word_b, '\n')
    if word_a in tokens and word_b in tokens:
        # print('YES')
        lemma_a = tokens[word_a].token
        lemma_b = tokens[word_b].token
        edge = (lemma_a, lemma_b)

        if graph.has_node(lemma_a) and graph.has_node(lemma_b) and not graph.has_edge(edge):
            print('Edge added in Implementation:', edge, '\n')
            graph.add_edge(edge)


def _process_first_window(graph, tokens, split_text):
    """Try to link every pair of words found in the first window of the text."""
    for pair in _combinations(_get_first_window(split_text), 2):
        _set_graph_edge(graph, tokens, *pair)


def _init_queue(split_text):
    """Create the sliding-window queue seeded with the first window minus its first word."""
    window_queue = Queue()
    seed_words = _get_first_window(split_text)[1:]
    for seed in seed_words:
        window_queue.put(seed)
    return window_queue


def _process_word(graph, tokens, queue, word):
    """Try to connect ``word`` with each word currently held in ``queue``."""
    for queued_word in _queue_iterator(queue):
        _set_graph_edge(graph, tokens, word, queued_word)


def _update_queue(queue, word):
    """Slide the window: drop the oldest queued word and enqueue ``word``."""
    queue.get()
    queue.put(word)
    expected_size = WINDOW_SIZE - 1
    assert queue.qsize() == expected_size


def _process_text(graph, tokens, split_text):
    """Slide the window over every word that follows the first window."""
    window_queue = _init_queue(split_text)
    for current_word in split_text[WINDOW_SIZE:]:
        _process_word(graph, tokens, window_queue, current_word)
        _update_queue(window_queue, current_word)


def _queue_iterator(queue):
    iterations = queue.qsize()
    for i in range(iterations):
        var = queue.get()
        yield var
        queue.put(var)


def _set_graph_edges(graph, tokens, split_text):
    """Add the co-occurrence edges: first window first, then the rest of the text."""
    for add_edges in (_process_first_window, _process_text):
        add_edges(graph, tokens, split_text)


def _extract_tokens(lemmas, scores, ratio, words):
    lemmas.sort(key=lambda s: scores[s], reverse=True)
    # print('Inside extract_tokens, after sorting lemmas:', lemmas, '\n')

    # If no "words" option is selected, the number of sentences is
    # reduced by the provided ratio, else, the ratio is ignored.
    length = len(lemmas) * ratio if words is None else words
    if int(length) == 0:
        length = 1
    # print('Inside extract_tokens, length of reduction:', length, '\n')
    return [(scores[lemmas[i]], lemmas[i],) for i in range(int(length))]


def _lemmas_to_words(tokens):
    lemma_to_word = {}
    for word, unit in tokens.items():
        lemma = unit.token
        if lemma in lemma_to_word:
            lemma_to_word[lemma].append(word)
        else:
            lemma_to_word[lemma] = [word]
    return lemma_to_word


# modified 5/19/2021: removed translation to original text
def _get_keywords_with_score(extracted_lemmas):
    """
    :param extracted_lemmas:list of tuples
    :return: dict of {keyword:score}
    """
    keywords = {}
    for score, lemma in extracted_lemmas:
        keywords[lemma] = score
    return keywords


def _strip_word(word):
    """Return the first token that ``_tokenize_by_word`` yields for ``word``, or "" if none."""
    pieces = list(_tokenize_by_word(word))
    if not pieces:
        return ""
    return pieces[0]


def _get_combined_keywords(_keywords, tokens, split_text):
    """
    Merge keywords that appear next to each other in the text into phrases.

    The caller's ``_keywords`` dict is not modified; a copy is consumed.

    :param _keywords: dict of keywords:scores
    :param tokens: dict of word -> unit; ``unit.token`` is the lemma
    :param split_text: list of strings
    :return: combined_keywords: list of space-joined keyword phrases
    """
    result = []
    _keywords = _keywords.copy()
    len_text = len(split_text)

    def _to_lemma(raw_word):
        # Strip the raw word and map it to its lemma when one is known.
        stripped = _strip_word(raw_word)
        return tokens[stripped].token if stripped in tokens else stripped

    def _emit(combined_word):
        # Record the phrase and retire its keywords so they are not reused.
        print(';;;', combined_word)
        for keyword in combined_word:
            _keywords.pop(keyword)
        result.append(" ".join(combined_word))
        print(_keywords)
        print('result', result)

    for i in range(len_text):
        word = _to_lemma(split_text[i])
        if word not in _keywords:
            continue
        combined_word = [word]
        if i + 1 == len_text:
            result.append(word)   # appends last word if keyword and doesn't iterate
            print('Result: ', result)
            continue
        for j in range(i + 1, len_text):
            other_word = _to_lemma(split_text[j])
            if other_word in _keywords and other_word not in combined_word:
                combined_word.append(other_word)
            else:
                _emit(combined_word)
                break
        else:
            # The keyword run reached the end of the text without meeting a
            # non-keyword; previously the whole phrase was silently dropped.
            _emit(combined_word)
    return result


def _get_average_score(concept, _keywords):
    word_list = concept.split()
    word_counter = 0
    total = 0
    for word in word_list:
        total += _keywords[word]
        word_counter += 1
    print('\'{}\': {}'.format(concept, total / word_counter))
    return total / word_counter


def _format_results(_keywords, combined_keywords, split, scores):
    """
    Order the combined keywords by average score and shape the return value.

    ``combined_keywords`` is sorted in place, best first.

    :param _keywords: dict of keywords:scores
    :param combined_keywords: list of word/s
    :param split: return the sorted list instead of a newline-joined string
    :param scores: return (keyword, average score) tuples; wins over ``split``
    """
    def average_of(concept):
        return _get_average_score(concept, _keywords)

    combined_keywords.sort(key=average_of, reverse=True)
    if scores:
        return [(concept, average_of(concept)) for concept in combined_keywords]
    return combined_keywords if split else "\n".join(combined_keywords)


def keywords(text, ratio=1/3, words=None, language="english", split=False, scores=False, deaccent=False, additional_stopwords=None):
    """
    Extract keywords from ``text`` with TextRank, printing each intermediate step.

    :param text: the text to analyse
    :param ratio: fraction of the graph nodes kept as keywords when ``words`` is None
    :param words: number of keywords to keep; when given, ``ratio`` is ignored
    :param language: forwarded to the text cleaner
    :param split: return a list of keywords instead of a newline-joined string
    :param scores: return a list of (keyword, score) tuples; wins over ``split``
    :param deaccent: forwarded to the text cleaner and the tokenizer
    :param additional_stopwords: forwarded to the text cleaner
    :return: list when ``split`` or ``scores`` is set, otherwise a string
    :raises ValueError: if ``text`` is not a ``str``
    """
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language, deacc=deaccent, additional_stopwords=additional_stopwords)
    # Tokenize with the same deaccent setting as the cleaner, exactly as
    # get_graph does, so both entry points build edges from the same words.
    split_text = list(_tokenize_by_word(text, deacc=deaccent))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text # It's no longer used

    print('Number of nodes in graph in implementation:', len(graph.nodes()), '\n')
    _remove_unreachable_nodes(graph)
    print('Number of nodes in graph after removing unreachables in implementation:', len(graph.nodes()), '\n')

    # PageRank cannot be run in an empty graph. Return the empty value of the
    # type the caller asked for: a list for split/scores, a string otherwise.
    if len(graph.nodes()) == 0:
        return [] if (split or scores) else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)
    print('PageRank scores from implementation:', pagerank_scores, '\n')

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)
    print('Extracted lemmas:', extracted_lemmas, '\n')

    # Named keyword_scores so the local does not shadow this function.
    keyword_scores = _get_keywords_with_score(extracted_lemmas)
    print('Final Keywords:', keyword_scores, '\n')

    # text.split() keeps the raw whitespace-separated words of the text
    print('Finding Combined Keywords.\n')
    combined_keywords = _get_combined_keywords(keyword_scores, tokens, text.split())
    print('Final Combined Keywords:', combined_keywords, '\n')
    print('Final Keywords: ', keyword_scores, '\n')

    final_results = _format_results(keyword_scores, combined_keywords, split, scores)
    print('Final Results:', final_results, '\n')

    return final_results


def get_graph(text, language="english", deaccent=False):
    """Build and return the word graph for ``text`` without ranking it.

    :param text: the text to analyse
    :param language: forwarded to the text cleaner
    :param deaccent: forwarded to the text cleaner and the tokenizer
    :return: the graph with its vertices and co-occurrence edges set
    """
    word_units = _clean_text_by_word(text, language, deacc=deaccent)
    words_in_order = list(_tokenize_by_word(text, deacc=deaccent))

    word_graph = _build_graph(_get_words_for_graph(word_units))
    _set_graph_edges(word_graph, word_units, words_in_order)
    return word_graph

In [62]:
def simple_preprocessText(text):
    """Lower-case ``text`` and delete every character that is not a letter, hyphen or whitespace."""
    lowered = text.lower()
    return re.sub(r'[^a-zA-Z-\s]', '', lowered)

In [66]:
# assign the text from the 'equal concentration' cluster
# clusteredText = 'A solution has an equal concentration of H+ and OH- _ This solution is probably: neutral Correct _ Neutral substances have equal concentrations of H+ and OH- _ Acids have a higher concentration of H+ than OH- _ Bases have a lower concentration of H+ than OH- _ Neutral substances have equal concentrations of H+ and OH-.. A solution has a higher concentration of H+ than OH- _ This solution is probably: acidic Correct _ Acids have a higher concentration of H+ than OH- _ Acids have a higher concentration of H+ than OH- _ Bases have a lower concentration of H+ than OH- _ Neutral substances have equal concentrations of H+ and OH-.. A solution has a lower concentration of H+ than OH- _ This solution is probably: basic Correct _ Bases have a lower concentration of H+ than OH- _ Acids have a higher concentration of H+ than OH- _ Bases have a lower concentration of H+ than OH- _ Neutral substances have equal concentrations of H+ and OH-..'

# assign the text from the 'chemical formula' cluster
# clusteredText = 'What is the chemical formula for copper(I) carbonate? Cu2CO3 Correct _ The 2:1 ratio of Cu+ to CO32- results in a neutral compound _ The (I) beside the copper means that the ion in the compound is Cu1+ _ .  Enter the chemical formula for copper(II) iodide _ Correct _ The ions are Cu2+ and I-.. Which of the following statements is true of the chemical equation below? $$\mathrm{Cu}(s)\;+\;2\;{\mathrm{AgNO}}_3(aq)\;\rightarrow\;2\;\mathrm{Ag}(s)\;+\;\mathrm{Cu}({\mathrm{NO}}_3)_2(aq)$$ There are 2 atoms of silver represented on the product side of the equation _ Correct _ The coefficient of 2 to the left of the Ag can be thought of as 2 individual atoms of silver..  Copper undergoes oxidation when placed in a solution of silver nitrate _  If 6.2 g of copper is placed into 50.0 mL of a 2.5 M AgNO3 solution, which is the limiting reactant? $$\mathrm{Cu}(s)\;+\;2\;{\mathrm{AgNO}}_3(aq)\;\rightarrow\;\mathrm{Cu}({\mathrm{NO}}_3)_2(aq)\;+\;2\;\mathrm{Ag}(s)$$ AgNO3 Correct _ $$\require{cancel}\begin{array}{l}6.2\cancel{\;\mathrm g\;\mathrm{Cu}\;}\times\;\frac{1\;\cancel{\mathrm{mol}\;\mathrm{Cu}}}{63.55\;\cancel{\mathrm g\;\mathrm{Cu}}}\;\times\;\frac{2\;\mathrm{mol}\;\mathrm{Ag}}{1\;\cancel{\mathrm{mol}\;\mathrm{Cu}}}\;=\;0.195\;\mathrm{mol}\;\mathrm{Ag}\\\\0.05\;\mathrm L\;\times\;2.5\;\mathrm M\;=\;0.125\;\mathrm{mol}\;{\mathrm{AgNO}}_3\\\\0.125\;\cancel{\mathrm{mol}\;{\mathrm{AgNO}}_3}\;\times\;\frac{2\;\mathrm{mol}\;\mathrm{Ag}}{2\;\cancel{\mathrm{mol}\;{\mathrm{AgNO}}_3}}\;=\;0.125\;\mathrm{mol}\;\mathrm{Ag}\\\\{\mathrm{AgNO}}_3\;\mathrm{produces}\;\mathrm{the}\;\mathrm{lesser}\;\mathrm{amount}\;\mathrm{of}\;\mathrm{product}\;\mathrm{therefore}\;\mathrm{it}\;\mathrm{is}\;\mathrm{the}\;\mathrm{limiting}\;\mathrm{reactant}.\end{array}$$ . What is the concentration of Cu(NO3)2 when the reaction is complete? (Assume the change in volume from the added Cu(s) is negligible and can be ignored.) 
M Correct _ $$\require{cancel}\begin{array}{l}0.125\cancel{\;\mathrm{mol}\;{\mathrm{AgNO}}_3}\;\times\;\frac{1\;\mathrm{mol}\;\mathrm{Cu}({\mathrm{NO}}_3)_2}{2\;\cancel{\mathrm{mol}\;{\mathrm{AgNO}}_3}}\;=\;0.0625\;\mathrm{mol}\;\mathrm{Cu}({\mathrm{NO}}_3)_2\\\\\frac{0.0625\;\mathrm{mol}\;}{0.0500\;\mathrm L}\;=\;1.25\;\mathrm M\end{array}$$ .'

# assign the text from the 'atom' cluster
# clusteredText = 'Atoms that make it up are billions of years old: _ Nutrient (matter) Correct _ It may be hard to imagine, but your hydrogen atoms are probably about 13 billion years old . _ as old as the universe itself! And your carbon atoms, which formed in the life cycle of a more ancient star, are older than our solar system..'

# assign the text from the 'atom_2' cluster
# clusteredText = 'What is the charge of a neutron? Neutrons have a neutral charge _ Yes, that\'s correct _ Neutrons have a neutral charge _ The word “neutron” sounds like the word “neutral.". What is the atomic mass of sodium (round this number to the nearest whole number)? 23 Correct _ This is the atomic mass for Na.. How many neutrons does sodium usually have? 12 Correct _ The atomic mass (23) minus the number of protons (11) is the number of neutrons.. An element has 43 protons and 50 neutrons _ What is the ATOMIC MASS of this element? 93 Correct _ The sum of the protons and the neutrons is the atomic mass.. Fluorine, F, has an atomic number of 9 and an approximate atomic mass of 19 _ How many neutrons does a fluorine atom have? 10 Correct _ This is the number of neutrons _ The number of neutrons = the atomic mass - the atomic number. Beryllium, Be, has an atomic number of 4 and an atomic mass of approximately 9 _ How many protons does a beryllium atom have? 4 Correct! The atomic number is the number of protons.. Beryllium, Be, has an atomic number of 4 and an atomic mass of approximately 9 _ How many neutrons does a beryllium atom have? 5 Correct _ This is the number of neutrons _ The number of neutrons = the atomic mass - the atomic number.. Beryllium, Be, has an atomic number of 4 and an atomic mass of approximately 9 _ How many electrons does a beryllium atom have? 4 Correct! For an electrically neutral atom, the number of protons = the number of electrons. A mystery atom contains 7 protons, 7 electrons, and 7 neutrons _ What is its mass? What is its charge? Refer to the periodic table: What element is this? 14 0 Oxygen Correct The mass consists of the number of protons added to the number of neutrons The proton has a +1, the neutron has a zero chage, and the electron has a -1 charge _ Make sure to use the answer to the mass question to help you identify the correct atom..'

# assign the text from the 'atom_3' cluster
# clusteredText = 'Is the following statement true or false? "The outer shell of this atom is full." False Correct _ The outer shell (third shell in this case) can hold up to eight electrons, but our atom only has one in its outer shell.. If lithium gives up its third electron, the first shell of the atom is now the outer shell _ This outer shell is full _ What is the charge of this atom? +1 Correct _ This atom has three protons (+3) and two electrons (-2) _ [+3 - 2 = +1] If lithium gives up the electron in its outer shell, it now has two electrons and three protons _ Protons have a positive charge and electrons have a negative charge.. Fluorine, F, has an atomic number of 9 and an approximate atomic mass of 19 _ What would be the charge of fluorine that has a full outer shell? -1 Correct! Fluorine would gain an electron to fill the outer shell _ . What would be the charge of a neon atom that has a full outer shell? no charge Correct! Neon has a full outer shell in its neutral state..  What would be the charge of a beryllium atom that has a full outer shell? +2 Correct! Beryllium would lose the 2 electrons in the outer shell.. Lithium has an atomic number of 3 _ Is lithium likely to interact with other atoms? Yes, because it has one electron in the outer shell _ Correct _ The outer shell (second shell in this case) is not full, so this atom is reactive.. How many covalent bonds can oxygen form to fill its outer shell? 2 Correct _ Oxygen has two vacancies in its outer shell _ It can fill these vacancies by forming two covalent bonds _ Each covalent bond fills one vacancy _ Oxygen has six electrons in its outer shell; a full shell holds eight electrons, so there are two vacancies _ Oxygen participates in two covalent bonds to fill its outer shell.. How many covalent bonds does a carbon need to fill its outer shell? 
4 Correct _ Carbon has four electrons in its outer shell and needs four more to have a full shell _ Each needed electron can form one covalent bond _ In the diagram a line between two atoms indicates that they formed a chemical bond, which means the two atoms are sharing an electron _ An atom will form as many covalent bonds as are necessary for it to fill its outer shell _ Carbon has four electrons in its outer shell _ A full outer shell contains eight electrons _ Carbon needs four more electrons to have a full outer shell.. How many covalent bonds does each oxygen need to fill its outer shell? 2 Correct _ Oxygen has six electrons in its outer shell, so it needs two more to have a full shell _ Each needed electron can form one covalent bond _ In the diagram, a line between two atoms indicates that they formed a chemical bond, which means the two atoms are sharing an electron _ An atom will form as many covalent bonds as are necessary for it to fill its outer shell _ Oxygen has six electrons in its outer shell _ A full outer shell contains eight electrons _ Oxygen needs two more electrons to have a full outer shell..  How many covalent bonds does each hydrogen need to fill its outer shell? 1 Correct _ Hydrogen has one electron in its outer shell, so it only needs one more to have a full shell _ Each needed electron can form one covalent bond _ In the diagram, a line between two atoms indicates that they formed a chemical bond, which means the two atoms are sharing an electron _ An atom will form as many covalent bonds as are necessary for it to fill its outer shell _ Hydrogen has one electron in its outer shell _ For hydrogen, a full outer shell contains two electrons _ Hydrogen needs one more electron to have a full outer shell.. What determines the number of covalent bonds formed by an atom? Electrons in the outer valence shell of the atom Correct _ An atom attempts to fill its outer valence shell..  
Which is the correct formula for the smallest possible hydrocarbon? CH4 Correct, the smallest hydrocarbon is methane! 4 unpaired electrons on the outer shell of carbon will each pair up with an electron from a hyrdogen atom to form 4 covalent bonds _ Look at the periodic table to determine how many covalent bonds carbon can form _ The number of covalent bonds depends on the number of unpaired electrons on its outer shell.. What determines the number of covalent bonds formed by an atom? Electrons in the outer valence shell in the atom _ Correct _ An atom attempts to fill its outer valence shell _ Look at the outer electron shell in the Periodic table and the Atomic Properties table..'

# assign the text from the 'atom_4' cluster
# clusteredText = 'An atom is converted into an ion by adding or removing _ _ electrons Correct _ When electrons are added or removed from an atom, the charge of the atom changes and the atom becomes an ion.. Which is the biggest? Atom Correct _ The others are components of an atom..  Atom 1 and atom 2 form a covalent bond _ Atom 1 tends to draw the shared electrons closer to it _ Which of the following statements is true? Atom 1 is more electronegative than atom 2 _ Correct _ Because atom 1 pulls the shared electrons closer to it, it is the more electronegative atom _ The more electronegative atom will attract the electrons more _ Which atom attracts the electrons more?. The covalent bond is likely to be polar when one of the atoms sharing electrons is much more electronegative than the other atom Correct..'

# assign the text from the 'atom_5' cluster
# clusteredText = 'Nitrogen (N) is much more electronegative than hydrogen (H) _ Which of the following statements is correct about ammonia (NH3)? Each hydrogen atom has a slight positive charge _ Correct _ The nitrogen atom draws electrons to it and the hydrogen becomes partially positive _ Which atom attracts electrons more strongly? Electrons carry a negative charge _ In this molecule, the electronegative N atom will have a partial negative charge and the H atoms will have a partial positive charge.. Nitrogen (N) is much more electronegative than hydrogen (H) _ Which of the following statements is CORRECT about ammonia (NH3)? Each hydrogen atom has a partial positive charge _ Correct _ The nitrogen atom draws electrons to it and the hydrogen becomes partially positive _ Look through the content of the page again.. Nitrogen (N) is much more electronegative than hydrogen (H) _ Which of the following statements is CORRECT about ammonia (NH3)? Each hydrogen atom has a partial positive charge Correct..'

# assign the text from the 'atom_6' cluster
# clusteredText = 'Hydrogen bonding is responsible for each of the following except the force holding hydrocarbons together _ Correct _ Hydrogen bonding only occurs when the hydrogen atom is attached to an electronegative atom _ Carbon is not electronegative _ The force holding hydrocarbons together is called the hydrophobic effect _ Look through the content of the page again.. Hydrogen bonding is responsible for each of the following except the force directly holding hydrocarbons together Correct..'

# assign the text from the 'atom_7' cluster
# This is the only uncommented clusteredText assignment, so it is the input
# that the cell cleans with simple_preprocessText and passes to keywords().
clusteredText = 'The structure of two amino acids is shown, with the side-chain atoms (R group) highlighted in yellow _ Which of these amino acids has a polar (hydrophilic) side chain, such that it would like to interact with water? serine Correct _ Its side chain OH group can interact with water, via hydrogen bonding _ Side-chain atoms that are more similar to water (H2O) are more likely to interact with water.. The amino acid valine is shown with its sidechain atoms highlighted in yellow _ Do you think this amino acid is hydrophilic or hydrophobic? hydrophobic (non-polar) Correct! There are no oxygen atoms here, so the side chain does NOT look like water _ Therefore, it probably does not interact well with water!.'

# Normalise the cluster text before anything else looks at it.
clusteredText = simple_preprocessText(clusteredText)
print('Original Text:', clusteredText, '\n')

custom_stopwords = ['correct', 'true', 'false', 'yes', 'following', 'students', 'student']

lemmatized_clusteredText = lemmatize(clusteredText)
print('Lemmatized Text:', lemmatized_clusteredText, '\n')

# WordNet POS keyed by the first letter of a tag; anything else maps to NOUN.
tag_map = defaultdict(lambda: wn.NOUN, {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})

tokens = list(word_tokenize(lemmatized_clusteredText))

# Run TextRank on the cleaned text; the result is a newline-joined string.
print('Begin TextRank Algorithm. \n')
skill_labels = keywords(clusteredText, additional_stopwords=custom_stopwords)
print('End TextRank Algorithm. \n')
print('Final Keywords', skill_labels)

# The best keyword is everything before the first newline.
top_skill_label = skill_labels.partition('\n')[0]
print('Top Skill Label Before Checking For Failure:', top_skill_label, '\n')

if not top_skill_label:
    print('TextRank did not generate a keyword for the cluster.\n')
    # modified on 5/19/2021: changed to select most frequent lemma instead of most frequent original word
    words_freq = Counter(lemmatized_clusteredText.split())
    # max() returns the first of several equally frequent lemmas; the default
    # leaves the label untouched when there are no lemmas at all.
    top_skill_label, maxFreq = max(
        words_freq.items(),
        key=lambda item: item[1],
        default=(top_skill_label, 0),
    )

print('Top Skill Label After Checking For Failure:', top_skill_label)

Original Text: the structure of two amino acids is shown with the side-chain atoms r group highlighted in yellow  which of these amino acids has a polar hydrophilic side chain such that it would like to interact with water serine correct  its side chain oh group can interact with water via hydrogen bonding  side-chain atoms that are more similar to water ho are more likely to interact with water the amino acid valine is shown with its sidechain atoms highlighted in yellow  do you think this amino acid is hydrophilic or hydrophobic hydrophobic non-polar correct there are no oxygen atoms here so the side chain does not look like water  therefore it probably does not interact well with water 

Lemmatized Text: structure amino acid show chain atom r group highlight yellow amino acid polar hydrophilic chain like interact water serine chain oh group interact water hydrogen bonding chain atom similar water ho likely interact water amino acid valine show sidechain atom highlight yellow think a

In [52]:
def lemmatize_sentence(sentence):
    """Lemmatize every token of ``sentence`` and return them joined by spaces.

    Each token is printed together with the WordNet part of speech chosen
    from the first letter of its POS tag (NOUN unless the letter is J, V or R).

    :param sentence: text to tokenize, tag and lemmatize
    :return: the lemmas joined by single spaces
    """
    pos_by_initial = defaultdict(lambda: wn.NOUN, {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})

    sentence_tokens = list(word_tokenize(sentence))
    lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token, tag in pos_tag(sentence_tokens):
        wordnet_pos = pos_by_initial[tag[0]]
        print(token, wordnet_pos)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))

    return " ".join(lemmas)

lemmatize_sentence('equals')

equals n


'equal'