In [3]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Sample input text for keyword extraction; replace it with any text, of any length.
# NOTE: no preprocessing happens in this cell -- punctuation and letter case are
# left exactly as written, and the later cells operate on this raw string.
text = "This is a sample text for keyword extraction."

In [5]:
# Tokenize the raw text into a list of word and punctuation tokens
# (the trailing "." becomes a token of its own; original casing is kept).
tokens = nltk.word_tokenize(text)
tokens  # bare last expression: display the token list

['This', 'is', 'a', 'sample', 'text', 'for', 'keyword', 'extraction', '.']

In [6]:
# nltk.pos_tag labels every token with a part-of-speech tag
# (e.g. DT determiner, VBZ verb, JJ adjective, NN noun).
tags = nltk.pos_tag(tokens=tokens, lang='eng')
# Every noun tag starts with "NN": NN (singular), NNS (plural), NNP / NNPS (proper).
# Matching on the prefix keeps plural and proper nouns, which an exact
# comparison against "NN" would silently drop from the keyword candidates.
nouns = [word for (word, tag) in tags if tag.startswith("NN")]

In [7]:
# Display the (token, part-of-speech tag) pairs returned by nltk.pos_tag.
tags

[('This', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('sample', 'JJ'),
 ('text', 'NN'),
 ('for', 'IN'),
 ('keyword', 'NN'),
 ('extraction', 'NN'),
 ('.', '.')]

In [8]:
# Display the tokens that were kept as nouns (the keyword candidates).
nouns

['text', 'keyword', 'extraction']

In [9]:
# Fit a term frequency-inverse document frequency (TF-IDF) model on the text.
# The corpus is the single string `text`, so `tfidf` holds one row of term weights.
# NOTE(review): with only one document the inverse-document-frequency part cannot
# tell terms apart (every term occurs in that one document) -- fit on a larger
# corpus if IDF is meant to influence the ranking.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform([text])

In [10]:
# Mapping from each (lowercased) term to its column index in the TF-IDF matrix.
# NOTE: these integers are feature indices, not importance scores; the term
# weights themselves are stored in `tfidf`.
vectorizer.vocabulary_

{'this': 6,
 'is': 2,
 'sample': 4,
 'text': 5,
 'for': 1,
 'keyword': 3,
 'extraction': 0}

In [11]:
# Keywords are assumed to be nouns; verbs and adjectives could be included too by
# widening the part-of-speech filter that builds `nouns`.
#
# `vectorizer.vocabulary_` only maps each term to its column index in `tfidf`, so
# sorting that dict orders terms alphabetically, not by weight. The actual score of
# a term has to be read from the TF-IDF row through that index.
term_index = vectorizer.vocabulary_
term_scores = tfidf.toarray()[0]  # dense TF-IDF row of the single fitted document

# The vectorizer's vocabulary is lowercased, so nouns are compared in lowercase;
# dict.fromkeys removes repeated nouns while keeping their order of appearance.
noun_terms = dict.fromkeys(word.lower() for word in nouns)

# Highest score first. sorted() is stable (also with reverse=True), so nouns with
# equal scores stay in the order in which they appear in the text. Nouns that the
# vectorizer did not keep as a term are skipped.
top_nouns = sorted(
    (term for term in noun_terms if term in term_index),
    key=lambda term: term_scores[term_index[term]],
    reverse=True,
)

# keywords:
top_nouns

['text', 'keyword', 'extraction']