<a href="https://colab.research.google.com/github/Liza-IITP/Machine-Learning/blob/main/nlp_basics/Basics_NLTK_Tokenize_Stemming_Lemmatization_POStag_StopWords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk

# Fetch the NLTK data packages the cells below rely on.
# nltk.download() skips packages that are already up to date, so re-running
# this cell is cheap.

# Tokenizer models for word_tokenize. Newer NLTK releases (the same ones that
# use the '*_eng' tagger resource name below) load 'punkt_tab' instead of the
# legacy pickled 'punkt' package, so both are requested to cover either case.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list read via nltk.corpus.stopwords.
nltk.download('stopwords')
# WordNet database backing WordNetLemmatizer.
nltk.download('wordnet')
# Perceptron model used by nltk.pos_tag for English.
nltk.download('averaged_perceptron_tagger_eng')
# Open Multilingual WordNet data (presumably needed alongside 'wordnet' on
# some NLTK versions -- keep unless confirmed unnecessary).
nltk.download('omw-1.4')


In [44]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

sentence = "Today morning, Arthur felt very good."

# Split the raw sentence into word and punctuation tokens.
tokens = word_tokenize(sentence)

# A set makes the per-token stop-word membership test cheap.
stop_words = set(stopwords.words('english'))

# Keep purely alphabetic tokens (this drops punctuation) whose lowercase form
# is not an English stop word; the original casing of kept tokens is preserved.
filtered_tokens = list(
    filter(
        lambda token: token.isalpha() and token.lower() not in stop_words,
        tokens,
    )
)

print(filtered_tokens)


['Today', 'morning', 'Arthur', 'felt', 'good']


In [47]:
from nltk import pos_tag
from nltk.stem import (
    LancasterStemmer,
    PorterStemmer,
    RegexpStemmer,
    SnowballStemmer,
)

# Build the four stemmers up front so their outputs can be compared
# side by side on the same token list.
ps = PorterStemmer()
ss = SnowballStemmer("english")
ls = LancasterStemmer()
# Strips a trailing 'ing', 'ed' or 's'; min=4 leaves shorter words untouched.
regexp_stemmer = RegexpStemmer('ing$|ed$|s$', min=4)

# One line of output per stemmer: Porter, Snowball, Lancaster, Regexp.
print(list(map(ps.stem, filtered_tokens)))
print(list(map(ss.stem, filtered_tokens)))
print(list(map(ls.stem, filtered_tokens)))
print(list(map(regexp_stemmer.stem, filtered_tokens)))

# Part-of-speech tags as (word, tag) pairs.
# NOTE(review): tagging happens after stop-word removal, so the tagger sees
# less sentence context than it would on the full token list -- confirm this
# is intended.
tagged = pos_tag(filtered_tokens)
print(tagged)




['today', 'morn', 'arthur', 'felt', 'good']
['today', 'morn', 'arthur', 'felt', 'good']
['today', 'morn', 'arth', 'felt', 'good']
['Today', 'morn', 'Arthur', 'felt', 'good']
[('Today', 'NN'), ('morning', 'NN'), ('Arthur', 'NNP'), ('felt', 'VBD'), ('good', 'JJ')]


In [46]:
from nltk.corpus import wordnet

def get_wordnet_pos(tag: str) -> str:
    """Map a POS tag string to the corresponding WordNet POS constant.

    The decision is made on the tag's leading letter only:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Args:
        tag: POS tag such as 'JJ', 'VBD' or 'NN' (the second element of the
            pairs returned by ``pos_tag``).

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``; any tag that matches none of the prefixes falls
        back to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN


from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize each tagged token, passing the WordNet POS derived from its tag
# so the lemmatizer does not fall back to its default part of speech.
lemmatized_words = [
    lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos_label))
    for token, pos_label in tagged
]

print(lemmatized_words)


['Today', 'morning', 'Arthur', 'felt', 'good']
