In [1]:

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.tag import pos_tag

# Fetch the NLTK data packages this notebook relies on. Per the download log,
# 'punkt' unpacks under tokenizers/ and 'averaged_perceptron_tagger' under
# taggers/ -- presumably the models behind word_tokenize and pos_tag.
nltk.download('punkt')
# Keep this call last in the cell: its return value is what the notebook
# displays as the cell result.
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\najmulu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\najmulu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [2]:

# Sample sentence shared by the stemming demo here and the tagging cells below.
my_text = "A beautiful waitress is running on a hilly road with his expensive running shoes"
words = word_tokenize(my_text)

# Two stemmers to compare side by side on the same tokens.
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")

# Output label paired with the stemmer that produces that line.
labelled_stemmers = (
    ("Porter:   ", porter_stemmer),
    ("Snowball: ", snowball_stemmer),
)

print("=== Stemming with Porter and Snowball ===")
for w in words:
    print("Word:     ", w)
    for label, stemmer in labelled_stemmers:
        print(label, stemmer.stem(w))
    print()  # blank line between tokens


=== Stemming with Porter and Snowball ===
Word:      A
Porter:    a
Snowball:  a

Word:      beautiful
Porter:    beauti
Snowball:  beauti

Word:      waitress
Porter:    waitress
Snowball:  waitress

Word:      is
Porter:    is
Snowball:  is

Word:      running
Porter:    run
Snowball:  run

Word:      on
Porter:    on
Snowball:  on

Word:      a
Porter:    a
Snowball:  a

Word:      hilly
Porter:    hilli
Snowball:  hilli

Word:      road
Porter:    road
Snowball:  road

Word:      with
Porter:    with
Snowball:  with

Word:      his
Porter:    hi
Snowball:  his

Word:      expensive
Porter:    expens
Snowball:  expens

Word:      running
Porter:    run
Snowball:  run

Word:      shoes
Porter:    shoe
Snowball:  shoe



In [3]:
# Attach a part-of-speech tag to every token of the sample sentence;
# the result is a list of (word, tag) pairs.
tagged = pos_tag(words)

# Header and tagged list on separate lines, emitted by a single print call.
print("=== POS Tagged Words ===", tagged, sep="\n")


=== POS Tagged Words ===
[('A', 'DT'), ('beautiful', 'JJ'), ('waitress', 'NN'), ('is', 'VBZ'), ('running', 'VBG'), ('on', 'IN'), ('a', 'DT'), ('hilly', 'RB'), ('road', 'NN'), ('with', 'IN'), ('his', 'PRP$'), ('expensive', 'JJ'), ('running', 'NN'), ('shoes', 'NNS')]


In [4]:
# Keep only the (word, tag) pairs whose tag is exactly 'NN'.
nouns = [pair for pair in tagged if pair[1] == 'NN']

print("=== Nouns (NN) ===", nouns, sep="\n")


=== Nouns (NN) ===
[('waitress', 'NN'), ('road', 'NN'), ('running', 'NN')]


In [5]:
# Keep the (word, tag) pairs tagged either 'NN' or 'JJ', in sentence order.
nouns_adjectives = [pair for pair in tagged if pair[1] in ('NN', 'JJ')]

print("=== Nouns and Adjectives (NN, JJ) ===", nouns_adjectives, sep="\n")


=== Nouns and Adjectives (NN, JJ) ===
[('beautiful', 'JJ'), ('waitress', 'NN'), ('road', 'NN'), ('expensive', 'JJ'), ('running', 'NN')]


In [6]:
# Read the whole input file as UTF-8 text (open() defaults to read mode).
with open("GAITXT.txt", encoding="utf-8") as corpus_file:
    gaitxt_content = corpus_file.read()

# Tokenize the text, then POS-tag the tokens; both results are kept because
# the following cells filter gai_tagged by tag.
gai_words = word_tokenize(gaitxt_content)
gai_tagged = pos_tag(gai_words)



In [7]:

# Drop every token tagged 'DT' and keep only the word of each remaining pair.
filtered_no_dt = [pair[0] for pair in gai_tagged if pair[1] != 'DT']

# Join the surviving words with single spaces and write them out as UTF-8.
no_dt_text = " ".join(filtered_no_dt)
with open("no_determiner.txt", "w", encoding="utf-8") as out_file:
    out_file.write(no_dt_text)

print("Saved no_determiner.txt (without DT)")


Saved no_determiner.txt (without DT)


In [8]:
# Tags treated as verbs when filtering (Penn Treebank tag names).
verb_tags = {
    'VB',   # base form
    'VBD',  # past tense
    'VBG',  # gerund / present participle
    'VBN',  # past participle
    'VBP',  # non-3rd-person singular present
    'VBZ',  # 3rd-person singular present
}

# Keep only the words whose tag is one of the verb tags, in document order.
verbs = [pair[0] for pair in gai_tagged if pair[1] in verb_tags]

# Join the verbs with single spaces and write them out as UTF-8.
verbs_text = " ".join(verbs)
with open("file_verbs.txt", "w", encoding="utf-8") as out_file:
    out_file.write(verbs_text)

print("Saved file_verbs.txt (verbs only)")


Saved file_verbs.txt (verbs only)
