# Task: Build a Mini Text-Cleaning Pipeline

### Goal: Given a short paragraph of raw text, write a short script that:

- Tokenizes the text into words.

- Removes stop-words (use NLTK’s English stop-word list).

- Applies stemming (Porter stemmer).

- Counts word frequencies in the cleaned text.

- Prints the top 5 most common stems and their counts.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

# 1. (Optional) Download NLTK data (uncomment the two calls on a first run)
# NOTE(review): recent NLTK releases appear to load the tokenizer models from
# the 'punkt_tab' resource; if word_tokenize raises LookupError, download that
# resource as well -- confirm against the installed NLTK version.
# nltk.download('punkt')
# nltk.download('stopwords')

# Placeholder input: replace the string below with the paragraph to analyze.
text = """Your sample paragraph goes here: 
          e.g. “Natural Language Processing enables computers to understand human language.”"""


In [None]:
# 2. Tokenize
# TODO: replace the `...` placeholder with code that splits `text` into a list
# of word tokens (word_tokenize is imported above for this purpose).
tokens = ...

In [None]:
# 3. Normalize to lowercase and drop non-alphabetic tokens
# Punctuation tokens such as "," or "." would otherwise flow into the
# frequency count in step 6 and crowd real stems out of the top 5; the
# isalpha() filter matches what the solution cell further down does.
tokens = [t.lower() for t in tokens if t.isalpha()]

In [None]:
# 4. Remove stop-words
# The stop-word list is wrapped in a set so each membership check is O(1).
stop_words = set(stopwords.words('english'))
# TODO: replace the `...` placeholder with the tokens that are NOT in stop_words.
filtered = ...

In [None]:
# 5. Stem
stemmer = PorterStemmer()
# TODO: replace the `...` placeholder with the stem of every word in `filtered`
# (the solution cell further down calls stemmer.stem on each word).
stems = ...


In [None]:
# 6. Count frequencies
# Start from an empty counter and tally every stem produced in step 5.
freq = Counter()
freq.update(stems)

In [None]:
# 7. Print top 5
# most_common(5) yields at most five (stem, count) pairs, highest count first.
top_stems = freq.most_common(5)
for stem, count in top_stems:
    print(f"{stem}: {count}")

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter

# 1. (Optional) Download NLTK data (uncomment these lines if running for the first time)
# NOTE(review): recent NLTK releases appear to load the tokenizer models from
# the 'punkt_tab' resource; if word_tokenize raises LookupError, download that
# resource as well -- confirm against the installed NLTK version.
# nltk.download('punkt')
# nltk.download('stopwords')

# Sample paragraph that the pipeline below tokenizes, filters, stems and counts.
text = """Natural Language Processing (NLP) enables computers to understand, interpret, 
and generate human language in a valuable way. It sits at the intersection of computer 
science, artificial intelligence, and linguistics."""

# 2. Tokenize, then
# 3. keep only purely alphabetic tokens and normalize them to lowercase
tokens = [
    token.lower()
    for token in word_tokenize(text)
    if token.isalpha()
]

# 4. Remove stop-words (the set makes each membership check cheap)
stop_words = set(stopwords.words('english'))
filtered = [word for word in tokens if word not in stop_words]

# 5. Stem every remaining word with the Porter stemmer
stemmer = PorterStemmer()
stems = list(map(stemmer.stem, filtered))

# 6. Count frequencies
freq = Counter()
freq.update(stems)

# 7. Print top 5 (at most five (stem, count) pairs, highest count first)
print("Top 5 stems:")
top_stems = freq.most_common(5)
for stem, count in top_stems:
    print(f"{stem}: {count}")
