<a href="https://colab.research.google.com/github/Harivamsh2005/NLP/blob/main/NLP_T_07_08_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import spacy
from nltk.stem import PorterStemmer

# Step 1: Read the news CSV and pull out the column of article descriptions
# (cleaning is applied to this column further down).
df = pd.read_csv("bbc_news.csv")
texts = df["description"]

def clean_text(text):
    """Return *text* lower-cased with everything but a-z and whitespace removed.

    Args:
        text: The raw value to clean. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string with leading/trailing whitespace stripped, or an
        empty string when *text* is missing.
    """
    # A missing value must not be stringified: str(None) / str(nan) would
    # become the literal words "none" / "nan" and be counted as real tokens.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    lowered = str(text).lower()
    # Characters are deleted, not replaced by a space, so "it's" -> "its".
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return letters_only.strip()

# Normalize every description with clean_text before any tokenization.
texts_cleaned = texts.apply(clean_text)

# Step 2: Tokenize, remove stop words, top 10 frequent tokens
# Fetch the NLTK resources needed by word_tokenize and the stop-word list.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# A set gives constant-time membership checks while filtering.
stop_words = set(stopwords.words('english'))

# Flatten all descriptions into one list of non-stop-word tokens.
tokens = [
    word
    for text in texts_cleaned
    for word in word_tokenize(text)
    if word not in stop_words
]

freq_tokens = Counter(tokens)
print("Top 10 frequent tokens:", freq_tokens.most_common(10))

# Step 3: Lemmatize, stem, top 10 frequent lemmas
# Load the small English spaCy pipeline. spacy.load raises OSError when the
# model package is not installed; only then download it and retry, so that
# unrelated failures (and Ctrl-C) are not swallowed by the fallback.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

stemmer = PorterStemmer()
lemmas, stems = [], []

# Run each cleaned description through the spaCy pipeline, keep only the
# alphabetic tokens that are not stop words, and record both the spaCy lemma
# and the Porter stem of every kept token.
for description in texts_cleaned:
    kept = [tok for tok in nlp(description) if tok.is_alpha and not tok.is_stop]
    lemmas.extend(tok.lemma_ for tok in kept)
    stems.extend(stemmer.stem(tok.text) for tok in kept)

freq_lemmas = Counter(lemmas)
print("Top 10 lemmas:", freq_lemmas.most_common(10))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Top 10 frequent tokens: [('says', 952), ('people', 481), ('new', 447), ('england', 444), ('first', 442), ('ukraine', 436), ('bbc', 411), ('uk', 387), ('say', 355), ('world', 350)]
Top 10 lemmas: [('say', 1104), ('england', 542), ('year', 512), ('people', 489), ('ukraine', 489), ('new', 449), ('win', 449), ('bbc', 411), ('uk', 387), ('world', 379)]
