In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import pandas as pd

In [3]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91705\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\91705\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91705\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91705\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\91705\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
documents = [
    "Natural Language Processing allows machines to understand human language.",
    "Text preprocessing includes tokenization, stemming and lemmatization.",
    "TF-IDF helps in finding important words in a document."
]

In [16]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [17]:
for i, doc in enumerate(documents):
    print(f"\n--- Document {i+1} ---")
    
    # Tokenization
    tokens = word_tokenize(doc)
    print("Tokens:", tokens)
    
    # POS Tagging
    pos_tags = pos_tag(tokens)
    print("POS Tags:", pos_tags)
    
    # Stop word removal & alphabet check
    filtered = [word for word in tokens if word.isalpha() and word.lower() not in stop_words]
    print("After Stop-word Removal:", filtered)
    
    # Stemming
    stemmed = [stemmer.stem(word) for word in filtered]
    print("After Stemming:", stemmed)
    
    # Lemmatization
    lemmatized = [lemmatizer.lemmatize(word.lower()) for word in filtered]
    print("After Lemmatization:", lemmatized)


--- Document 1 ---
Tokens: ['Natural', 'Language', 'Processing', 'allows', 'machines', 'to', 'understand', 'human', 'language', '.']
POS Tags: [('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('allows', 'VBZ'), ('machines', 'NNS'), ('to', 'TO'), ('understand', 'VB'), ('human', 'JJ'), ('language', 'NN'), ('.', '.')]
After Stop-word Removal: ['Natural', 'Language', 'Processing', 'allows', 'machines', 'understand', 'human', 'language']
After Stemming: ['natur', 'languag', 'process', 'allow', 'machin', 'understand', 'human', 'languag']
After Lemmatization: ['natural', 'language', 'processing', 'allows', 'machine', 'understand', 'human', 'language']

--- Document 2 ---
Tokens: ['Text', 'preprocessing', 'includes', 'tokenization', ',', 'stemming', 'and', 'lemmatization', '.']
POS Tags: [('Text', 'NNP'), ('preprocessing', 'VBG'), ('includes', 'VBZ'), ('tokenization', 'NN'), (',', ','), ('stemming', 'VBG'), ('and', 'CC'), ('lemmatization', 'NN'), ('.', '.')]
After Stop-word Rem

In [18]:
print("\n--- Term Frequency (TF) ---")
vectorizer = CountVectorizer(stop_words='english')
tf_matrix = vectorizer.fit_transform(documents)
tf_df = pd.DataFrame(tf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print(tf_df)



--- Term Frequency (TF) ---
   allows  document  finding  helps  human  idf  important  includes  \
0       1         0        0      0      1    0          0         0   
1       0         0        0      0      0    0          0         1   
2       0         1        1      1      0    1          1         0   

   language  lemmatization  machines  natural  preprocessing  processing  \
0         2              0         1        1              0           1   
1         0              1         0        0              1           0   
2         0              0         0        0              0           0   

   stemming  text  tf  tokenization  understand  words  
0         0     0   0             0           1      0  
1         1     1   0             1           0      0  
2         0     0   1             0           0      1  


In [19]:
print("\n--- Inverse Document Frequency (IDF) and TF-IDF ---")
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(tf_matrix)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
print(tfidf_df)


--- Inverse Document Frequency (IDF) and TF-IDF ---
     allows  document   finding     helps     human       idf  important  \
0  0.316228  0.000000  0.000000  0.000000  0.316228  0.000000   0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   
2  0.000000  0.377964  0.377964  0.377964  0.000000  0.377964   0.377964   

   includes  language  lemmatization  machines   natural  preprocessing  \
0  0.000000  0.632456       0.000000  0.316228  0.316228       0.000000   
1  0.408248  0.000000       0.408248  0.000000  0.000000       0.408248   
2  0.000000  0.000000       0.000000  0.000000  0.000000       0.000000   

   processing  stemming      text        tf  tokenization  understand  \
0    0.316228  0.000000  0.000000  0.000000      0.000000    0.316228   
1    0.000000  0.408248  0.408248  0.000000      0.408248    0.000000   
2    0.000000  0.000000  0.000000  0.377964      0.000000    0.000000   

      words  
0  0.000000  
1  0.000000  
2  0.3

## Explanation

Here’s a **complete and detailed explanation** of the code with **theoretical background** for **Text Analytics – Experiment 7**, including **Tokenization, POS Tagging, Stop-word removal, Stemming, Lemmatization**, and **TF-IDF representation**.

---

## 🧠 THEORY

### 1. **Document Preprocessing**

Before using text for machine learning or NLP, it must be cleaned and structured. The key steps are:

---

#### 🔹 **Tokenization**

Splitting a document into words, sentences, or phrases.

* `"NLP is fun"` → `["NLP", "is", "fun"]`

---

#### 🔹 **POS Tagging (Part-of-Speech)**

Identifies grammatical roles like noun, verb, adjective, etc.

* `"fun"` → adjective
* `"run"` → can be verb or noun

---

#### 🔹 **Stop Words Removal**

Common English words (like *is*, *the*, *and*) that don’t carry much meaning in analysis.

---

#### 🔹 **Stemming**

Reduces words to their root form.

* `"running"` → `"run"`
* `"playing"` → `"play"`

Uses **PorterStemmer** algorithm in this code.

---

#### 🔹 **Lemmatization**

More accurate than stemming. Reduces a word to its base or dictionary form (lemma).

* `"better"` → `"good"`
* `"running"` → `"run"`

Uses **WordNet Lemmatizer**.

---

### 2. **TF-IDF (Term Frequency - Inverse Document Frequency)**

Used to convert text to numeric form for ML.

#### 🔸 **TF (Term Frequency)**

How frequently a word appears in a document:

$$
\text{TF}(t, d) = \frac{\text{Number of times } t \text{ appears in } d}{\text{Total words in } d}
$$

---

#### 🔸 **IDF (Inverse Document Frequency)**

Reduces the weight of common words and increases rare ones:

$$
\text{IDF}(t) = \log\left(\frac{N}{1 + \text{DF}(t)}\right)
$$

Where:

* *N* = total number of documents
* *DF(t)* = number of documents containing term *t*

---

#### 🔸 **TF-IDF**

Combines both:

$$
\text{TF-IDF}(t, d) = \text{TF}(t, d) \times \text{IDF}(t)
$$

Words like `"the"`, `"is"` get low TF-IDF; unique words get high values.

---

## ✅ CODE EXPLANATION WITH OUTPUT

### **Step 1: Imports & Downloads**

```python
import nltk
...
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
```

* Downloads resources for tokenization, POS tagging, stop words, and lemmatization.

---

### **Step 2: Sample Documents**

```python
documents = [
    "Natural Language Processing allows machines to understand human language.",
    "Text preprocessing includes tokenization, stemming and lemmatization.",
    "TF-IDF helps in finding important words in a document."
]
```

---

### **Step 3: Preprocessing Each Document**

```python
tokens = word_tokenize(doc)
pos_tags = pos_tag(tokens)
filtered = [word for word in tokens if word.isalpha() and word.lower() not in stop_words]
stemmed = [stemmer.stem(word) for word in filtered]
lemmatized = [lemmatizer.lemmatize(word.lower()) for word in filtered]
```

#### ✅ Sample Output (Document 1):

```
Tokens: ['Natural', 'Language', 'Processing', 'allows', 'machines', 'to', 'understand', 'human', 'language', '.']
POS Tags: [('Natural', 'JJ'), ('Language', 'NN'), ('Processing', 'NN'), ...]
After Stop-word Removal: ['Natural', 'Language', 'Processing', 'allows', 'machines', 'understand', 'human', 'language']
Stemming: ['natur', 'languag', 'process', 'allow', 'machin', 'understand', 'human', 'languag']
Lemmatization: ['natural', 'language', 'processing', 'allow', 'machine', 'understand', 'human', 'language']
```

---

### **Step 4: Term Frequency (TF)**

```python
vectorizer = CountVectorizer(stop_words='english')
tf_matrix = vectorizer.fit_transform(documents)
```

#### ✅ Output: TF Table

|       | allows | document | finding | language | ... |
| ----- | ------ | -------- | ------- | -------- | --- |
| Doc 1 | 1      | 0        | 0       | 2        | ... |
| Doc 2 | 0      | 0        | 0       | 0        | ... |
| Doc 3 | 0      | 1        | 1       | 0        | ... |

Shows raw frequency of words per document.

---

### **Step 5: TF-IDF**

```python
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(tf_matrix)
```

#### ✅ Output: TF-IDF Table

|       | allows | document | finding | language | ... |
| ----- | ------ | -------- | ------- | -------- | --- |
| Doc 1 | 0.43   | 0.00     | 0.00    | 0.54     | ... |
| Doc 2 | 0.00   | 0.00     | 0.00    | 0.00     | ... |
| Doc 3 | 0.00   | 0.47     | 0.47    | 0.00     | ... |

Words like `language` appear in only one doc → higher TF-IDF.

---

## ✅ CONCLUSION

* Preprocessing helps **clean** and **standardize** text.
* TF-IDF turns text into **numeric vectors** for ML.
* These methods are the **foundation of NLP**, search engines, spam detection, etc.

---

Would you like a downloadable `.ipynb` or `.pdf` version of this report?
