# NLP Assignment 3: Text Preprocessing and TF-IDF Representation

This notebook demonstrates:
- Text cleaning
- Lemmatization
- Stop word removal
- Label encoding
- TF-IDF vectorization
- Saving outputs

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import warnings
# Silence library warnings so the cell outputs below stay readable
warnings.filterwarnings('ignore')

# Download required NLTK data (one quiet request per resource)
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt'):
    nltk.download(resource, quiet=True)

print("Libraries imported successfully!")

## 2. Create Sample Dataset

Creating a sample dataset with text and labels for demonstration.

In [None]:
# Sample dataset: ten short reviews, each paired with a sentiment label
review_texts = [
    "This is a GREAT movie! I really loved it!!!",
    "The product was terrible and didn't work at all...",
    "Amazing experience! Would definitely recommend to friends.",
    "Worst purchase ever. Complete waste of money!!!",
    "It's okay, nothing special but not bad either.",
    "Absolutely fantastic! Best thing I've ever bought!",
    "Very disappointing. Expected much better quality.",
    "Pretty good overall, happy with the purchase.",
    "Not recommended!!! Poor quality and bad service.",
    "Excellent product! Worth every penny. Highly satisfied!!!",
]
review_labels = [
    'positive', 'negative', 'positive', 'negative', 'neutral',
    'positive', 'negative', 'positive', 'negative', 'positive',
]
data = dict(text=review_texts, label=review_labels)

# One row per review; columns follow the key order of `data`
df = pd.DataFrame(data)
print("Original Dataset:")
print(df)
print(f"\nDataset shape: {df.shape}")

## 3. Text Cleaning

Cleaning the text by:
- Converting to lowercase
- Removing special characters and punctuation
- Removing extra whitespaces
- Removing numbers (digits are stripped together with the other non-letter characters)

In [None]:
def clean_text(text):
    """Normalize raw text to a lowercase, letters-only, single-spaced string.

    Every character that is not an ASCII letter or whitespace (punctuation,
    digits, symbols) is replaced by a space rather than deleted. Deleting
    them would fuse words separated only by punctuation ("good,bad" ->
    "goodbad") and turn contractions into non-words ("didn't" -> "didnt").
    Replacing them keeps the pieces as separate tokens ("didn t") that later
    steps can handle individually.

    Args:
        text: Raw input string. Non-string values (for example None, or the
            NaN pandas uses for missing cells) are treated as empty text.

    Returns:
        The cleaned string; empty when the input holds no ASCII letters.
    """
    # Missing cells would otherwise fail on .lower(); treat them as empty
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Swap special characters, punctuation and digits for a space so that
    # neighbouring words are not glued together
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Collapse whitespace runs (including those just introduced) and trim
    return re.sub(r'\s+', ' ', text).strip()

# Run every raw review through clean_text and keep the result in a new column
df['cleaned_text'] = df['text'].map(clean_text)

print("Text Cleaning Results:")
print(df[['text', 'cleaned_text']].head())
first_cleaned = df['cleaned_text'].iloc[0]
print(f"\nSample cleaned text:\n{first_cleaned}")

## 4. Lemmatization

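Applying lemmatization to convert words to their base/dictionary form using NLTK's WordNetLemmatizer. Because no part-of-speech tag is supplied, each word is lemmatized as a noun, so plurals are reduced (e.g. "friends" → "friend") while verb forms such as "loved" are left unchanged.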

In [None]:
def lemmatize_text(text):
    """Return `text` with each whitespace-separated word lemmatized.

    Words are handed one at a time to a WordNetLemmatizer using its default
    arguments, then re-joined with single spaces.
    """
    wnl = WordNetLemmatizer()
    return ' '.join(map(wnl.lemmatize, text.split()))

# Lemmatize the cleaned text into its own column
df['lemmatized_text'] = df['cleaned_text'].map(lemmatize_text)

print("Lemmatization Results:")
print(df[['cleaned_text', 'lemmatized_text']].head())
print("\nExample:")
first_row = df.iloc[0]
print(f"Before: {first_row['cleaned_text']}")
print(f"After:  {first_row['lemmatized_text']}")

## 5. Stop Words Removal

Removing common English stop words using NLTK's stopwords corpus.

In [None]:
# Lazily filled cache of the English stop-word set, so the word list is
# fetched and converted to a set once rather than on every call.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the cached NLTK English stop-word set, building it on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text, stop_words=None):
    """Remove stop words from whitespace-tokenized text.

    Args:
        text: Input string; it is split on whitespace and each token is
            compared as-is (no case folding is done here).
        stop_words: Optional collection of tokens to drop. When omitted, the
            NLTK English stop-word list is used (loaded once and cached).

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        every token is a stop word or the input is empty.
    """
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Filter stop words out of the lemmatized text into the final text column
df['processed_text'] = df['lemmatized_text'].map(remove_stopwords)

print("Stop Words Removal Results:")
print(df[['lemmatized_text', 'processed_text']].head())
print("\nExample:")
first_row = df.iloc[0]
print(f"Before: {first_row['lemmatized_text']}")
print(f"After:  {first_row['processed_text']}")

# Show raw text next to the fully preprocessed text for every document
banner = "=" * 80
print("\n" + banner)
print("Complete Preprocessing Pipeline:")
print(banner)
print(df[['text', 'processed_text', 'label']].to_string())

## 6. Label Encoding

Converting categorical labels to numerical values using sklearn's LabelEncoder.

In [None]:
# Fit the encoder on the label column, then store the integer codes beside it
label_encoder = LabelEncoder().fit(df['label'])
df['encoded_label'] = label_encoder.transform(df['label'])

print("Label Encoding Results:")
label_pairs = df[['label', 'encoded_label']].drop_duplicates()
print(label_pairs.sort_values('encoded_label'))

rule = "=" * 50
print("\n" + rule)
print("Label Mapping:")
print(rule)
# classes_ is ordered by code, so the position of a class is its encoded value
for code, class_name in enumerate(label_encoder.classes_):
    print(f"{class_name}: {code}")

print("\nDataset with encoded labels:")
print(df[['text', 'processed_text', 'label', 'encoded_label']])

## 7. TF-IDF Vectorization

Creating TF-IDF (Term Frequency-Inverse Document Frequency) representations of the processed text.

In [None]:
# Configure the vectorizer: unigrams and bigrams, at most 100 features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=100)

# Learn the vocabulary and weight the processed documents in one pass
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

# Dense copy with the learned terms as column labels, for easier inspection
vocabulary_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vocabulary_terms)

n_documents, n_features = tfidf_matrix.shape
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)
print(f"Number of documents: {n_documents}")
print(f"Number of features: {n_features}")

print("\nTF-IDF Feature Names (first 20):")
print(vocabulary_terms[:20])

print("\nTF-IDF Matrix (first 5 documents, first 10 features):")
print(tfidf_df.iloc[:5, :10])

print("\nFull TF-IDF DataFrame:")
print(tfidf_df)

## 8. Visualization

Analyzing the most important features based on TF-IDF scores. The results are shown as printed text summaries; no plots are drawn.

In [None]:
# Mean TF-IDF weight of every term across all documents, highest first
feature_scores = tfidf_df.mean().sort_values(ascending=False)

divider = "=" * 50
print("Top 15 Features by Average TF-IDF Score:")
print(divider)
for term, mean_score in feature_scores.head(15).items():
    print(f"{term:20s}: {mean_score:.4f}")

# Count, per document, how many terms carry a non-zero weight
print("\n" + divider)
print("Non-zero Features per Document:")
print(divider)
nonzero_counts = (tfidf_df > 0).sum(axis=1)
for doc_number, count in enumerate(nonzero_counts.iloc[:len(df)], start=1):
    print(f"Document {doc_number}: {count} features")

## 9. Save Outputs

Saving all processed data, models, and results to files.

In [None]:
import os

# Create output directory if it doesn't exist
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

# 1. Save processed dataset (every intermediate text column plus labels)
df.to_csv(f'{output_dir}/processed_data.csv', index=False)
print(f"✓ Saved processed data to '{output_dir}/processed_data.csv'")

# 2. Save TF-IDF matrix as CSV (dense, one column per feature)
tfidf_df.to_csv(f'{output_dir}/tfidf_matrix.csv', index=False)
print(f"✓ Saved TF-IDF matrix to '{output_dir}/tfidf_matrix.csv'")

# 3. Save TF-IDF matrix as numpy array (densified from the sparse matrix)
np.save(f'{output_dir}/tfidf_matrix.npy', tfidf_matrix.toarray())
print(f"✓ Saved TF-IDF matrix (numpy) to '{output_dir}/tfidf_matrix.npy'")

# 4. Save TF-IDF vectorizer
with open(f'{output_dir}/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)
print(f"✓ Saved TF-IDF vectorizer to '{output_dir}/tfidf_vectorizer.pkl'")

# 5. Save label encoder
with open(f'{output_dir}/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)
print(f"✓ Saved label encoder to '{output_dir}/label_encoder.pkl'")

# 6. Save feature names
# get_feature_names_out() yields an object-dtype array; np.save would pickle
# it and a plain np.load(...) would then refuse the file unless
# allow_pickle=True is passed. Casting to a fixed-width string dtype keeps
# the saved file loadable with the default np.load settings.
feature_names = tfidf_vectorizer.get_feature_names_out()
np.save(f'{output_dir}/feature_names.npy', feature_names.astype(str))
print(f"✓ Saved feature names to '{output_dir}/feature_names.npy'")

# 7. Save label mapping (class name -> encoded integer)
label_mapping = {label: idx for idx, label in enumerate(label_encoder.classes_)}
with open(f'{output_dir}/label_mapping.pkl', 'wb') as f:
    pickle.dump(label_mapping, f)
print(f"✓ Saved label mapping to '{output_dir}/label_mapping.pkl'")

# 8. Create and save summary report
# Compute the variable parts first so the template below stays readable
report_rule = '=' * 60
n_documents, n_features = tfidf_matrix.shape
sparsity_pct = (1 - tfidf_matrix.nnz / (n_documents * n_features)) * 100
label_list = ', '.join(df['label'].unique())
label_lines = '\n'.join(
    f'- {label}: {idx}' for idx, label in enumerate(label_encoder.classes_)
)
top_feature_lines = '\n'.join(
    f'- {feature}: {score:.4f}'
    for feature, score in feature_scores.head(10).items()
)

summary_report = f"""
NLP Assignment 3 - Processing Summary
{report_rule}

Dataset Information:
- Total documents: {len(df)}
- Unique labels: {df['label'].nunique()}
- Labels: {label_list}

TF-IDF Vectorization:
- Total features: {n_features}
- Matrix shape: {tfidf_matrix.shape}
- Non-zero elements: {tfidf_matrix.nnz}
- Sparsity: {sparsity_pct:.2f}%

Label Encoding:
{label_lines}

Top 10 Features by TF-IDF Score:
{top_feature_lines}

Files Saved:
- processed_data.csv: Full dataset with all preprocessing steps
- tfidf_matrix.csv: TF-IDF matrix in CSV format
- tfidf_matrix.npy: TF-IDF matrix in numpy format
- tfidf_vectorizer.pkl: Trained TF-IDF vectorizer
- label_encoder.pkl: Trained label encoder
- feature_names.npy: List of TF-IDF features
- label_mapping.pkl: Label to index mapping
"""

with open(f'{output_dir}/summary_report.txt', 'w') as f:
    f.write(summary_report)
print(f"✓ Saved summary report to '{output_dir}/summary_report.txt'")

print("\n" + report_rule)
print("All outputs saved successfully!")
print(report_rule)

## 10. Summary

This notebook completed the following tasks:

1. **Text Cleaning**: Converted text to lowercase, removed special characters and extra whitespaces
2. **Lemmatization**: Applied WordNetLemmatizer to convert words to their base form
3. **Stop Words Removal**: Removed common English stop words using NLTK
4. **Label Encoding**: Converted categorical labels to numerical values
5. **TF-IDF Vectorization**: Created Term Frequency-Inverse Document Frequency representations
6. **Output Saving**: Saved all processed data, models, and reports to the `outputs/` directory

All outputs are saved and ready for further analysis or machine learning tasks!

## Bonus: Loading Saved Models (Optional)

Example of how to load and use the saved models for new data.

In [None]:
# Example: Load saved models and process new text
def process_new_text(new_text):
    """Preprocess a new document and vectorize it with the saved TF-IDF model.

    The text goes through the same steps as the training data (clean_text,
    lemmatize_text, remove_stopwords) and is then transformed by the
    vectorizer pickled at `{output_dir}/tfidf_vectorizer.pkl`.

    Args:
        new_text: Raw text to process.

    Returns:
        A `(processed, tfidf_vector)` tuple: the preprocessed string and the
        result of the loaded vectorizer's `transform` on that one document.
    """
    # NOTE: pickle.load can execute arbitrary code; only load artifacts that
    # this notebook wrote itself, never files from an untrusted source.
    with open(f'{output_dir}/tfidf_vectorizer.pkl', 'rb') as f:
        loaded_vectorizer = pickle.load(f)

    # The label encoder is not needed to vectorize text, so it is not loaded.

    # Apply the preprocessing steps in the same order used for training
    processed = remove_stopwords(lemmatize_text(clean_text(new_text)))

    # transform() expects an iterable of documents, hence the one-item list
    tfidf_vector = loaded_vectorizer.transform([processed])

    return processed, tfidf_vector

# Test with new text
new_text = "This product is absolutely wonderful! I love it so much!!!"
processed, tfidf_vec = process_new_text(new_text)

print("Example: Processing New Text")
print("="*60)
print(f"Original: {new_text}")
print(f"Processed: {processed}")
print(f"TF-IDF vector shape: {tfidf_vec.shape}")
print(f"Non-zero features: {tfidf_vec.nnz}")
print("\nTop 5 TF-IDF scores:")

# Densify the single-row result once instead of on every loop iteration
scores = tfidf_vec.toarray()[0]
# Indices of the five largest scores, largest first
feature_idx = scores.argsort()[-5:][::-1]

# process_new_text does not return its vectorizer, so reload it to get the
# feature names (same trust caveat as any pickle.load: own artifacts only)
with open(f'{output_dir}/tfidf_vectorizer.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)
features = loaded_vectorizer.get_feature_names_out()
for idx in feature_idx:
    # Skip padding entries when fewer than five features are non-zero
    if scores[idx] > 0:
        print(f"  {features[idx]}: {scores[idx]:.4f}")