# Natural Language Processing (NLP)

## Learning Objectives
By the end of this lesson, you will be able to:
- Process and analyze text data
- Build sentiment analysis systems
- Use pre-trained language models
- Create text classification applications

## Core Concepts
- **Tokenization**: Breaking text into words or pieces
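- **Embeddings**: Representing words as numeric vectors whose geometry captures meaning (similar words get similar vectors)
- **Transformer**: Modern neural network architecture that uses attention to model the context around each word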
- **Sentiment Analysis**: Determining if text is positive, negative, or neutral
- **Classification**: Automatically categorizing text (spam, topics, etc.)

## 1. Text Processing Basics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
import warnings
warnings.filterwarnings('ignore')

print("📝 TEXT PROCESSING FUNDAMENTALS")

# Hand-written product reviews used as the sample corpus.
texts = [
    "I love this product! It's amazing and works perfectly.",
    "This is terrible. Worst purchase ever. Totally disappointed.",
    "The item is okay. Nothing special but does the job.",
    "Fantastic! Exceeded my expectations. Highly recommend!",
    "Not bad, could be better. Average quality for the price.",
    "Absolutely horrible. Waste of money. Do not buy!",
    "Great value! Works as described. Very satisfied.",
    "Meh. It's fine I guess. Neither good nor bad."
]

# One label per review, aligned by position: 1=positive, 0=negative/neutral.
labels = [1, 0, 0, 1, 0, 0, 1, 0]

# List every review (numbered from 1) together with its readable label.
print("Sample reviews:")
for i, text in enumerate(texts):
    label = labels[i]
    if label == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative/Neutral"
    print(f"{i+1}. {text}")
    print(f"   → {sentiment}\n")

# Basic text cleaning
def clean_text(text):
    """Normalize raw text for simple word-level processing.

    The input is lowercased, every character that is neither an ASCII letter
    nor whitespace is deleted (not replaced by a space), and runs of
    whitespace are collapsed to single spaces with the ends trimmed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; empty if no letters remain.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    # split() with no argument discards leading/trailing/repeated whitespace.
    tokens = letters_only.split()
    return ' '.join(tokens)

print("🧹 TEXT CLEANING:")
sample_text = "I LOVE this!!! It's amazing... Really great product."
cleaned = clean_text(sample_text)
print(f"Original: {sample_text}")
print(f"Cleaned:  {cleaned}")

# Tokenization: the cleaned string is split on whitespace, one token per word.
print(f"\n🔤 TOKENIZATION:")
words = cleaned.split()
print(f"Words: {words}")
print(f"Number of words: {len(words)}")

# Word frequency analysis: clean every sample review, join them into one
# string, and tally how often each word occurs.
all_text = ' '.join(clean_text(t) for t in texts)
word_freq = {}
for word in all_text.split():
    if word in word_freq:
        word_freq[word] += 1
    else:
        word_freq[word] = 1

# Ten most frequent words as (word, count) pairs. The sort is stable, so words
# with equal counts keep their first-seen order.
top_words = [
    (w, word_freq[w])
    for w in sorted(word_freq, key=word_freq.get, reverse=True)[:10]
]
print(f"\nMost common words:")
for word, count in top_words:
    print(f"'{word}': {count} times")

## 2. Sentiment Analysis Model

In [None]:
# Build sentiment analysis model
print("🎯 BUILDING SENTIMENT ANALYSIS MODEL")

# Create larger dataset for training
def generate_reviews():
    """Return a small, hard-coded, balanced sentiment dataset.

    Returns:
        A ``(reviews, labels)`` pair of equal-length lists: the ten positive
        reviews come first (label 1), followed by the ten negative reviews
        (label 0).
    """
    labelled_groups = [
        (1, [
            "amazing product love it", "fantastic quality great value", "excellent service recommend",
            "wonderful experience happy customer", "outstanding performance very satisfied",
            "brilliant design works perfectly", "incredible quality exceeded expectations",
            "superb product definitely recommend", "awesome features great price",
            "perfect solution exactly needed"
        ]),
        (0, [
            "terrible quality waste money", "horrible experience never again", "awful product disappointed",
            "worst purchase ever regret", "cheap quality broke immediately", "useless product poor design",
            "disappointing results not worth", "bad service poor quality", "failed expectations terrible",
            "completely useless avoid buying"
        ]),
    ]

    # Flatten the groups, emitting one label per review in the same order.
    all_reviews = []
    all_labels = []
    for group_label, group_reviews in labelled_groups:
        all_reviews.extend(group_reviews)
        all_labels.extend([group_label] * len(group_reviews))

    return all_reviews, all_labels

# Load the labelled reviews and report the class balance.
reviews, sentiment_labels = generate_reviews()
n_positive = sum(sentiment_labels)
n_negative = len(sentiment_labels) - n_positive
print(f"Dataset: {len(reviews)} reviews ({n_positive} positive, {n_negative} negative)")

y = np.array(sentiment_labels)

# Split the RAW text before fitting anything. Fitting the vectorizer on the
# full corpus first would let the held-out reviews influence the vocabulary
# and IDF weights (data leakage) and make the test score optimistic.
# stratify=y keeps the positive/negative ratio the same in both parts, which
# matters with so few samples.
train_reviews, heldout_reviews, y_train, y_test = train_test_split(
    reviews, y, test_size=0.3, random_state=42, stratify=y
)

# Convert text to numbers using TF-IDF, learned from the training reviews
# only; the held-out reviews are merely transformed.
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(heldout_reviews)
# Whole corpus projected into the same training-derived feature space.
X = vectorizer.transform(reviews)

print(f"Text converted to {X.shape[1]} features")

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate model on the held-out reviews
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Model accuracy: {accuracy:.1%}")

# Test on new reviews
test_reviews = [
    "This product is absolutely fantastic!",
    "Completely disappointed with this purchase",
    "Pretty good, works as expected"
]

print(f"\n🔍 TESTING ON NEW REVIEWS:")
for review in test_reviews:
    # Clean and vectorize with the already-fitted vectorizer
    cleaned_review = clean_text(review)
    review_vector = vectorizer.transform([cleaned_review])

    # Predict the class and the per-class probabilities
    prediction = model.predict(review_vector)[0]
    probability = model.predict_proba(review_vector)[0]

    sentiment = "Positive" if prediction == 1 else "Negative"
    # The larger of the two class probabilities belongs to the predicted class.
    confidence = max(probability)

    print(f"'{review}'")
    print(f"→ {sentiment} (confidence: {confidence:.2f})")

# Show important words: each coefficient belongs to the feature name at the
# same index, so sorting coefficients ranks the words.
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]
coefficient_order = coefficients.argsort()  # ascending; computed once

# Top positive words (largest coefficients first)
pos_indices = coefficient_order[-5:][::-1]
print(f"\nTop positive words:")
for idx in pos_indices:
    print(f"'{feature_names[idx]}': {coefficients[idx]:.3f}")

# Top negative words (smallest coefficients first)
neg_indices = coefficient_order[:5]
print(f"\nTop negative words:")
for idx in neg_indices:
    print(f"'{feature_names[idx]}': {coefficients[idx]:.3f}")

## 3. Advanced Text Analysis

In [None]:
# Text classification and topic analysis
# Imported here so this cell is self-contained; scikit-learn is already a
# dependency of this notebook.
from sklearn.multiclass import OneVsRestClassifier

print("📊 ADVANCED TEXT ANALYSIS")

# Topic classification example: ten headlines, two per category.
topics_data = {
    'text': [
        "The stock market rose today with tech companies leading gains",
        "New smartphone features include better camera and longer battery",
        "Scientists discover new treatment for cancer using AI",
        "Football team wins championship after dramatic final game",
        "Recipe for delicious chocolate cake with simple ingredients",
        "Investment strategies for retirement planning and wealth building",
        "Latest iPhone review compares features with Android phones",
        "Medical research shows benefits of exercise for heart health",
        "Basketball player breaks scoring record in playoff game",
        "Cooking tips for perfect pasta and Italian dishes"
    ],
    'category': ['Finance', 'Technology', 'Health', 'Sports', 'Food',
                'Finance', 'Technology', 'Health', 'Sports', 'Food']
}

topics_df = pd.DataFrame(topics_data)
print("Topic classification dataset:")
print(topics_df)

# Convert categories to numbers
category_map = {'Finance': 0, 'Technology': 1, 'Health': 2, 'Sports': 3, 'Food': 4}
topics_df['label'] = topics_df['category'].map(category_map)
# Reverse lookup so predictions are decoded by label VALUE, not by assuming
# that dict order happens to match the label numbers.
label_to_category = {label: name for name, label in category_map.items()}

# Build multi-class classifier
topic_vectorizer = TfidfVectorizer(max_features=50, stop_words='english')
X_topics = topic_vectorizer.fit_transform(topics_df['text'])
y_topics = topics_df['label']

# One-vs-rest: one binary logistic regression per category. The
# `multi_class='ovr'` argument of LogisticRegression is deprecated in recent
# scikit-learn releases; wrapping the estimator in OneVsRestClassifier is the
# supported way to request the same scheme.
topic_model = OneVsRestClassifier(LogisticRegression())
topic_model.fit(X_topics, y_topics)

# predict_proba returns one column per entry of classes_, in that order.
class_column = {label: col for col, label in enumerate(topic_model.classes_)}

# Test topic classification
test_texts = [
    "Apple stock price increases after earnings report",
    "New Android phone has amazing camera quality",
    "Doctor recommends healthy diet for diabetes",
    "Tennis player wins tournament in straight sets",
    "Best pizza recipe with homemade dough"
]

print(f"\n🎯 TOPIC CLASSIFICATION:")
for text in test_texts:
    text_vector = topic_vectorizer.transform([text])
    prediction = topic_model.predict(text_vector)[0]
    probabilities = topic_model.predict_proba(text_vector)[0]

    predicted_category = label_to_category[prediction]
    confidence = probabilities[class_column[prediction]]

    # Only add an ellipsis when the text was actually shortened.
    preview = text if len(text) <= 50 else f"{text[:50]}..."
    print(f"'{preview}'")
    print(f"→ {predicted_category} (confidence: {confidence:.2f})")

# Practice Exercises
print("\n📚 PRACTICE EXERCISES:")

# Exercise 1: Email spam detection
print("\nExercise 1: Email Spam Detection")
# Each email is paired with its label: 0=not spam, 1=spam.
labelled_emails = [
    ("Meeting at 3pm in conference room A", 0),
    ("Congratulations! You won $1000000! Click here now!", 1),
    ("Project deadline moved to next Friday", 0),
    ("URGENT: Claim your prize immediately or lose forever!", 1),
    ("Can you review the quarterly report before Monday?", 0),
    ("FREE MONEY!!! No strings attached! Act now!", 1),
]
# Parallel lists, aligned by position.
emails = [body for body, _ in labelled_emails]
spam_labels = [flag for _, flag in labelled_emails]
print("Build a spam detector using these emails")

# Exercise 2: Product review analysis
print("\nExercise 2: Product Review Analysis")
# Short reviews, each mentioning a different aspect of a product or purchase.
product_reviews = [
    "Great phone, battery lasts all day",
    "Screen is too small, hard to read",
    "Fast delivery, excellent packaging",
    "Expensive but worth the quality",
    "Poor customer service experience",
]
print("Analyze these reviews for different aspects (battery, screen, service, etc.)")

# Exercise 3: Keyword extraction
print("\nExercise 3: Keyword Extraction")
document = "Machine learning is transforming healthcare by enabling AI systems to analyze medical data and predict patient outcomes"
words = document.lower().split()

# Tally lowercase words, skipping anything of three characters or fewer.
word_counts = {}
for word in words:
    if len(word) <= 3:
        continue
    word_counts[word] = 1 + word_counts.get(word, 0)

print("Extract important keywords from documents")
# Three highest-count (word, count) pairs; the sort is stable, so equal counts
# keep their first-seen order.
ranked_counts = sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)
top_keywords = ranked_counts[:3]
print(f"Top keywords: {[keyword for keyword, _ in top_keywords]}")

# Exercise 4: Text similarity
print("\nExercise 4: Text Similarity")
def simple_similarity(text1, text2):
    """Return the word-overlap (Jaccard) similarity of two texts.

    Each text is lowercased and split on whitespace into a set of words; the
    score is the number of shared words divided by the number of distinct
    words across both texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float in [0, 1], or the integer 0 when neither text has any words.
    """
    vocab1 = set(text1.lower().split())
    vocab2 = set(text2.lower().split())

    combined = vocab1 | vocab2
    if not combined:
        # Both texts are empty; avoid dividing by zero.
        return 0

    shared = vocab1 & vocab2
    return len(shared) / len(combined)

# Two sample sentences that share a few words.
text_a = "I love machine learning and artificial intelligence"
text_b = "Machine learning and AI are fascinating topics"

print(f"Text A: {text_a}")
print(f"Text B: {text_b}")

# Score the word overlap between the two sentences and show it to 2 decimals.
similarity = simple_similarity(text_a, text_b)
print(f"Similarity: {similarity:.2f}")

# Key insights
print("\n💡 KEY INSIGHTS:")
for insight in (
    "✅ Clean text before analysis (lowercase, remove punctuation)",
    "✅ TF-IDF converts text to numbers while preserving meaning",
    "✅ Logistic regression works well for text classification",
    "✅ Feature selection is important for large vocabularies",
    "✅ Always test on new, unseen text data",
):
    print(insight)

# Suggested follow-up topics, printed in order.
print("\n🚀 NEXT STEPS:")
for step in (
    "1. Try pre-trained models (BERT, GPT)",
    "2. Use word embeddings (Word2Vec, GloVe)",
    "3. Experiment with neural networks for text",
    "4. Learn about transformers and attention",
    "5. Build chatbots and language models",
):
    print(step)