# Week 6, Day 6: NLP Hackathon Challenge

## Challenge Overview
Build an end-to-end NLP solution using concepts learned throughout Week 6:
- Text Processing
- Classification
- Named Entity Recognition
- Topic Modeling
- Language Models

## Problem: Multi-Task NLP System
Create a system that can perform multiple NLP tasks on news articles:
1. Article Classification
2. Entity Extraction
3. Topic Analysis
4. Summary Generation

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk
import spacy
from transformers import pipeline
from gensim import corpora, models
import tensorflow as tf

## Part 1: Data Generation and Preparation

In [None]:
def generate_sample_data(n_samples=100):
    """Generate a synthetic news-article dataset.

    Each article is a single sentence built from a randomly chosen template
    of a randomly chosen category, with the template's placeholders filled
    from small fixed vocabularies.

    Args:
        n_samples: Number of articles to generate.

    Returns:
        A pandas DataFrame with one row per article and two columns:
        'text' (the generated sentence) and 'category' (one of
        'Technology', 'Business', 'Sports', 'Politics').
    """

    # Categories
    categories = ['Technology', 'Business', 'Sports', 'Politics']

    # Template sentences
    tech_templates = [
        "Apple announced new {product} with advanced {feature}.",
        "Google develops AI system for {application}.",
        "Microsoft releases update for {software}."
    ]

    business_templates = [
        "{company} reports {percent}% growth in Q3.",
        "Stock market sees {direction} trend due to {factor}.",
        "Merger announced between {company1} and {company2}."
    ]

    sports_templates = [
        "{team} wins championship with score {score}.",
        "Player {name} breaks record in {sport}.",
        "Olympic committee announces {decision} for {event}."
    ]

    politics_templates = [
        "President {name} announces new policy on {issue}.",
        "Senate votes on {bill} legislation.",
        "International summit discusses {topic}."
    ]

    # Generate articles
    articles = []
    labels = []

    for _ in range(n_samples):
        category = np.random.choice(categories)

        # str.format ignores unused keyword arguments but raises KeyError for
        # a missing one, so every branch must supply ALL placeholders that
        # appear in any of that category's templates.
        if category == 'Technology':
            template = np.random.choice(tech_templates)
            article = template.format(
                product=np.random.choice(['iPhone', 'MacBook', 'iPad']),
                feature=np.random.choice(['AI', '5G', 'security']),
                application=np.random.choice(
                    ['healthcare', 'search', 'translation']
                ),
                software=np.random.choice(['Windows', 'Office', 'Teams'])
            )
        elif category == 'Business':
            template = np.random.choice(business_templates)
            article = template.format(
                company=np.random.choice(['Amazon', 'Tesla', 'Netflix']),
                percent=np.random.randint(5, 30),
                direction=np.random.choice(['upward', 'downward']),
                factor=np.random.choice(['inflation', 'growth', 'policy']),
                company1=np.random.choice(['Company A', 'Company B']),
                company2=np.random.choice(['Company C', 'Company D'])
            )
        elif category == 'Sports':
            template = np.random.choice(sports_templates)
            article = template.format(
                team=np.random.choice(['Lakers', 'Warriors', 'Bulls']),
                score=f"{np.random.randint(80, 120)}-{np.random.randint(70, 110)}",
                name=np.random.choice(['John', 'Mike', 'Sarah']),
                sport=np.random.choice(['basketball', 'football', 'tennis']),
                decision=np.random.choice(['new rules', 'venue change']),
                event=np.random.choice(['2024 Games', 'World Cup'])
            )
        else:  # Politics
            template = np.random.choice(politics_templates)
            article = template.format(
                name=np.random.choice(['Smith', 'Johnson', 'Brown']),
                issue=np.random.choice(['climate', 'economy', 'healthcare']),
                bill=np.random.choice(['energy', 'tax', 'education']),
                topic=np.random.choice(['trade', 'climate change', 'security'])
            )

        articles.append(article)
        labels.append(category)

    return pd.DataFrame({
        'text': articles,
        'category': labels
    })

# Build the demo dataset (default size) and show a quick preview of it
df = generate_sample_data()
print("Dataset shape:", df.shape)
print("\nSample articles:", df.head(), sep="\n")

## Task 1: Article Classification

In [None]:
def implement_classification():
    """Placeholder for the participant's article classifier.

    Suggested steps:
        1. Preprocess text
        2. Create features
        3. Train classifier
        4. Evaluate performance
    """
    # Your code here
    return None

# Example solution structure:
def example_classification(df):
    """Train a small feed-forward classifier on TF-IDF features.

    Args:
        df: DataFrame with a 'text' column (article strings) and a
            'category' column (class labels).

    Returns:
        A tuple ``(model, history, vectorizer, le)``: the trained Keras
        model, the object returned by ``model.fit``, the fitted
        TfidfVectorizer and the fitted LabelEncoder.
    """
    # Create features. Convert the sparse TF-IDF matrix to a dense array so
    # it can be fed directly to the Dense layers below.
    vectorizer = TfidfVectorizer(max_features=1000)
    X = vectorizer.fit_transform(df['text']).toarray()

    # Encode labels as integers (matches sparse_categorical_crossentropy)
    le = LabelEncoder()
    y = le.fit_transform(df['category'])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # max_features is only an upper bound on the vocabulary size; a small
    # corpus yields fewer columns, so size the input layer from the data
    # instead of hard-coding 1000.
    n_features = X.shape[1]

    # Create and train model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(
            64, activation='relu', input_shape=(n_features,)
        ),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(len(le.classes_), activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Train model; the held-out split is used as validation data
    history = model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=10,
        batch_size=32
    )

    return model, history, vectorizer, le

# Train the example classifier on the generated dataset and keep the trained
# model, its fit history, the TF-IDF vectorizer and the label encoder.
model, history, vectorizer, le = example_classification(df)

## Task 2: Entity Extraction

In [None]:
def implement_ner():
    """Placeholder for the participant's named entity recognition step.

    Suggested steps:
        1. Load NER model
        2. Extract entities
        3. Analyze results
        4. Visualize entities
    """
    # Your code here
    return None

# Example solution structure:
def example_ner(df):
    """Extract named entities from every article with spaCy.

    Args:
        df: DataFrame with a 'text' column of article strings.

    Returns:
        A list with one entry per article; each entry is a list of
        ``(entity_text, entity_label)`` tuples found in that article.
    """
    # Small English pipeline; it must already be installed locally
    nlp = spacy.load('en_core_web_sm')

    def _entities_of(text):
        # Run the pipeline on one article and collect its entity spans
        parsed = nlp(text)
        return [(span.text, span.label_) for span in parsed.ents]

    return [_entities_of(article) for article in df['text']]

# Extract entities for every article: one list of (text, label) pairs per row
entities = example_ner(df)

## Task 3: Topic Analysis

In [None]:
def implement_topic_modeling():
    """Placeholder for the participant's topic modeling step.

    Suggested steps:
        1. Preprocess text
        2. Create topic model
        3. Extract topics
        4. Visualize results
    """
    # Your code here
    return None

# Example solution structure:
def example_topic_modeling(df):
    """Fit a 4-topic LDA model on the lower-cased, tokenized articles.

    Args:
        df: DataFrame with a 'text' column of article strings.

    Returns:
        A tuple ``(lda_model, corpus, dictionary)``: the trained gensim
        LdaModel, the bag-of-words corpus it was trained on, and the gensim
        Dictionary mapping token ids to tokens.
    """
    # Tokenize text. Call the tokenizer through the imported `nltk` module;
    # the bare name `word_tokenize` is not imported in this file.
    # NOTE: NLTK's tokenizer data (e.g. via nltk.download('punkt')) must be
    # available locally, otherwise word_tokenize raises LookupError.
    texts = [
        nltk.word_tokenize(text.lower()) for text in df['text']
    ]

    # Create dictionary
    dictionary = corpora.Dictionary(texts)

    # Create corpus (bag-of-words representation of each article)
    corpus = [dictionary.doc2bow(tokens) for tokens in texts]

    # Train LDA model
    lda_model = models.LdaModel(
        corpus,
        num_topics=4,
        id2word=dictionary
    )

    return lda_model, corpus, dictionary

# Fit the example LDA model; keep the model, bag-of-words corpus and dictionary
lda_model, corpus, dictionary = example_topic_modeling(df)

## Task 4: Summary Generation

In [None]:
def implement_summarization():
    """Placeholder for the participant's text summarization step.

    Suggested steps:
        1. Load summarization model
        2. Generate summaries
        3. Evaluate quality
        4. Compare results
    """
    # Your code here
    return None

# Example solution structure:
def example_summarization(df):
    """Summarize each article with a transformers summarization pipeline.

    Articles of 10 whitespace-separated words or fewer are returned
    unchanged; longer ones are replaced by the pipeline's summary.

    Args:
        df: DataFrame with a 'text' column of article strings.

    Returns:
        A list of strings, one per article, in the same order as df['text'].
    """
    # Default summarization pipeline (no explicit model is specified)
    summarizer = pipeline('summarization')

    def _summarize(text):
        # Too short to be worth summarizing: pass the text through as-is
        if len(text.split()) <= 10:
            return text
        outputs = summarizer(text, max_length=30, min_length=10)
        return outputs[0]['summary_text']

    return [_summarize(article) for article in df['text']]

# Summarize every article; articles of 10 words or fewer are kept unchanged
summaries = example_summarization(df)

## Evaluation Criteria

Your solution will be evaluated based on:

1. Classification (25%)
   - Model accuracy
   - Feature engineering
   - Implementation quality

2. Entity Extraction (25%)
   - Entity coverage
   - Accuracy
   - Error analysis

3. Topic Analysis (25%)
   - Topic coherence
   - Topic interpretability
   - Visualization

4. Summary Generation (25%)
   - Summary quality
   - Conciseness
   - Information retention

## Submission Guidelines
1. Complete all tasks in this notebook
2. Document your approach and decisions
3. Include visualizations and analysis
4. Provide suggestions for improvement