# NLP Analysis of Public Complaints - Pipeline Development

This notebook develops the NLP pipeline for analyzing public complaint data including:
- Sentiment Analysis
- Complaint Categorization
- Keyword Extraction
- Word Cloud Generation

In [None]:
# Import required libraries
import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from wordcloud import STOPWORDS, WordCloud
warnings.filterwarnings('ignore')

# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)

## 1. Data Loading and Preprocessing

In [None]:
def load_complaint_data(file_path):
    """
    Load complaint data from a CSV or Excel file into a DataFrame.

    The reader is chosen from the file extension. The extension is compared
    case-insensitively, so names such as ``DATA.CSV`` or ``report.XLSX`` are
    accepted, and ``pathlib.Path`` objects work as well as plain strings.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents as returned by ``pd.read_csv`` / ``pd.read_excel``.

    Raises
    ------
    ValueError
        If the extension is not ``.csv``, ``.xlsx`` or ``.xls``.
    """
    # Lower-case copy is used only for the extension check; the original
    # value is what gets handed to pandas so the real path is untouched.
    normalized_name = str(file_path).lower()

    if normalized_name.endswith('.csv'):
        return pd.read_csv(file_path)
    if normalized_name.endswith(('.xlsx', '.xls')):
        return pd.read_excel(file_path)

    raise ValueError("Unsupported file format. Please use CSV or Excel files.")

def preprocess_text(text):
    """
    Clean and normalise a single piece of text.

    Steps: lower-case, remove apostrophes, turn punctuation/digits/underscores
    into word separators, drop any remaining character that is not ``a-z`` or
    whitespace, and collapse runs of whitespace into single spaces.

    Punctuation and digits are replaced by a space rather than deleted so that
    words separated only by a symbol are not glued together
    (``"slow,unreliable"`` -> ``"slow unreliable"``).

    Parameters
    ----------
    text : object
        Value to clean; it is converted with ``str()``. Missing values
        (``None`` / NaN) yield an empty string.

    Returns
    -------
    str
        Lower-case text containing only ``a-z`` and single spaces.
    """
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = str(text).lower()

    # Apostrophes are removed outright so contractions stay one token
    # ("don't" -> "dont") instead of leaving a stray "t".
    text = re.sub(r"['’]", '', text)

    # Punctuation, digits and underscores act as word boundaries.
    text = re.sub(r'[^\w\s]|[\d_]', ' ', text)

    # Whatever is left that is not a-z or whitespace (e.g. accented letters)
    # is dropped.
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove extra whitespace
    return ' '.join(text.split())

## 2. Sentiment Analysis

In [None]:
def analyze_sentiment(text):
    """
    Score one text with TextBlob and attach a coarse sentiment label.

    Returns a ``(polarity, subjectivity, label)`` tuple, where polarity is in
    -1..1, subjectivity is in 0..1, and label is ``'positive'`` when polarity
    is above 0.1, ``'negative'`` when it is below -0.1, and ``'neutral'``
    otherwise. Empty or missing input short-circuits to ``(0, 0, 'neutral')``.
    """
    # Nothing to analyse: report a neutral result without invoking TextBlob.
    if not text or pd.isna(text):
        return 0, 0, 'neutral'

    scores = TextBlob(str(text)).sentiment
    polarity, subjectivity = scores.polarity, scores.subjectivity

    # Start from neutral and only move away when polarity clears the
    # +/-0.1 dead band.
    sentiment_label = 'neutral'
    if polarity > 0.1:
        sentiment_label = 'positive'
    elif polarity < -0.1:
        sentiment_label = 'negative'

    return polarity, subjectivity, sentiment_label

def batch_sentiment_analysis(texts):
    """
    Perform sentiment analysis on an iterable of texts.

    Each text is scored with ``analyze_sentiment`` and the results are
    collected row by row, in input order.

    Parameters
    ----------
    texts : iterable
        Texts to score.

    Returns
    -------
    pandas.DataFrame
        One row per input with columns ``polarity``, ``subjectivity`` and
        ``sentiment``. The columns are present even when ``texts`` is empty,
        so callers can index ``['sentiment']`` unconditionally.
    """
    columns = ['polarity', 'subjectivity', 'sentiment']

    # analyze_sentiment returns (polarity, subjectivity, label), which lines
    # up positionally with `columns`.
    rows = [analyze_sentiment(text) for text in texts]

    # Passing `columns` explicitly keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)

## 3. Complaint Categorization

In [None]:
def categorize_complaints(texts, n_categories=5):
    """
    Categorize complaints using K-means clustering on TF-IDF vectors.

    Texts are cleaned with ``preprocess_text``; those that are empty after
    cleaning cannot be vectorised and are left out of the clustering. The
    returned cluster array nevertheless has one entry per *input* text, so it
    can be attached directly to the originating rows.

    Parameters
    ----------
    texts : iterable
        Raw complaint texts.
    n_categories : int, default 5
        Requested number of clusters. Reduced to the number of non-empty
        texts when fewer are available.

    Returns
    -------
    clusters : numpy.ndarray of int
        Cluster id for each input text, in input order; ``-1`` marks a text
        that was empty after preprocessing.
    cluster_labels : list of str
        For each cluster, its three highest-weighted centroid terms joined by
        spaces. Empty when nothing could be clustered.
    vectorizer : TfidfVectorizer or None
        The fitted vectorizer, or ``None`` when nothing could be clustered.
    kmeans : KMeans or None
        The fitted model, or ``None`` when nothing could be clustered.
    """
    cleaned = [preprocess_text(text) for text in texts]

    # Keep track of where every usable document came from so cluster ids can
    # be written back to the matching input position.
    kept_positions = [pos for pos, doc in enumerate(cleaned) if doc.strip()]
    documents = [cleaned[pos] for pos in kept_positions]

    clusters = np.full(len(cleaned), -1, dtype=int)
    if not documents:
        # Fitting the vectorizer on zero documents would raise; report
        # "nothing categorised" instead.
        return clusters, [], None, None

    # K-means cannot produce more clusters than there are documents.
    if len(documents) < n_categories:
        n_categories = len(documents)

    # Create TF-IDF vectors
    vectorizer = TfidfVectorizer(
        max_features=100,
        stop_words='english',
        ngram_range=(1, 2)
    )
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Perform K-means clustering (fixed seed for repeatable assignments)
    kmeans = KMeans(n_clusters=n_categories, random_state=42, n_init=10)
    clusters[kept_positions] = kmeans.fit_predict(tfidf_matrix)

    # Label each cluster with its three strongest centroid terms,
    # strongest first.
    feature_names = vectorizer.get_feature_names_out()
    cluster_labels = []
    for centroid in kmeans.cluster_centers_:
        top_indices = centroid.argsort()[-3:][::-1]
        cluster_labels.append(' '.join(feature_names[idx] for idx in top_indices))

    return clusters, cluster_labels, vectorizer, kmeans

## 4. Keyword Extraction

In [None]:
def extract_keywords(texts, top_n=20):
    """
    Extract top keywords using TF-IDF.

    Texts are cleaned with ``preprocess_text``; missing values and texts that
    are empty after cleaning are ignored. Unigrams and bigrams are scored by
    their TF-IDF weight summed over all documents.

    Parameters
    ----------
    texts : iterable
        Raw texts.
    top_n : int, default 20
        Maximum number of keywords to return. The vectorizer vocabulary is
        capped at ``top_n * 2`` terms.

    Returns
    -------
    list of (str, float)
        ``(term, summed_score)`` pairs sorted by score, highest first. Empty
        when there is no usable text.
    """
    cleaned = (preprocess_text(text) for text in texts if text and not pd.isna(text))

    # Filter again *after* cleaning: a text made only of digits or symbols is
    # truthy on the way in but empty on the way out.
    documents = [doc for doc in cleaned if doc]

    if not documents:
        return []

    # Create TF-IDF vectors
    vectorizer = TfidfVectorizer(
        max_features=top_n * 2,
        stop_words='english',
        ngram_range=(1, 2)
    )

    tfidf_matrix = vectorizer.fit_transform(documents)

    # Column sums give one aggregate score per term; .A1 flattens the
    # resulting 1 x n matrix to a plain 1-D array.
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = tfidf_matrix.sum(axis=0).A1

    # Create keyword-score pairs, best first
    keyword_scores = sorted(
        zip(feature_names, tfidf_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return keyword_scores[:top_n]

## 5. Word Cloud Generation

In [None]:
def generate_wordcloud(texts, width=800, height=400):
    """
    Generate a word cloud from complaint texts.

    All non-missing texts are cleaned with ``preprocess_text`` and joined into
    one document. Common English stop words and a handful of domain words
    ('complaint', 'issue', 'problem', 'service', 'customer') are excluded.

    Parameters
    ----------
    texts : iterable
        Raw texts.
    width, height : int
        Size of the rendered cloud in pixels.

    Returns
    -------
    WordCloud or None
        The generated cloud, or ``None`` when there is no text to draw
        (no input, or every word was filtered out).
    """
    # Combine all texts
    combined_text = ' '.join(
        preprocess_text(text) for text in texts if text and not pd.isna(text)
    )

    if not combined_text.strip():
        return None

    # Supplying `stopwords` replaces WordCloud's built-in list rather than
    # extending it, so the defaults are merged in explicitly; otherwise
    # "the", "and", ... would dominate the picture.
    domain_stopwords = {'complaint', 'issue', 'problem', 'service', 'customer'}

    cloud = WordCloud(
        width=width,
        height=height,
        background_color='white',
        stopwords=STOPWORDS | domain_stopwords,
        max_words=100,
        colormap='viridis'
    )

    try:
        return cloud.generate(combined_text)
    except ValueError:
        # WordCloud raises ValueError when no word survives stop-word
        # filtering; treat that the same as "no text".
        return None

## 6. Complete NLP Pipeline Function

In [None]:
def complete_nlp_analysis(df, text_column):
    """
    Run every analysis stage on one text column of a DataFrame.

    The stages run in this order, each announced with a progress message:
    sentiment analysis, complaint categorization, keyword extraction and
    word cloud generation.

    Returns a dict with the keys ``'sentiment'``, ``'categories'`` (itself a
    dict holding ``'clusters'``, ``'labels'``, ``'vectorizer'`` and
    ``'model'``), ``'keywords'`` and ``'wordcloud'``.
    """
    # Missing cells become empty strings so every stage receives plain text.
    texts = df[text_column].fillna('').tolist()

    print("Performing sentiment analysis...")
    sentiment_frame = batch_sentiment_analysis(texts)

    print("Categorizing complaints...")
    cluster_ids, category_labels, fitted_vectorizer, fitted_model = categorize_complaints(texts)

    print("Extracting keywords...")
    top_keywords = extract_keywords(texts)

    print("Generating word cloud...")
    cloud = generate_wordcloud(texts)

    return {
        'sentiment': sentiment_frame,
        'categories': {
            'clusters': cluster_ids,
            'labels': category_labels,
            'vectorizer': fitted_vectorizer,
            'model': fitted_model,
        },
        'keywords': top_keywords,
        'wordcloud': cloud,
    }

## 7. Test with Sample Data

In [None]:
# Ten short example texts used to exercise the pipeline end to end.
# The last three are praise rather than complaints.
sample_complaints = [
    "The internet service is extremely slow and keeps disconnecting",
    "Billing department charged me twice for the same service",
    "Customer service representative was very rude and unhelpful",
    "Water supply has been disrupted for three days without notice",
    "Garbage collection has been irregular in our neighborhood",
    "Road maintenance is poor, many potholes need fixing",
    "Public transportation is always delayed and overcrowded",
    "Hospital staff provided excellent care during emergency",
    "Library services have improved significantly this year",
    "Park maintenance team does a great job keeping it clean",
]

# Wrap the texts in a frame with a 1-based id and one consecutive calendar
# day per row, starting 2024-01-01.
n_samples = len(sample_complaints)
sample_df = pd.DataFrame(
    {
        'complaint_id': range(1, n_samples + 1),
        'complaint_text': sample_complaints,
        'date': pd.date_range('2024-01-01', periods=n_samples, freq='D'),
    }
)

print("Sample Data:")
print(sample_df.head())

In [None]:
# Run every pipeline stage on the sample frame and print a text summary.
results = complete_nlp_analysis(sample_df, 'complaint_text')

print("\n=== NLP Analysis Results ===")

# How many texts fell under each sentiment label
print("\nSentiment Distribution:")
print(results['sentiment']['sentiment'].value_counts())

# Ten highest-scoring keywords with their scores
print("\nTop Keywords:")
for keyword, score in results['keywords'][:10]:
    print(f"{keyword}: {score:.3f}")

# Each category's label plus the number of texts assigned to it
print("\nComplaint Categories:")
category_info = results['categories']
for i, label in enumerate(category_info['labels']):
    count = int((np.asarray(category_info['clusters']) == i).sum())
    print(f"Category {i+1}: {label} ({count} complaints)")

In [None]:
# Four-panel overview: sentiment shares, polarity spread, category sizes
# and strongest keywords.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_share, ax_polarity), (ax_category, ax_keyword) = axes

# Top-left: share of each sentiment label
sentiment_counts = results['sentiment']['sentiment'].value_counts()
ax_share.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%')
ax_share.set_title('Sentiment Distribution')

# Top-right: histogram of polarity scores
ax_polarity.hist(results['sentiment']['polarity'], bins=10, alpha=0.7)
ax_polarity.set_title('Sentiment Polarity Distribution')
ax_polarity.set_xlabel('Polarity')
ax_polarity.set_ylabel('Frequency')

# Bottom-left: number of texts per cluster id, ordered by id
category_counts = pd.Series(results['categories']['clusters']).value_counts().sort_index()
ax_category.bar(range(len(category_counts)), category_counts.values)
ax_category.set_title('Complaint Categories')
ax_category.set_xlabel('Category')
ax_category.set_ylabel('Count')

# Bottom-right: the eight best keywords as horizontal bars
top_keywords = results['keywords'][:8]
keywords, scores = zip(*top_keywords)
bar_positions = range(len(keywords))
ax_keyword.barh(bar_positions, scores)
ax_keyword.set_yticks(bar_positions)
ax_keyword.set_yticklabels(keywords)
ax_keyword.set_title('Top Keywords')
ax_keyword.set_xlabel('TF-IDF Score')

plt.tight_layout()
plt.show()

In [None]:
# Show the word cloud if the pipeline produced one; otherwise say why not.
if not results['wordcloud']:
    print("No word cloud generated - insufficient text data")
else:
    plt.figure(figsize=(12, 6))
    plt.imshow(results['wordcloud'], interpolation='bilinear')
    plt.axis('off')  # a word cloud has no meaningful axes
    plt.title('Complaint Word Cloud', fontsize=16)
    plt.show()