### Setting Up the Environment

In [None]:
!pip install pandas numpy scikit-learn spacy transformers flask fastapi matplotlib seaborn

### Data Ingestion

In [None]:
import os
import pandas as pd
from pdfminer.high_level import extract_text

def extract_text_from_pdf(pdf_path):
    """Return the text pdfminer's ``extract_text`` produces for the PDF at ``pdf_path``."""
    pdf_text = extract_text(pdf_path)
    return pdf_text

def load_data_from_folder(folder_path):
    """Load every PDF file found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A DataFrame with one row per PDF and the columns ``filename`` and
        ``text``. Both columns are present even when no PDF is found, so
        later column lookups do not fail on an empty folder.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    records = []
    # os.listdir returns entries in arbitrary order; sort for reproducible rows.
    for filename in sorted(os.listdir(folder_path)):
        # Accept ".PDF" as well as ".pdf".
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, filename)
        # A directory whose name ends in ".pdf" is not a document.
        if not os.path.isfile(file_path):
            continue
        records.append({"filename": filename, "text": extract_text_from_pdf(file_path)})
    # Explicit columns keep the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=["filename", "text"])

# Read the PDF letters from the input folder into `df` and preview the first rows
data_folder = "path/to/letters"
df = load_data_from_folder(folder_path=data_folder)
print(df.head(n=5))

### Text Preprocessing

In [None]:
import re
import spacy

# Load the spaCy model. Only lemmas and stop-word/alpha flags are used in this
# notebook, so the dependency parser and named-entity recognizer are disabled
# to avoid paying for them on every letter. They stay installed and can be
# re-enabled with nlp.enable_pipe(...) if needed.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def clean_text(text):
    """Normalize raw letter text to lowercase ASCII words separated by single spaces.

    Args:
        text: Raw text of one letter.

    Returns:
        The text with apostrophes removed, every other non-letter character
        replaced by a space, whitespace runs collapsed, and letters lowercased.
    """
    # Drop apostrophes outright so contractions stay one word ("don't" -> "dont").
    text = re.sub(r"['\u2019]", "", text)
    # Replace (rather than delete) other special characters and digits, so
    # neighbouring words are not fused ("noise/pollution" -> "noise pollution").
    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    # Collapse the whitespace runs the substitution above can leave behind.
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

def preprocess_text(text):
    """Return the space-joined lemmas of the alphabetic, non-stop-word tokens in ``text``.

    The text is tokenized with the module-level spaCy pipeline ``nlp``.
    """
    lemmas = []
    for token in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

# Derive the cleaned and the lemmatized text columns, then preview the frame
df["cleaned_text"] = df["text"].map(clean_text)
df["processed_text"] = df["cleaned_text"].map(preprocess_text)
print(df.head(n=5))

### Issue Categorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Issue categories the classifier is expected to emit (example set)
categories = ["Air Pollution", "Traffic Congestion", "Potholes", "Noise Pollution"]

# Sample labeled data (replace with your own). Every entry of `categories`
# needs at least one example here, otherwise the classifier can never predict it.
labeled_data = [
    {"text": "The air quality in our area is terrible.", "category": "Air Pollution"},
    {"text": "The traffic on Main Street is unbearable.", "category": "Traffic Congestion"},
    {"text": "There are potholes everywhere on Elm Road.", "category": "Potholes"},
    {"text": "The construction noise at night keeps everyone awake.", "category": "Noise Pollution"},
]

# Convert labeled data to DataFrame
labeled_df = pd.DataFrame(labeled_data)


def normalize_for_classifier(text):
    """Clean and lemmatize ``text`` exactly as the letters are preprocessed."""
    return preprocess_text(clean_text(text))


# Create a text classification pipeline. The normalization lives inside the
# vectorizer so training and prediction always see identically processed text.
# Previously the model was fitted on raw sentences ("potholes") but asked to
# predict on lemmatized text ("pothole"), so the vocabularies did not line up.
model = Pipeline([
    ("tfidf", TfidfVectorizer(preprocessor=normalize_for_classifier)),
    ("clf", MultinomialNB()),
])

# Train the model on raw text; the pipeline normalizes it internally
model.fit(labeled_df["text"], labeled_df["category"])

# Predict categories for new letters. Raw text is passed because the pipeline
# applies the normalization itself (passing `processed_text` would normalize twice).
df["predicted_category"] = model.predict(df["text"])
print(df[["text", "predicted_category"]].head())

### Sentiment Analysis

In [None]:
from transformers import pipeline

# Load pre-trained sentiment analysis model. The checkpoint is pinned explicitly
# instead of relying on the library's implicit default, so results stay
# reproducible if that default changes between transformers releases.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

def analyze_sentiment(text):
    """Classify the sentiment of ``text`` with the module-level ``sentiment_analyzer``.

    Args:
        text: The text to score.

    Returns:
        A ``(label, score)`` tuple taken from the analyzer's first result.
    """
    # Letters can be longer than the model's maximum input length; without
    # truncation the pipeline raises on such inputs instead of scoring them.
    result = sentiment_analyzer(text, truncation=True)[0]
    return result["label"], result["score"]

# Apply sentiment analysis to the dataset.
# The model is run once per letter and the (label, score) pair is split
# afterwards; calling analyze_sentiment separately for each column would double
# the inference cost. The original text is scored (as the /predict endpoint
# does) because the pretrained model expects natural sentences with punctuation.
sentiment_results = [analyze_sentiment(letter_text) for letter_text in df["text"]]
df["sentiment"] = [label for label, _ in sentiment_results]
df["sentiment_score"] = [score for _, score in sentiment_results]
print(df[["text", "sentiment", "sentiment_score"]].head())

### Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many letters fall into each predicted category,
# with bars ordered by the category counts from value_counts()
category_order = df["predicted_category"].value_counts().index
fig_categories, ax_categories = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="predicted_category", order=category_order, ax=ax_categories)
ax_categories.set_title("Distribution of Issue Categories")
plt.xticks(rotation=45)
plt.show()

# Bar chart of how many letters received each sentiment label
fig_sentiment, ax_sentiment = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x="sentiment", ax=ax_sentiment)
ax_sentiment.set_title("Distribution of Sentiment")
plt.show()

### Exporting Results

In [None]:
# Write the enriched DataFrame to disk as CSV (without the index column)
df.to_csv(path_or_buf="processed_letters.csv", index=False)

# Write the same data as a JSON array with one object per row
df.to_json(path_or_buf="processed_letters.json", orient="records")

### Building an API

In [None]:
from flask import Flask, request, jsonify

# Flask application object; the /predict route below is registered on it.
app = Flask(__name__)

@app.route("/predict", methods=["POST"])
def predict():
    """Classify one letter sent as JSON and return its category and sentiment.

    Expects a JSON object with a non-empty string field ``text``.

    Returns:
        A JSON response ``{"category": ..., "sentiment": ...}`` on success, or
        ``{"error": ...}`` with HTTP status 400 when the body is not valid
        JSON or lacks a usable ``text`` field.
    """
    # silent=True yields None for a missing or malformed JSON body instead of
    # raising, so every bad request can be answered with the same 400 below.
    payload = request.get_json(silent=True)
    text = payload.get("text") if isinstance(payload, dict) else None
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Request body must be JSON with a non-empty string field 'text'."}), 400

    # Convert to a plain str so the value is always JSON-serializable.
    category = str(model.predict([text])[0])
    sentiment = analyze_sentiment(text)[0]
    return jsonify({"category": category, "sentiment": sentiment})

if __name__ == "__main__":
    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server execute arbitrary code; keep it off by default.
    app.run(debug=False)