### Setting Up the Environment

In [38]:
import re
import emoji
import contractions
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [39]:
# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1002)>


False

In [33]:
def load_data(file_path):
    """
    Load data from a CSV file.

    """
    try:
        data = pd.read_csv(file_path, on_bad_lines='skip')
        print("Data loaded successfully.")
        return data
    except Exception as e:
        print("Error loading data:", e)
        return None

In [26]:
def analyze_data(df):
    """
    Analyze the data by printing summaries and distributions.

    """
    print("Data Shape:", df.shape)
    print("\nFirst five rows:")
    print(df.head())

    print("\nSeverity distribution:")
    print(df['severity'].value_counts())

    print("\nSentiment distribution:")
    print(df['sentiment'].value_counts())

In [27]:
def visualize_data(df):
    """
    Visualize key distributions in the data.

    """
    # Distribution of issue categories
    plt.figure(figsize=(10, 5))
    sns.countplot(x='issue_category', data=df, palette="viridis")
    plt.title("Issue Category Distribution")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # Distribution of sentiment
    plt.figure(figsize=(6, 4))
    sns.countplot(x='sentiment', data=df, palette="magma")
    plt.title("Sentiment Distribution")
    plt.tight_layout()
    plt.show()

In [28]:

def remove_emojis(text):
    """Remove emojis from the text."""
    return emoji.replace_emoji(text, replace="")

def expand_contractions(text):
    """Expand contractions, e.g., don't -> do not."""
    return contractions.fix(text)

def remove_urls(text):
    """Remove URLs from the text."""
    return re.sub(r'http\S+|www\S+', '', text)

def remove_mentions_hashtags(text):
    """Remove social media mentions and hashtags."""
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    return text

def remove_punctuation(text):
    """Remove punctuation characters from the text."""
    return re.sub(r'[^\w\s]', '', text)

def remove_numbers(text):
    """Remove digits from the text."""
    return re.sub(r'\d+', '', text)

def normalize_repeated_chars(text):
    """
    Normalize words with excessive repeated characters.
    E.g., 'soooo' becomes 'so'.
    """
    return re.sub(r'(.)\1{2,}', r'\1', text)

def remove_extra_whitespace(text):
    """Remove extra whitespace and trim the text."""
    return re.sub(r'\s+', ' ', text).strip()

def tokenize_and_lower(text):
    """Tokenize the text and convert tokens to lowercase."""
    tokens = word_tokenize(text)
    return [token.lower() for token in tokens]

def remove_stopwords(tokens):
    """Remove stopwords from the list of tokens."""
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

def lemmatize_tokens(tokens):
    """Lemmatize tokens to their base form."""
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]



def preprocess_text(text):
    """
    Clean and preprocess text using lemmatization.

    Steps:
    - Remove emojis.
    - Expand contractions.
    - Remove punctuation, URLs, mentions, hashtags, and numbers.
    - Normalize repeated characters.
    - Tokenize and lowercase.
    - Remove stopwords.
    - Lemmatize tokens.
    """
    text = remove_emojis(text)
    text = expand_contractions(text)
    text = remove_urls(text)
    text = remove_mentions_hashtags(text)
    text = remove_punctuation(text)
    text = remove_numbers(text)
    text = normalize_repeated_chars(text)
    text = remove_extra_whitespace(text)

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


def remove_duplicates(df, column='text'):
    """
    Remove duplicate rows based on a specific column.

    Args:
        df (pd.DataFrame): The DataFrame containing the data.
        column (str): The column on which to base duplicate removal.

    Returns:
        pd.DataFrame: DataFrame with duplicate rows removed.
    """
    before = df.shape[0]
    df = df.drop_duplicates(subset=[column]).reset_index(drop=True)
    after = df.shape[0]
    print(f"Removed {before - after} duplicate rows.")
    return df


def comprehensive_process_data(df):
    """
    Process and clean the data with comprehensive text preprocessing.

    This function:
    - Removes duplicate rows based on the 'text' column.
    - Applies full text cleaning and preprocessing using both lemmatization and stemming.
    - Processes other columns (e.g., issue_category, severity, sentiment) as needed.

    Args:
        df (pd.DataFrame): The raw data DataFrame.

    Returns:
        pd.DataFrame: The cleaned and preprocessed DataFrame.
    """
    # Remove duplicate rows based on the 'text' column
    df = remove_duplicates(df, column='text')

    # Apply comprehensive text preprocessing
    df['text_clean'] = df['text'].apply(preprocess_text)
    df['text_stemmed'] = df['text'].apply(preprocess_text_stemming)

    # Process additional columns as needed
    df['issue_category'] = df['issue_category'].astype(str).str.strip()
    df['severity'] = df['severity'].fillna('Unknown')
    df['sentiment'] = df['sentiment'].fillna('Neutral')

    print("Comprehensive text processing complete.")
    return df


In [35]:
file_path = "../Data/processed/community_issues_dataset_template.csv"


load_data(file_path)


Data loaded successfully.


Unnamed: 0,text,issue_category,severity,sentiment
0,"Dear Sir/Madam, I am writing to express my dee...",Air Pollution,High,Negative
1,"Dear Council Team, I am writing to report the ...",Traffic Congestion,Medium,Negative
2,"To Whom It May Concern, I wish to bring to you...",Potholes,High,Negative
3,"Dear Sir/Madam, I am writing to highlight the ...",Noise Pollution,Low,Negative
4,"Dear Council Representative, I am compelled to...",Lack of Affordable Housing,High,Negative
...,...,...,...,...
1422,"Dear Council, I’m writing to express my concer...",Crime Rates in Urban Areas,Medium,Neutral
1423,"Dear Council, I’m writing to share my positive...",Crime Rates in Urban Areas,Low,Positive
1424,"Dear Council, I’m writing to express my concer...",Crime Rates in Urban Areas,High,Negative
1425,"Dear Council, I’m writing to share my neutral ...",Crime Rates in Urban Areas,Medium,Neutral


In [37]:
# Convert CSV to DataFrame using load_data()
df = load_data(file_path)

analyze_data(df)

Data loaded successfully.
Data Shape: (1427, 4)

First five rows:
                                                text  \
0  Dear Sir/Madam, I am writing to express my dee...   
1  Dear Council Team, I am writing to report the ...   
2  To Whom It May Concern, I wish to bring to you...   
3  Dear Sir/Madam, I am writing to highlight the ...   
4  Dear Council Representative, I am compelled to...   

               issue_category severity sentiment  
0               Air Pollution     High  Negative  
1          Traffic Congestion   Medium  Negative  
2                    Potholes     High  Negative  
3             Noise Pollution      Low  Negative  
4  Lack of Affordable Housing     High  Negative  

Severity distribution:
severity
High        1147
Medium       157
Low          121
severity       1
Name: count, dtype: int64

Sentiment distribution:
sentiment
Negative     1150
Positive      152
Neutral       123
sentiment       1
Name: count, dtype: int64


In [29]:

# Example usage within your main pipeline
if __name__ == '__main__':
    # Update the file path to your data CSV file
    file_path = "Data/processed/community_issues_dataset_template.csv"
    try:
        df = pd.read_csv(file_path)
        print("Data loaded successfully.")
    except Exception as e:
        print("Error loading data:", e)
        df = None

    if df is not None:
        df = comprehensive_process_data(df)
        print(df.head())

Error loading data: [Errno 2] No such file or directory: 'Data/processed/community_issues_dataset_template.csv'


### Unsupervised Classification

In [None]:
def unsupervised_classification(texts, num_clusters=2):
    """
    Convert texts to TF-IDF vectors and cluster them using KMeans.

    Args:
        texts (list of str): The list of letter texts.
        num_clusters (int): Number of clusters to form.

    Returns:
        labels (list of int): Cluster labels for each text.
        vectorizer (TfidfVectorizer): Fitted vectorizer.
        kmeans (KMeans): Fitted KMeans model.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(texts)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(X)
    return kmeans.labels_, vectorizer, kmeans



def label_clusters(vectorizer, kmeans, local_keywords=['problem', 'issue', 'concern', 'maintenance', 'litter', 'noise']):
    """
    Inspect cluster centroids to assign each cluster a label.
    If a cluster's top terms include any local-problem keywords, label it as "Local Problem"; otherwise, "New Initiatives".

    Args:
        vectorizer (TfidfVectorizer): The TF-IDF vectorizer.
        kmeans (KMeans): The fitted KMeans model.
        local_keywords (list): Keywords to flag a cluster as local problems.

    Returns:
        dict: Mapping from cluster index to category label.
    """
    cluster_labels = {}
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    for i in range(kmeans.n_clusters):
        top_terms = [terms[ind] for ind in order_centroids[i, :10]]
        # If any local keyword is among the top terms, label as "Local Problem"
        if any(keyword in top_terms for keyword in local_keywords):
            cluster_labels[i] = "Local Problem"
        else:
            cluster_labels[i] = "New Initiatives"
    return cluster_labels
