# Exploratory Data Analysis (EDA)
This notebook performs an exploratory data analysis (EDA) of the dataset and also creates the cleaned and anonymized versions of the data.

# Imports and Dependencies



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import sys

sys.path.append("..")
from src.data.utils import anonymize_text

In [None]:
# Fetch the NLTK "stopwords" corpus that the cleaning section loads via stopwords.words("english").
nltk.download("stopwords")

# Load the Dataset


In [None]:
# Load the training data; later cells read its "text" and "sensitive_label" columns.
# NOTE(review): "../PATH_TO_DATA" looks like a placeholder — point it at the actual CSV before running.
df = pd.read_csv("../PATH_TO_DATA")

# Initial Data Overview


In [None]:
# Quick structural overview: shape, first rows, dtypes and missing values.
print("DataFrame Shape:", df.shape)
try:
    display(df.head(10))

    print("\nDataFrame Info:")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in display() (that would render a stray "None").
    df.info()

    print("\nCheck for Missing Values:")
    display(df.isnull().sum())
except NameError:
    # display() is only injected by IPython/Jupyter; when the notebook is run
    # as a plain script the name is missing. Any other error is a real problem
    # and is allowed to propagate instead of being reported as "docker".
    print("We are in a docker")

# Distribution of Target Variable



In [None]:
# Number of rows per target label, most frequent first.
label_counts = df["sensitive_label"].value_counts()

# One bar per label value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    x=label_counts.index,
    y=label_counts.values,
    palette="Set2",
    ax=ax,
)
ax.set_title("Distribution of sensitive_label")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
plt.show()

# Same numbers as text, below the figure.
print("Label Counts:", label_counts, sep="\n")

# Text Length Analysis

In [None]:
# Character length of every text; str() makes non-string cells measurable too.
df["text_length"] = df["text"].map(lambda value: len(str(value)))

# Histogram with a KDE overlay of the character lengths.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df["text_length"], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Text Lengths")
ax.set_xlabel("Text Length")
ax.set_ylabel("Frequency")
plt.show()

# Summary statistics (count, mean, std, quartiles, ...) for the lengths.
print("Basic statistics for text length:", df["text_length"].describe(), sep="\n")

# Basic Text Cleaning for EDA



In [None]:
# English stop words held in a set for fast membership tests in tokenize().
# Requires the NLTK "stopwords" corpus to be available locally.
stop_words = set(stopwords.words("english"))


def basic_cleaning(text):
    """
    Normalize whitespace in a text for EDA purposes.

    Every run of whitespace (spaces, tabs, newlines) is collapsed into a
    single space and leading/trailing whitespace is removed. No further
    cleaning is applied.

    Args:
        text: The value to clean. Missing values (None / NaN) become an
            empty string; any other non-string value is converted with str().

    Returns:
        The whitespace-normalized string.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself; treat it and None
        # as "no text" instead of letting re.sub raise a TypeError.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    # Collapse multiple whitespace characters into one space
    return re.sub(r"\s+", " ", text).strip()


# Whitespace-normalized copy of the text, used by the analyses below.
df["cleaned_text"] = df["text"].apply(basic_cleaning)
try:
    display(df[["text", "cleaned_text"]].head(5))
except NameError:
    # display() only exists inside IPython/Jupyter. Other exceptions are real
    # errors and are no longer masked by the "docker" message.
    print("We are in a docker")

# Word Frequency Analysis


In [None]:
def tokenize(text, stopword_set=None):
    """
    Split a text on whitespace and drop stop words.

    Stop words are matched case-insensitively (the stop list built from NLTK
    is lowercase, so "The" must be compared as "the"); the tokens that are
    kept retain their original casing.

    Args:
        text: The string to tokenize.
        stopword_set: Optional collection of lowercase stop words. Defaults to
            the notebook-level ``stop_words`` set.

    Returns:
        List of the remaining tokens, in their original order.
    """
    if stopword_set is None:
        stopword_set = stop_words
    return [w for w in text.split() if w.lower() not in stopword_set]


if "cleaned_text" in df.columns:
    # One flat list holding the filtered tokens of every document
    all_tokens = [token for document in df["cleaned_text"] for token in tokenize(document)]

    # Frequency table, reduced to the 20 most frequent words
    counter = Counter(all_tokens)
    common_words = counter.most_common(20)

    # Bar chart: words on the x axis, their frequency on the y axis
    words, counts = zip(*common_words)
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=list(words), y=list(counts), palette="Set2", ax=ax)
    plt.xticks(rotation=45)
    ax.set_title("Top 20 Most Common Words (after cleaning)")
    ax.set_xlabel("Words")
    ax.set_ylabel("Frequency")
    plt.show()

    # Same ranking as plain text
    print("Most Common Words:")
    for word, frequency in common_words:
        print(word, ":", frequency)

# Label-Wise Analysis


In [12]:
def analyze_labels(df, text_column, label_column):
    """
    Compare word counts and frequent words between label 0 and label 1.

    Prints the mean word count of each label, draws the two word-count
    histograms side by side, then prints the ten most frequent tokens
    (as produced by tokenize) of each label.

    Args:
        df: DataFrame containing the data
        text_column: Name of the column holding the text to analyze
        label_column: Name of the column holding the labels (0 and 1 are used)
    """
    # Texts of each class, in row order
    texts_by_label = {
        label: df.loc[df[label_column] == label, text_column].tolist()
        for label in (0, 1)
    }

    # Whitespace-delimited word count of every text, computed once per class
    word_counts = {
        label: [len(text.split()) for text in texts]
        for label, texts in texts_by_label.items()
    }

    # Mean word count per class
    print("Average Word Count Label 0:", np.mean(word_counts[0]))
    print("Average Word Count Label 1:", np.mean(word_counts[1]))

    # Word-count histograms, one panel per class
    fig, ax = plt.subplots(1, 2, figsize=(12, 6))
    panels = (
        (0, "skyblue", "Text Length for Label 0"),
        (1, "salmon", "Text Length for Label 1"),
    )
    for axis, (label, color, title) in zip(ax, panels):
        sns.histplot(word_counts[label], bins=30, ax=axis, color=color)
        axis.set_title(title)
        axis.set_xlabel("Word Count")

    plt.show()

    # Ten most frequent tokens per class
    top_words = {
        label: Counter(
            token for text in texts for token in tokenize(text)
        ).most_common(10)
        for label, texts in texts_by_label.items()
    }

    for label, heading in ((0, "\nTop 10 Words in Label 0:"), (1, "\nTop 10 Words in Label 1:")):
        print(heading)
        for word, frequency in top_words[label]:
            print(word, ":", frequency)

In [None]:
# Per-label word-count and top-word comparison on the whitespace-normalized text.
analyze_labels(df, "cleaned_text", "sensitive_label")

# Bigrams and Trigrams


In [None]:
from nltk import ngrams


def get_top_ngrams(corpus, n=2, top=10):
    """
    Returns a list of top n-grams from a corpus.

    Each document is tokenized with tokenize() (so stop words are already
    removed) and its n-grams are counted on their own. Counting per document
    prevents artificial n-grams made of the last tokens of one document and
    the first tokens of the next.

    Args:
        corpus: Iterable of text strings.
        n: Size of the n-grams (2 = bigrams, 3 = trigrams, ...).
        top: Number of most frequent n-grams to return.

    Returns:
        List of ``(ngram_tuple, count)`` pairs, most frequent first.
    """
    n_gram_counts = Counter()
    for text in corpus:
        # Documents with fewer than n tokens simply contribute nothing
        n_gram_counts.update(ngrams(tokenize(text), n))

    return n_gram_counts.most_common(top)


# Ten most frequent bigrams and trigrams over the cleaned text
top_bigrams = get_top_ngrams(df["cleaned_text"], n=2, top=10)
top_trigrams = get_top_ngrams(df["cleaned_text"], n=3, top=10)

# Print both rankings as "<words> : <count>" lines under their heading
for heading, ranking in (
    ("Top 10 Bigrams:", top_bigrams),
    ("\nTop 10 Trigrams:", top_trigrams),
):
    print(heading)
    for gram, count in ranking:
        print(" ".join(gram), ":", count)

# Analyze Similar Patterns with Different Labels

This analysis identifies and groups similar text patterns that have been labeled differently, 
which could indicate inconsistencies in the labeling process. The process:

1. Text Similarity Analysis:
   - Anonymizes texts to focus on patterns rather than specific values
   - Uses TF-IDF vectorization with n-grams (1-3) to capture text features
   - Calculates cosine similarity between texts to find similar patterns
   
2. Grouping and Analysis:
   - Groups texts with similarity above threshold (0.7)
   - Identifies groups with inconsistent labels
   - Suggests relabeling based on majority vote within each group
   
3. Output Generation:
   - Creates multiple dataset versions:
     * Relabeled: Original dataset with suggested label corrections
     * Clean: Dataset with all inconsistent groups removed
     * Anonymized versions of both
   - Generates visualizations of group sizes and label distributions
   - Exports detailed suggestions for manual review

This helps improve dataset quality by identifying potential labeling errors 
and providing options for handling inconsistencies.


In [None]:
def find_similar_groups(texts, labels, threshold=0.7):
    """
    Greedily group texts whose anonymized TF-IDF vectors are similar.

    Every text is passed through anonymize_text, vectorized with TF-IDF over
    1- to 3-grams, and compared with cosine similarity. Walking the texts in
    order, each text that is not yet part of a group becomes a seed; its group
    is made of all not-yet-grouped texts whose similarity to the seed is at
    least ``threshold``. A text therefore belongs to at most one group, which
    keeps relabel suggestions unambiguous and avoids double counting.

    Args:
        texts: Indexable sequence of text strings.
        labels: Indexable sequence of labels, aligned with ``texts``.
        threshold: Minimum cosine similarity to the seed for group membership.

    Returns:
        List of groups with more than one member. Each group is a list of
        dicts with the keys "index" (position in ``texts``), "text",
        "anonymized_text", "label" and "similarity" (to the group's seed).
    """
    # Fewer than two texts can never form a group (and the vectorizer's
    # min_df=2 would reject such a corpus).
    if len(texts) < 2:
        return []

    # Anonymize texts before comparison
    anonymized_texts = [anonymize_text(text) for text in texts]

    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=2)
    tfidf_matrix = vectorizer.fit_transform(anonymized_texts)

    # Calculate pairwise similarities
    similarities = cosine_similarity(tfidf_matrix)

    # Create groups of similar texts
    groups = []
    used_indices = set()

    for i in range(len(texts)):
        if i in used_indices:
            continue

        # Candidates: every text similar enough to the seed that has not
        # already been assigned to an earlier group
        group = [
            {
                "index": j,
                "text": texts[j],
                "anonymized_text": anonymized_texts[j],
                "label": labels[j],
                "similarity": similarities[i, j],
            }
            for j in map(int, np.flatnonzero(similarities[i] >= threshold))
            if j not in used_indices
        ]

        # Only keep groups with more than one text
        if len(group) > 1:
            groups.append(group)
            used_indices.update(item["index"] for item in group)

    return groups


def analyze_groups(groups):
    """
    Select the groups whose members do not all share the same label.

    Args:
        groups: List of groups; each group is a list of dicts that carry at
            least a "label" key.

    Returns:
        One dict per inconsistent group with the keys "texts" (the group
        itself), "label_counts" (label -> number of members) and
        "majority_label" (the most frequent label; on a tie, the tied label
        that appears first in the group).
    """
    flagged = []

    for members in groups:
        # Tally how often each label occurs, in order of first appearance
        tally = {}
        for member in members:
            tally[member["label"]] = tally.get(member["label"], 0) + 1

        # A single distinct label means the group is consistent
        if len(tally) <= 1:
            continue

        flagged.append(
            {
                "texts": members,
                "label_counts": tally,
                "majority_label": max(tally, key=tally.get),
            }
        )

    return flagged


# Run the similarity grouping on the cleaned text, then keep the groups whose labels disagree
groups = find_similar_groups(df["cleaned_text"].values, df["sensitive_label"].values)
inconsistent_groups = analyze_groups(groups)

# Report every inconsistent group
print(f"Found {len(inconsistent_groups)} groups with inconsistent labels\n")

for idx, group in enumerate(inconsistent_groups, start=1):
    majority = group["majority_label"]

    print(f"\nGroup {idx}:")
    print(f"Label distribution: {group['label_counts']}")
    print(f"Suggested label (majority): {majority}")
    print("\nTexts in this group:")

    for item in group["texts"]:
        print(f"\nOriginal Text (current label: {item['label']}):")
        print(f"  {item['text']}")
        print("Anonymized Text:")
        print(f"  {item['anonymized_text']}")
        # Flag members whose label differs from the group majority
        if item["label"] != majority:
            print(f"  ** Suggested relabel to: {majority} **")
    print("-" * 80)

# Number of group members whose label differs from their group's majority
total_texts_to_relabel = sum(
    1
    for group in inconsistent_groups
    for item in group["texts"]
    if item["label"] != group["majority_label"]
)

print("\nSummary:")
print(f"Total groups with inconsistencies: {len(inconsistent_groups)}")
print(f"Total texts suggested for relabeling: {total_texts_to_relabel}")

# One record per group member whose label differs from the group majority.
# "majority_percentage" is the majority label's share of the group as a fraction (0-1).
relabel_suggestions = [
    {
        "group_id": group_idx,
        "text": item["text"],
        "current_label": item["label"],
        "suggested_label": group["majority_label"],
        "group_size": len(group["texts"]),
        "majority_percentage": group["label_counts"][group["majority_label"]]
        / len(group["texts"]),
    }
    for group_idx, group in enumerate(inconsistent_groups, 1)
    for item in group["texts"]
    if item["label"] != group["majority_label"]
]

# Build the dataset variants.
#
# Rows are addressed through the positional "index" stored with every group
# member. The group "text" values come from df["cleaned_text"], so matching
# them against the raw df["text"] column would silently skip every row whose
# text was changed by basic_cleaning().
#
# The frames are built even when nothing needs relabeling so that later cells
# can always rely on df_relabeled / df_relabeled_anon / df_clean existing.

# Relabeled version (keeping only text and sensitive_label columns)
df_relabeled = df[["text", "sensitive_label"]].copy()
label_col_pos = df_relabeled.columns.get_loc("sensitive_label")
for group in inconsistent_groups:
    for item in group["texts"]:
        if item["label"] != group["majority_label"]:
            df_relabeled.iloc[item["index"], label_col_pos] = group["majority_label"]

# Clean version (removing all rows involved in inconsistencies)
rows_to_remove = {
    item["index"] for group in inconsistent_groups for item in group["texts"]
}
keep_mask = np.ones(len(df), dtype=bool)
keep_mask[np.array(sorted(rows_to_remove), dtype=int)] = False
df_clean = df.loc[keep_mask, ["text", "sensitive_label"]]

# Anonymized versions
df_relabeled_anon = df_relabeled.copy()
df_relabeled_anon["text"] = df_relabeled_anon["text"].apply(anonymize_text)

df_clean_anon = df_clean.copy()
df_clean_anon["text"] = df_clean_anon["text"].apply(anonymize_text)

# Files are only written (and statistics printed) when there is something to relabel
if relabel_suggestions:
    suggestions_df = pd.DataFrame(relabel_suggestions)

    # Save all versions
    print(
        f"\nSaving relabeled dataset ({len(df_relabeled)} rows) to '../PATH_TO_DATA/train_relabelled.csv'"
    )
    df_relabeled.to_csv("../PATH_TO_DATA/train_relabelled.csv", index=False)

    print(
        "Saving anonymized relabeled dataset to '../PATH_TO_DATA/train_relabelled_anon.csv'"
    )
    df_relabeled_anon.to_csv(
        "../PATH_TO_DATA/train_relabelled_anon.csv", index=False
    )

    print(
        f"\nSaving clean dataset ({len(df_clean)} rows) to '../PATH_TO_DATA/train_clean.csv'"
    )
    df_clean.to_csv("../PATH_TO_DATA/train_clean.csv", index=False)

    print(
        "Saving anonymized clean dataset to '../PATH_TO_DATA/train_clean_anon.csv'"
    )
    df_clean_anon.to_csv("../PATH_TO_DATA/train_clean_anon.csv", index=False)

    # Print statistics
    print("\nDataset Statistics:")
    print(f"Original dataset size: {len(df)}")
    print(f"Number of relabeled rows: {len(suggestions_df)}")
    print(f"Number of rows removed in clean version: {len(rows_to_remove)}")
    print(f"Clean dataset size: {len(df_clean)}")

    # Show label distribution for each version
    print("\nLabel Distribution:")
    print("\nOriginal:")
    print(df["sensitive_label"].value_counts(normalize=True))
    print("\nRelabeled:")
    print(df_relabeled["sensitive_label"].value_counts(normalize=True))
    print("\nClean:")
    print(df_clean["sensitive_label"].value_counts(normalize=True))

    # Show example of anonymization
    print("\nAnonymization Example (first 3 rows):")
    comparison = pd.DataFrame(
        {
            "Original": df_relabeled["text"].head(3),
            "Anonymized": df_relabeled_anon["text"].head(3),
        }
    )
    # The comparison table was previously built but never shown
    try:
        display(comparison)
    except NameError:
        # display() only exists inside IPython/Jupyter
        print("We are in a docker")

In [None]:
# Read the validation split
# NOTE(review): "../PATH_TO_DATA" looks like a placeholder — confirm the real file path.
val_df = pd.read_csv("../PATH_TO_DATA")

# Copy of the validation data whose "text" column has been passed through anonymize_text
val_df_anon = val_df.assign(text=val_df["text"].apply(anonymize_text))

# Write the anonymized copy to disk
print(
    f"Saving anonymized validation dataset ({len(val_df_anon)} rows) to '../PATH_TO_DATA/validation_anon.csv'"
)
val_df_anon.to_csv("../PATH_TO_DATA/validation_anon.csv", index=False)

# Size of the validation split
print("\nValidation Dataset Statistics:", f"Dataset size: {len(val_df)}", sep="\n")

# Relative frequency of each label
print("\nLabel Distribution:", val_df["sensitive_label"].value_counts(normalize=True), sep="\n")

# Show example of anonymization: first three texts before and after anonymize_text
print("\nAnonymization Example (first 3 rows):")
comparison = pd.DataFrame(
    {"Original": val_df["text"].head(3), "Anonymized": val_df_anon["text"].head(3)}
)
try:
    display(comparison)
except NameError:
    # display() only exists inside IPython/Jupyter. Other exceptions are real
    # errors and are no longer masked by the "docker" message.
    print("We are in a docker")

# Label distribution in the training data after relabeling

In [None]:
# Rows per label in the relabeled (anonymized) training data.
label_counts = df_relabeled_anon["sensitive_label"].value_counts()

# One bar per label value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.barplot(
    x=label_counts.index,
    y=label_counts.values,
    palette="Set2",
    ax=ax,
)
ax.set_title("Distribution of sensitive_label in relabeled dataset")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
plt.show()

# Same numbers as text, below the figure.
print("Label Counts:", label_counts, sep="\n")

In [None]:
# Repeat the per-label analysis with the relabeled targets. df itself still
# carries the original labels, so passing it unchanged would only reproduce
# the earlier output; df_relabeled shares df's index, so the corrected labels
# can be swapped in while keeping the cleaned_text column.
analyze_labels(
    df.assign(sensitive_label=df_relabeled["sensitive_label"]),
    "cleaned_text",
    "sensitive_label",
)

# Conclusions





1. Dataset Overview and Label Distribution
* The dataset consists of 680 entries with two columns: text (containing various log messages) and sensitive_label (indicating the presence of sensitive information).
* The labels are somewhat imbalanced, with 440 (64.7%) non-sensitive and 240 (35.3%) sensitive records.
* The dataset was noisy, meaning that some labels may not be entirely accurate.

2. Text Length and Characteristics
* The average text length is 101.68 characters, with a standard deviation of 92.3.
* Messages labeled as sensitive (sensitive_label=1) tend to be longer (15.95 words on average) compared to non-sensitive ones (10.83 words).
* Sensitive texts often contain specific terms related to security incidents, credentials, and personally identifiable information.

3. Common Words and Patterns
* Most common words include User, access, log, policy, SSN, and password.
* Top bigrams include "requested access", "password reset", and "account credentials", suggesting a focus on access control.
* Top trigrams like "Service account credentials", "accidentally shared SSN", and "public Slack channel" indicate frequent sensitive data exposures.

4. Labeling Inconsistencies
* Multiple text entries with nearly identical structure have different labels (e.g., “User X requested access” is sometimes labeled as sensitive and other times not).
* A similarity analysis using TF-IDF and cosine similarity identified 56 groups of inconsistently labeled texts.
* Approximately 96 texts were suggested for relabeling.

5. Anonymization Function
* The anonymize_text function was applied to the dataset to remove personally identifiable information.
* It replaces:
    * Names with [Name]
    * Emails and domains with [DOMAIN]
    * Dates with [DATE]
    * Social security numbers with [SSN]
    * Passwords and credentials with [PASSWORD]
    * Numeric sequences (e.g., credit card numbers) with [NUMBER]
* The function is useful for preserving privacy while allowing text pattern analysis.
* However, it may not always correctly identify names if they appear at the beginning of a sentence or in uncommon formats.
