# In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os
import cv2
import numpy as np


# Load CSV File with Labels
#
# Reads the label table and checks that the two columns used by the rest of
# the script ('filename' and 'label') are present before anything else runs.

csv_file = "pics_labels.csv"
df = pd.read_csv(csv_file)

missing_columns = [col for col in ('filename', 'label') if col not in df.columns]
if missing_columns:
    # Raise SystemExit directly instead of calling the interactive-only
    # `exit()` helper: the message goes to stderr and the process ends with a
    # non-zero status, so a schema failure is not reported as success.
    raise SystemExit("CSV file must contain 'filename' and 'label' columns.")

print("Dataset Summary:")
# DataFrame.info() writes the summary itself and returns None; wrapping it in
# print() would emit a stray "None" line.
df.info()
print("\nAll Data:")
print(df.to_string(index=False))


# Kudlit Detection Section

def detect_kudlits(image_path):
    """Count small marks (candidate kudlits) in the image at ``image_path``.

    The image is converted to grayscale, binarised with an inverted threshold
    at 150, and its external contours are extracted. Every contour whose
    bounding box covers fewer than 150 pixels is classified by the vertical
    position of the box centre:

    * above 30% of the image height -> one ``top_kudlit``
    * below 70% of the image height -> one ``bottom_kudlit``; if the box is
      also roughly square (width / height strictly between 0.8 and 1.2) it is
      additionally counted as a ``cross_under``

    Returns:
        dict with integer counts under the keys ``top_kudlit``,
        ``bottom_kudlit`` and ``cross_under``. All counts are 0 when
        ``cv2.imread`` returns ``None`` for the path.
    """
    counts = {"top_kudlit": 0, "bottom_kudlit": 0, "cross_under": 0}

    image = cv2.imread(image_path)
    if image is None:
        return counts

    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    image_height = image.shape[0]
    upper_limit = image_height * 0.3
    lower_limit = image_height * 0.7

    for contour in contours:
        _, box_top, box_w, box_h = cv2.boundingRect(contour)

        # Only small bounding boxes are treated as kudlit candidates.
        if box_w * box_h >= 150:
            continue

        mid_y = box_top + box_h // 2
        if mid_y < upper_limit:
            counts["top_kudlit"] += 1
        elif mid_y > lower_limit:
            counts["bottom_kudlit"] += 1
            # A near-square mark in the bottom band is also tallied as a cross.
            if 0.8 < box_w / float(box_h) < 1.2:
                counts["cross_under"] += 1

    return counts

# Process all images
#
# Runs kudlit detection on every file listed in the CSV (resolved relative to
# `image_folder`) and appends the three count columns to `df`.
image_folder = "Dataset"
kudlit_columns = ["top_kudlit", "bottom_kudlit", "cross_under"]
kudlit_data = [
    detect_kudlits(os.path.join(image_folder, filename))
    for filename in df['filename']
]

# Merge kudlit info
# The frame is built on df's own index with explicit columns so that
# (a) the column-wise concat lines rows up even if df does not carry a default
#     0..n-1 index, and
# (b) an empty dataset still yields the three kudlit columns that the later
#     sections index into.
kudlit_df = pd.DataFrame(kudlit_data, index=df.index, columns=kudlit_columns)
df = pd.concat([df, kudlit_df], axis=1)

print("\nData with kudlit detection:")
print(df.to_string(index=False))


# 2. Separate Word Frequency Distribution (Horizontal Bar Plot for Single and Double Letters)

# A label is treated as "double-lettered" when it contains an underscore;
# every other label is "single-lettered".
label_series = df['label']
double_letters = label_series.str.contains('_')
single_letters = ~double_letters


def _label_frequencies(mask):
    """Return a ('label', 'count') frame for the rows selected by ``mask``.

    Rows are sorted by ascending count.
    """
    freq = df[mask]['label'].value_counts().reset_index()
    # Assign the column names explicitly rather than relying on reset_index's defaults.
    freq.columns = ['label', 'count']
    return freq.sort_values(by='count', ascending=True)


double_letter_counts = _label_frequencies(double_letters)
single_letter_counts = _label_frequencies(single_letters)


def _plot_label_frequency(counts, title, ylabel):
    """Draw and show one horizontal bar chart of label frequencies.

    ``counts`` must provide 'label' and 'count' columns; ``title`` and
    ``ylabel`` are used verbatim for the figure title and the y-axis label.
    """
    plt.figure(figsize=(12, 8))
    sns.barplot(data=counts, y='label', x='count', palette='viridis')

    plt.title(title, fontsize=16)
    plt.xlabel("Count", fontsize=12)
    plt.ylabel(ylabel, fontsize=12)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


# 2.1. Plot for Double Letter Words
_plot_label_frequency(
    double_letter_counts,
    "Baybayin Double Letter Word Frequency",
    "Double Letter Word",
)

# 2.2. Plot for Single Letter Words
_plot_label_frequency(
    single_letter_counts,
    "Baybayin Single Letter Word Frequency",
    "Single Letter Word",
)


# 3. Baybayin Character Frequency (Grouped and Alphabetical)

# Character inventory in chart order: the 'a' row, the grouped 'e_i' row, the
# grouped 'o_u' row, then the bare consonants. Each vowel row is the vowel
# entry followed by every consonant onset combined with that vowel.
_baybayin_onsets = ['b', 'k', 'd', 'g', 'h', 'l', 'm', 'n', 'ng', 'p', 's', 't', 'w', 'y']
baybayin_letters = (
    ['a'] + [onset + 'a' for onset in _baybayin_onsets]
    + ['e_i'] + [onset + 'e' for onset in _baybayin_onsets]
    + ['o_u'] + [onset + 'o' for onset in _baybayin_onsets]
    # The bare-consonant row has no standalone 'h' entry.
    + [onset for onset in _baybayin_onsets if onset != 'h']
)

# Flatten and normalize: drop hyphens, split each label on '_', and produce one
# row per resulting piece (missing values removed).
# NOTE(review): hyphens are deleted here, which joins the pieces around them,
# whereas the kudlit summary section turns '-' into '_' (a separator) - confirm
# which treatment is intended.
# NOTE(review): a label spelled exactly 'e_i' or 'o_u' is split into two pieces
# here, so it would contribute two counts - verify against the label scheme.
hyphen_free_labels = df['label'].str.replace('-', '')
label_pieces = hyphen_free_labels.str.split('_')
labels = label_pieces.explode().dropna()
def normalize_baybayin_char(char):
    """Map a lone vowel onto its grouped chart key.

    'e' and 'i' become 'e_i'; 'o' and 'u' become 'o_u'. Any other value is
    returned unchanged.
    """
    vowel_groups = (
        ('e_i', ('e', 'i')),
        ('o_u', ('o', 'u')),
    )
    for group_key, members in vowel_groups:
        if char in members:
            return group_key
    return char

normalized = labels.apply(normalize_baybayin_char)
char_counts = normalized.value_counts().to_dict()

# Report every chart character in chart order, using 0 for characters that
# never occur. Pieces that are not in baybayin_letters are left out of both the
# chart and the total shown above it.
char_counts_complete = {}
for letter in baybayin_letters:
    char_counts_complete[letter] = char_counts.get(letter, 0)

char_df = pd.DataFrame({
    'character': list(char_counts_complete.keys()),
    'count': list(char_counts_complete.values()),
})
total_characters = char_df['count'].sum()

plt.figure(figsize=(16, 8))
sns.barplot(data=char_df, x='character', y='count', palette='crest')

plt.title("Baybayin Character Frequency (Grouped & Alphabetical)", fontsize=16)
plt.xlabel("Character", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# The overall total goes in a super-title placed slightly above the axes title.
plt.suptitle(f"Total Characters: {total_characters}", fontsize=12, y=1.02)

plt.tight_layout()
plt.show()

# 4. Correlation Heatmap (if image size is available)

# The heatmap is drawn only when the CSV supplies both size columns.
size_columns = ['image_width', 'image_height']
if not all(column in df.columns for column in size_columns):
    print("No image dimensions found in the CSV. Skipping the correlation heatmap.")
else:
    corr_matrix = df[size_columns].corr()
    plt.figure(figsize=(6, 4))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", linewidths=0.5)
    plt.title("Correlation Heatmap")
    plt.show()

# 5. Kudlit Type Summary (Strict Baybayin Rules - Enhanced)

# Valid sets
# Syllables ending in e/i are expected to carry a top kudlit, syllables ending
# in o/u a bottom kudlit, and bare consonants a cross mark underneath.
top_vowel_suffixes = ['e', 'i']
bottom_vowel_suffixes = ['o', 'u']
# 'p' is included so this list agrees with the bare-consonant row of
# baybayin_letters and with the "b–y" range named in the summary table.
cross_consonants = ['b', 'k', 'd', 'g', 'l', 'm', 'n', 'ng', 'p', 's', 't', 'w', 'y']
no_kudlit_valid = ['a', 'ba', 'ka', 'da', 'ga', 'ha', 'la', 'ma', 'na', 'nga', 'pa', 'sa', 'ta', 'wa', 'ya']

# Dynamically add compound labels that have no diacritics
# NOTE(review): the entries added here are whole labels (with '-' rewritten to
# '_'), while the counting loop tests underscore-split parts against this set,
# so only labels without '_' can ever match - confirm this is intended.
no_marks_detected = (
    (df['top_kudlit'] == 0) &
    (df['bottom_kudlit'] == 0) &
    (df['cross_under'] == 0)
)
compound_no_kudlit = df[no_marks_detected]['label'].str.replace('-', '_').unique().tolist()

# Combine hardcoded and dynamic no-kudlit-valid list
no_kudlit_valid = set(no_kudlit_valid) | set(compound_no_kudlit)

# Counters
valid_top_kudlit = 0
valid_bottom_kudlit = 0
valid_cross_kudlit = 0
vowel_single_letter_total = 0
no_kudlit_total = 0

unmatched_rows = []

# str.endswith accepts a tuple, so one call covers every suffix.
top_suffixes = tuple(top_vowel_suffixes)
bottom_suffixes = tuple(bottom_vowel_suffixes)

standalone_vowels = {'a', 'e', 'i', 'o', 'u'}
# The grouped vowel names contain '_' and therefore can never appear as a
# single part after the split below; they are matched against the raw label.
grouped_vowel_labels = {'o_u', 'e_i'}

# Each row is tested against every category independently: a row can add to
# several counters, but to each counter at most once. Rows that satisfy no
# category are collected in unmatched_rows.
for _, row in df.iterrows():
    label = row['label'].replace('-', '_')  # preserve compound syllables
    parts = label.split('_')

    matched = False

    # Check for valid top kudlit (e/i only)
    if row['top_kudlit'] > 0 and any(part.endswith(top_suffixes) for part in parts):
        valid_top_kudlit += 1
        matched = True

    # Check for valid bottom kudlit (o/u only)
    if row['bottom_kudlit'] > 0 and any(part.endswith(bottom_suffixes) for part in parts):
        valid_bottom_kudlit += 1
        matched = True

    # Check for valid cross under consonants
    if row['cross_under'] > 0 and any(part in cross_consonants for part in parts):
        valid_cross_kudlit += 1
        matched = True

    # Count single-letter vowel characters
    is_grouped_vowel = row['label'] in grouped_vowel_labels
    is_plain_vowel = len(parts) == 1 and parts[0] in standalone_vowels
    if is_grouped_vowel or is_plain_vowel:
        vowel_single_letter_total += 1
        matched = True

    # Check for no kudlit (valid base characters only)
    if row['top_kudlit'] == 0 and row['bottom_kudlit'] == 0 and row['cross_under'] == 0:
        if all(part in no_kudlit_valid for part in parts):
            no_kudlit_total += 1
            matched = True

    # Track unmatched rows
    if not matched:
        unmatched_rows.append(row)

# Summary table: one (category, count) row per counter, in display order.
summary_rows = [
    ("Top Kudlit (Valid e/i)", valid_top_kudlit),
    ("Bottom Kudlit (Valid o/u)", valid_bottom_kudlit),
    ("Cross Kudlit (b–y only)", valid_cross_kudlit),
    ("Vowel Characters (Single-letter)", vowel_single_letter_total),
    ("No Kudlit (a, ba, ka...)", no_kudlit_total),
]
kudlit_summary_filtered = pd.DataFrame(summary_rows, columns=["Type", "Count"])

print("\n✅ Kudlit Summary (Strict Filter with Enhancements):")
print(kudlit_summary_filtered)

# Unmatched analysis: rows that fell into none of the categories above.
unmatched_df = pd.DataFrame(unmatched_rows)
total_unmatched = len(unmatched_df)
print(f"\n⚠️ Total Unmatched Entries: {total_unmatched}")

if total_unmatched:
    # Most frequent labels among the unmatched rows.
    top_unmatched_labels = unmatched_df['label'].value_counts().head(20)
    print("\n🧩 Top 20 Unmatched Labels:")
    print(top_unmatched_labels)

    # Column-wise totals of the detected marks in the unmatched rows.
    unmatched_mark_totals = unmatched_df[['top_kudlit', 'bottom_kudlit', 'cross_under']].sum()
    print("\n🔧 Kudlit Summary in Unmatched Entries:")
    print(unmatched_mark_totals)

# Horizontal bar chart of the summary table, one bar per category.
plt.figure(figsize=(8, 6))
sns.barplot(data=kudlit_summary_filtered, x="Count", y="Type", palette="rocket")

# Write each bar's value just past its end; the row index is used as the
# bar's y position.
bar_positions = kudlit_summary_filtered.index
bar_values = kudlit_summary_filtered['Count']
for bar_position, bar_value in zip(bar_positions, bar_values):
    plt.text(bar_value + 0.5, bar_position, str(bar_value), va='center', fontsize=10)

plt.title("Filtered Kudlit and Vowel Counts (Strict Rules)", fontsize=16)
plt.xlabel("Total Count", fontsize=12)
plt.ylabel("Type", fontsize=12)
plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
