<a href="https://colab.research.google.com/github/NaviW-D/Machine-Learning/blob/main/Machine-Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
from collections import Counter

def character_histogram(text):
    """
    Plot a bar chart of how often each character occurs in ``text``.

    Counting is case-insensitive and ignores every whitespace character
    (spaces, tabs and newlines), so multi-line input such as a triple-quoted
    string does not produce a bar for its line breaks.

    Args:
        text (str): The input text.

    Returns:
        None. The chart is rendered with ``plt.show()``.
    """
    # str.split() with no argument splits on any run of whitespace, so
    # re-joining the pieces drops spaces, tabs and newlines alike
    # (a plain replace(" ", "") would leave "\n" and "\t" in the counts).
    cleaned = "".join(text.split()).lower()

    # Count the frequency of each remaining character
    char_counts = Counter(cleaned)

    # Sort characters by code point so the bar order is consistent between runs
    sorted_chars = sorted(char_counts)
    frequencies = [char_counts[char] for char in sorted_chars]

    # Plot the histogram
    plt.figure(figsize=(12, 6))
    plt.bar(sorted_chars, frequencies)
    plt.xlabel("Characters")
    plt.ylabel("Frequency")
    plt.title("Character Histogram")
    plt.show()

# Example usage: plot the character histogram of a short sample sentence.
# Note that the triple-quoted literal begins with a line break, which is part
# of the string handed to character_histogram.
sample_text = """
 پوری یدونه ر داره ولی 3 تا رم  """

character_histogram(sample_text)

In [None]:
import os
import unicodedata
import urllib.request
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt

# interactive widgets & display
# ipywidgets and the IPython display helpers are imported in one try-block:
# if `import ipywidgets` fails, `display` and `clear_output` are never bound.
# WIDGETS_OK records the outcome and selects the UI path at the end of the file.
try:
    import ipywidgets as widgets
    from IPython.display import display, clear_output
    WIDGETS_OK = True
except Exception:
    WIDGETS_OK = False

# Colab download helper (optional)
# When the google.colab import fails, colab_files is set to None and IN_COLAB
# to False, so callers can test both before offering a download.
try:
    from google.colab import files as colab_files
    IN_COLAB = True
except Exception:
    colab_files = None
    IN_COLAB = False

# ----------------------------
# Persian alphabet (order)
# ----------------------------
# The 32 letters in alphabetical order. persian_alphabet_df iterates this list
# to emit exactly one row per letter, in this order.
PERSIAN_ALPHA = ["ا","ب","پ","ت","ث","ج","چ","ح","خ","د","ذ","ر","ز","ژ",
                 "س","ش","ص","ض","ط","ظ","ع","غ","ف","ق","ک","گ","ل",
                 "م","ن","و","ه","ی"]

def ensure_persian_font():
    """
    Best-effort: make a Persian-capable font the matplotlib default.

    Downloads the font file if it is not already on disk, registers it with
    matplotlib's font manager, and points ``plt.rcParams['font.family']`` at
    the family name stored inside the file. Works in Colab if internet is
    available.

    Any failure (no network, no write permission on the font directory, an
    unreadable font file) is reported as a ``RuntimeWarning`` and matplotlib's
    defaults stay in effect; the function never raises.

    Returns:
        None.
    """
    try:
        import matplotlib.font_manager as fm

        font_dir = '/usr/share/fonts/truetype/Bnazanin'
        font_path = os.path.join(font_dir, 'Bnazanin-Regular.ttf')
        if not os.path.exists(font_path):
            os.makedirs(font_dir, exist_ok=True)
            url = 'https://github.com/rastikerdar/Bnazanin-font/raw/master/dist/Vazir-Regular.ttf'
            # Download to a side file and rename only on success, so an
            # interrupted transfer cannot leave a truncated font that later
            # calls would mistake for a complete one.
            partial_path = font_path + '.part'
            try:
                urllib.request.urlretrieve(url, partial_path)
                os.replace(partial_path, font_path)
            finally:
                if os.path.exists(partial_path):
                    os.remove(partial_path)

        # Register the file even when it was already on disk (e.g. downloaded
        # by an earlier session); skip it if the manager already lists it so
        # repeated calls do not pile up duplicate entries.
        if all(entry.fname != font_path for entry in fm.fontManager.ttflist):
            fm.fontManager.addfont(font_path)

        # Use the family name embedded in the file rather than a hard-coded
        # one: the download URL names "Vazir-Regular.ttf", so the family may
        # not be "Bnazanin" even though the file is saved under that name.
        plt.rcParams['font.family'] = fm.FontProperties(fname=font_path).get_name()
    except Exception as exc:
        # Deliberately non-fatal: continue with matplotlib defaults, but say
        # why so missing Persian glyphs in the plots can be diagnosed.
        import warnings
        warnings.warn(
            f"Persian font could not be set up; using matplotlib defaults ({exc})",
            RuntimeWarning,
        )

# ----------------------------
# Normalization functions
# ----------------------------
def remove_diacritics(s: str) -> str:
    """Return ``s`` in NFD form with every nonspacing mark (category ``Mn``) dropped."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for char in decomposed:
        # 'Mn' = Mark, nonspacing: the combining diacritics to strip
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(char)
    return ''.join(kept)

def normalize_persian(s: str) -> str:
    """
    Bring Persian text into a canonical form for letter counting.

    Steps, in order: NFC-normalize, drop tatweel (U+0640), strip diacritics
    via ``remove_diacritics``, rewrite Arabic letter variants to their Persian
    counterparts, and lowercase.

    Returns an empty string for empty or ``None`` input.
    """
    if not s:
        return ""

    # (Arabic form, Persian form) pairs, applied one after another in this order
    arabic_to_persian = (
        ('ي', 'ی'),
        ('ك', 'ک'),
        ('ؤ', 'و'),
        ('ئ', 'ی'),
        ('أ', 'ا'),
        ('إ', 'ا'),
        ('آ', 'ا'),
        ('ٱ', 'ا'),
        ('ة', 'ه'),
    )

    text = unicodedata.normalize('NFC', s).replace('\u0640', '')  # tatweel 'ـ'
    text = remove_diacritics(text)
    for source, target in arabic_to_persian:
        text = text.replace(source, target)
    return text.lower()

# ----------------------------
# Counting & DataFrame helpers
# ----------------------------
def letters_from_text(s: str) -> list:
    """Normalize ``s`` and return its alphabetic characters as a list, in order."""
    normalized = normalize_persian(s)
    return list(filter(str.isalpha, normalized))

def counter_to_df(counter: Counter) -> pd.DataFrame:
    """
    Convert a letter Counter into a frequency table.

    Args:
        counter: Mapping of letter -> occurrence count.

    Returns:
        DataFrame with columns ``letter``, ``count`` and ``percent``, sorted
        by descending count. Letters with equal counts keep the order in
        which they were first inserted into ``counter``. ``percent`` is the
        share of the total count, rounded to two decimals (0.0 when the total
        is not positive). An empty counter yields an empty frame that still
        carries all three columns.
    """
    df = pd.DataFrame(list(counter.items()), columns=['letter', 'count'])
    if df.empty:
        # Keep the schema identical to the non-empty case so callers can
        # reference 'percent' without checking for emptiness first.
        df['percent'] = pd.Series(dtype='float64')
        return df

    # A stable sort makes the order of tied letters deterministic.
    df = df.sort_values('count', ascending=False, kind='stable').reset_index(drop=True)

    total = df['count'].sum()
    if total > 0:
        df['percent'] = (df['count'] / total * 100).round(2)
    else:
        # Avoid NaN from 0/0 when every stored count is zero.
        df['percent'] = 0.0
    return df

def persian_alphabet_df(counter: Counter) -> pd.DataFrame:
    """
    Build a per-letter table covering every entry of PERSIAN_ALPHA, in order.

    Letters missing from ``counter`` get a count of 0. ``percent`` is the
    letter's share of all counts in ``counter`` (rounded to two decimals),
    or 0.0 when the total is not positive.
    """
    grand_total = sum(counter.values())

    def share(occurrences):
        # Guard against dividing by zero when nothing was counted
        if grand_total > 0:
            return round(occurrences / grand_total * 100, 2)
        return 0.0

    tallies = [(letter, counter.get(letter, 0)) for letter in PERSIAN_ALPHA]
    return pd.DataFrame(
        [(letter, occurrences, share(occurrences)) for letter, occurrences in tallies],
        columns=['letter', 'count', 'percent'],
    )

# ----------------------------
# Plotting
# ----------------------------
def plot_top_freq(df_all: pd.DataFrame, top_n: int = 30, fname: str = "letters_top_freq.png"):
    """
    Draw, save and show a bar chart of the first ``top_n`` rows of ``df_all``.

    Args:
        df_all: Table with ``letter`` and ``count`` columns; rows are plotted
            in the order given.
        top_n: Number of leading rows to plot; a falsy value plots every row.
        fname: Path the figure is written to (150 dpi).

    Prints a notice and returns without plotting when ``df_all`` is empty.
    """
    if df_all.empty:
        print("⚠️ هیچ حرفی برای شمارش وجود ندارد.")
        return

    shown = df_all.head(top_n) if top_n else df_all
    counts = shown['count']
    # Labels sit 1% of the tallest bar above each bar top
    tallest = 0 if counts.empty else max(counts)
    label_gap = tallest * 0.01

    plt.figure(figsize=(12, 5))
    rects = plt.bar(shown['letter'], counts, color='#2b8cbe')
    plt.title(f"Top {len(shown)} letters by frequency", fontsize=14)
    plt.xlabel("Letter")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha='right')

    # Write each count above its bar
    for rect, value in zip(rects, counts):
        x_mid = rect.get_x() + rect.get_width() / 2
        plt.text(x_mid, value + label_gap, str(int(value)),
                 ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.savefig(fname, dpi=150)
    plt.show()
    print(f"✅ Saved: {fname}")

def plot_persian_order(df_pers: pd.DataFrame, fname: str = "letters_persian_order.png"):
    """
    Draw, save and show a bar chart of ``df_pers`` in its existing row order.

    Args:
        df_pers: Table with ``letter`` and ``count`` columns.
        fname: Path the figure is written to (150 dpi).

    Only bars with a positive count receive a numeric label.
    """
    counts = df_pers['count']
    # Labels sit 1% of the tallest bar above each bar top
    tallest = 0 if counts.empty else max(counts)
    label_gap = tallest * 0.01

    plt.figure(figsize=(14, 4))
    rects = plt.bar(df_pers['letter'], counts, color='#7fc97f')
    plt.title("Persian alphabet order (counts per letter)", fontsize=14)
    plt.xlabel("حروف فارسی (به ترتیب الفبا)", fontsize=12)
    plt.ylabel("تعداد", fontsize=12)
    plt.xticks(rotation=0)

    for rect, value in zip(rects, counts):
        if not value > 0:
            continue  # leave zero-height bars unlabelled
        x_mid = rect.get_x() + rect.get_width() / 2
        plt.text(x_mid, value + label_gap, str(int(value)),
                 ha='center', va='bottom', fontsize=9)

    plt.tight_layout()
    plt.savefig(fname, dpi=150)
    plt.show()
    print(f"✅ Saved: {fname}")

# ----------------------------
# Main processing & UI
# ----------------------------
def process_text_and_show(text: str, top_n: int = 30, save_csv: bool = True):
    """
    Normalize, count, show tables and plots.

    Args:
        text: Raw input text; ``None`` or empty is treated as "".
        top_n: Number of most frequent letters shown in the frequency plot.
        save_csv: When true, write both tables as UTF-8 (with BOM) CSV files
            to the working directory and, inside Colab, offer them for
            download.

    Returns:
        None. Prints a notice and returns early when no letters remain after
        normalization.
    """
    # The module-level `display` is imported in the same try-block as
    # ipywidgets, so it is unbound exactly when this function is reached
    # through the no-widgets fallback. Resolve a display function locally and
    # fall back to print so the tables are shown either way.
    try:
        from IPython.display import display as show_table
    except ImportError:
        show_table = print

    ensure_persian_font()
    text = text or ""
    letters = letters_from_text(text)
    counter = Counter(letters)
    if not counter:
        print("⚠️ پس از نرمال‌سازی هیچ حرفی برای شمارش باقی نماند.")
        return

    df_all = counter_to_df(counter)
    df_pers = persian_alphabet_df(counter)

    # display tables
    print("🔹 جدول حروف (ترتیب: فراوانی نزولی):")
    show_table(df_all.head(40))

    print("\n🔹 جدول حروف فارسی (ترتیب: الفبا فارسی):")
    show_table(df_pers)

    # plots
    plot_top_freq(df_all, top_n=top_n)
    plot_persian_order(df_pers)

    # save CSVs
    if save_csv:
        csv_all = "letter_counts_all.csv"
        csv_pers = "letter_counts_persian_order.csv"
        # utf-8-sig writes a BOM at the start of each file
        df_all.to_csv(csv_all, index=False, encoding='utf-8-sig')
        df_pers.to_csv(csv_pers, index=False, encoding='utf-8-sig')
        print(f"\n✅ CSVs saved: '{csv_all}', '{csv_pers}'")
        # offer download in Colab
        if IN_COLAB and colab_files is not None:
            try:
                colab_files.download(csv_all)
                colab_files.download(csv_pers)
            except Exception as exc:
                # Best-effort: the files are already on disk, so a failed
                # browser download is reported but not fatal.
                print(f"Automatic download failed: {exc}")

# ----------------------------
# If widgets available: show UI
# ----------------------------
# Build the interactive form when ipywidgets imported successfully; otherwise
# fall back to a single-line prompt on standard input.
if WIDGETS_OK:
    # Multi-line input box for the text to analyse
    textarea = widgets.Textarea(
        value='',
        placeholder='اینجا متن فارسی یا انگلیسی را پیست یا تایپ کنید — سپس دکمهٔ "محاسبه" را بزنید.',
        description='متن:',
        layout=widgets.Layout(width='100%', height='220px'),
        style={'description_width': '60px'}
    )
    # Number of top letters to plot, constrained to 1..500
    top_n_widget = widgets.BoundedIntText(value=30, min=1, max=500, step=1, description='Top N:', style={'description_width': '50px'})
    btn = widgets.Button(description='محاسبه', button_style='success', tooltip='روی این دکمه کلیک کن تا هیستوگرام تولید شود')
    # Everything printed or displayed by the handler is captured in this area
    output = widgets.Output()

    def on_click(b):
        """Button handler: clear the output area and analyse the current text."""
        with output:
            clear_output()
            txt = textarea.value
            # Whitespace-only input counts as empty
            if not txt.strip():
                print("⚠️ متنی وارد نشده — لطفاً متن را پیست یا تایپ کنید.")
                return
            process_text_and_show(txt, top_n=int(top_n_widget.value), save_csv=True)

    btn.on_click(on_click)
    # Layout: text box on top, then the Top-N field and button side by side, then the output
    display(widgets.VBox([textarea, widgets.HBox([top_n_widget, btn]), output]))
else:
    # fallback: simple input (one-line) — not ideal for multi-line
    # The prompt runs as soon as this code executes; CSV export is turned off here.
    print("توجه: ipywidgets در این محیط در دسترس نیست. لطفاً متن را در یک خط وارد کنید (پاراگراف‌های چندخطی پشتیبانی نمی‌شوند):")
    txt = input("متن: ")
    process_text_and_show(txt, top_n=30, save_csv=False)

# ==========================
# End of script
# ==========================
