<a href="https://colab.research.google.com/github/GhazalehKeyvani/Data-Science-Exercises/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Minimum Edit Distance (Levenshtein)**

In [1]:
def levenshtein(s, t):
    """Return the Levenshtein (minimum edit) distance between ``s`` and ``t``.

    Each insertion, deletion and substitution costs 1; aligning two equal
    symbols costs 0.  The full dynamic-programming table is built, where
    ``table[r][c]`` holds the distance between ``s[:r]`` and ``t[:c]``.
    """
    rows, cols = len(s) + 1, len(t) + 1
    table = [[0] * cols for _ in range(rows)]

    # Borders: turning a prefix into the empty string (or back) costs its length.
    for r in range(rows):
        table[r][0] = r
    for c in range(cols):
        table[0][c] = c

    for r in range(1, rows):
        above, current = table[r - 1], table[r]
        for c in range(1, cols):
            step = 0 if s[r - 1] == t[c - 1] else 1
            via_deletion = above[c] + 1
            via_insertion = current[c - 1] + 1
            via_diagonal = above[c - 1] + step  # substitution or free match
            current[c] = min(via_deletion, via_insertion, via_diagonal)

    return table[rows - 1][cols - 1]


In [2]:
# Sanity checks on three worked examples.  Only a cell's final expression is
# displayed, so the comparisons are folded into a single result instead of
# silently discarding the first two.
all([
    levenshtein("leda", "deal") == 3,
    levenshtein("drive", "brief") == 3,
    levenshtein("drive", "divers") == 3,
])

True

In [3]:
def levenshtein_with_alignment(s, t):
    """Compute the Levenshtein distance of ``s`` and ``t`` plus one optimal alignment.

    Returns a 4-tuple ``(distance, aligned_s, ops, aligned_t)`` of equal-length
    strings (after the distance), read column by column:

    * ``aligned_s`` / ``aligned_t`` -- the inputs with ``'-'`` marking a gap;
    * ``ops`` -- ``' '`` match, ``'*'`` substitution, ``'D'`` deletion from
      ``s``, ``'I'`` insertion from ``t``.

    When several moves are equally cheap the first of deletion, insertion,
    diagonal is kept, which determines which optimal alignment is reported.
    """
    UP, LEFT, DIAG = '↑', '←', '↖'
    rows, cols = len(s) + 1, len(t) + 1
    cost_table = [[0] * cols for _ in range(rows)]
    back = [[None] * cols for _ in range(rows)]  # back[0][0] stays None

    # Borders: only deletions down the first column, only insertions along the first row.
    for r in range(1, rows):
        cost_table[r][0], back[r][0] = r, UP
    for c in range(1, cols):
        cost_table[0][c], back[0][c] = c, LEFT

    for r in range(1, rows):
        for c in range(1, cols):
            step = 0 if s[r - 1] == t[c - 1] else 1
            # Strict '<' keeps the earlier candidate on ties (deletion, then insertion).
            best, arrow = cost_table[r - 1][c] + 1, UP
            via_insertion = cost_table[r][c - 1] + 1
            if via_insertion < best:
                best, arrow = via_insertion, LEFT
            via_diagonal = cost_table[r - 1][c - 1] + step
            if via_diagonal < best:
                best, arrow = via_diagonal, DIAG
            cost_table[r][c], back[r][c] = best, arrow

    # Follow the back-pointers from the bottom-right corner to the origin,
    # collecting alignment columns in reverse order.
    columns = []
    r, c = rows - 1, cols - 1
    while r > 0 or c > 0:
        arrow = back[r][c]
        if arrow == DIAG:
            left_sym, right_sym = s[r - 1], t[c - 1]
            mark = ' ' if left_sym == right_sym else '*'
            columns.append((left_sym, mark, right_sym))
            r -= 1
            c -= 1
        elif arrow == UP:
            columns.append((s[r - 1], 'D', '-'))
            r -= 1
        elif arrow == LEFT:
            columns.append(('-', 'I', t[c - 1]))
            c -= 1

    columns.reverse()
    aligned_s = ''.join(col[0] for col in columns)
    op_row = ''.join(col[1] for col in columns)
    aligned_t = ''.join(col[2] for col in columns)
    return cost_table[rows - 1][cols - 1], aligned_s, op_row, aligned_t

# Example: align "leda" with "deal" and print the distance followed by the
# three alignment rows (source row, operation row, target row).
dist, s_aln, op, t_aln = levenshtein_with_alignment("leda","deal")
print(dist)     # 3
print(s_aln)    # leda-
print(op)       # * D I   ('*' substitution, 'D' deletion, 'I' insertion, ' ' match)
print(t_aln)    # de-al


3
leda-
* D I
de-al


In [4]:
from collections import defaultdict

def count_ngrams(corpus):
    """Count unigrams and bigrams over a corpus of tokenized sentences.

    Args:
        corpus: iterable of sentences, each an iterable of tokens (list,
            tuple, generator, ...).  Every sentence is padded with ``'<s>'``
            and ``'</s>'`` before counting, so the boundary markers appear in
            both tables.

    Returns:
        ``(unigram_counts, bigram_counts)`` -- two ``defaultdict(int)``
        objects keyed by token and by ``(token, next_token)`` respectively,
        in first-seen order.

    Raises:
        TypeError: if a sentence is a bare string; it must be tokenized first
            rather than being silently split into characters.
    """
    unigram_counts = defaultdict(int)
    bigram_counts = defaultdict(int)

    for sentence in corpus:
        if isinstance(sentence, str):
            raise TypeError("each sentence must be an iterable of tokens, not a str")
        # Unpacking (instead of list concatenation) accepts any iterable of tokens.
        tokens = ['<s>', *sentence, '</s>']
        for token in tokens:
            unigram_counts[token] += 1
        # Pair every token with its successor; the last token has none.
        for bigram in zip(tokens, tokens[1:]):
            bigram_counts[bigram] += 1

    return unigram_counts, bigram_counts

# Toy corpus: each sentence is already split into a list of tokens.
corpus = [
    ['I', 'am', 'Sam'],
    ['Sam', 'I', 'am'],
    ['I', 'am', 'Sam'],
    ['I', 'do', 'not', 'like', 'green', 'eggs', 'and', 'Sam'],
]

unigrams, bigrams = count_ngrams(corpus)

# Print both tables with one loop; the bigram heading carries the blank
# separator line between the two listings.
for heading, table in (("Unigram Counts:", unigrams), ("\nBigram Counts:", bigrams)):
    print(heading)
    for key, count in table.items():
        print(f"{key}: {count}")


Unigram Counts:
<s>: 4
I: 4
am: 3
Sam: 4
</s>: 4
do: 1
not: 1
like: 1
green: 1
eggs: 1
and: 1

Bigram Counts:
('<s>', 'I'): 3
('I', 'am'): 3
('am', 'Sam'): 2
('Sam', '</s>'): 3
('<s>', 'Sam'): 1
('Sam', 'I'): 1
('am', '</s>'): 1
('I', 'do'): 1
('do', 'not'): 1
('not', 'like'): 1
('like', 'green'): 1
('green', 'eggs'): 1
('eggs', 'and'): 1
('and', 'Sam'): 1


In [5]:
# Three short e-mail style sentences, pre-tokenized (punctuation kept as tokens).
corpus_email = [
    ["Hi", "John", ",", "I", "hope", "you're", "doing", "well", "."],
    ["Please", "find", "attached", "the", "report", "for", "last", "week", "."],
    ["Let", "me", "know", "if", "you", "have", "any", "questions", "."],
]

unigrams, bigrams = count_ngrams(corpus_email)

# Print both tables with one loop; the bigram heading carries the blank
# separator line between the two listings.
for heading, table in (("Unigram Counts:", unigrams), ("\nBigram Counts:", bigrams)):
    print(heading)
    for key, count in table.items():
        print(f"{key}: {count}")

Unigram Counts:
<s>: 3
Hi: 1
John: 1
,: 1
I: 1
hope: 1
you're: 1
doing: 1
well: 1
.: 3
</s>: 3
Please: 1
find: 1
attached: 1
the: 1
report: 1
for: 1
last: 1
week: 1
Let: 1
me: 1
know: 1
if: 1
you: 1
have: 1
any: 1
questions: 1

Bigram Counts:
('<s>', 'Hi'): 1
('Hi', 'John'): 1
('John', ','): 1
(',', 'I'): 1
('I', 'hope'): 1
('hope', "you're"): 1
("you're", 'doing'): 1
('doing', 'well'): 1
('well', '.'): 1
('.', '</s>'): 3
('<s>', 'Please'): 1
('Please', 'find'): 1
('find', 'attached'): 1
('attached', 'the'): 1
('the', 'report'): 1
('report', 'for'): 1
('for', 'last'): 1
('last', 'week'): 1
('week', '.'): 1
('<s>', 'Let'): 1
('Let', 'me'): 1
('me', 'know'): 1
('know', 'if'): 1
('if', 'you'): 1
('you', 'have'): 1
('have', 'any'): 1
('any', 'questions'): 1
('questions', '.'): 1


In [6]:
# Three short news style sentences, pre-tokenized (punctuation kept as tokens).
corpus_news = [
    ["The", "government", "announced", "new", "policies", "on", "climate", "change", "."],
    ["Experts", "say", "the", "impact", "will", "be", "significant", "."],
    ["Public", "reaction", "has", "been", "mixed", "so", "far", "."],
]

unigrams, bigrams = count_ngrams(corpus_news)

# Print both tables with one loop; the bigram heading carries the blank
# separator line between the two listings.
for heading, table in (("Unigram Counts:", unigrams), ("\nBigram Counts:", bigrams)):
    print(heading)
    for key, count in table.items():
        print(f"{key}: {count}")

Unigram Counts:
<s>: 3
The: 1
government: 1
announced: 1
new: 1
policies: 1
on: 1
climate: 1
change: 1
.: 3
</s>: 3
Experts: 1
say: 1
the: 1
impact: 1
will: 1
be: 1
significant: 1
Public: 1
reaction: 1
has: 1
been: 1
mixed: 1
so: 1
far: 1

Bigram Counts:
('<s>', 'The'): 1
('The', 'government'): 1
('government', 'announced'): 1
('announced', 'new'): 1
('new', 'policies'): 1
('policies', 'on'): 1
('on', 'climate'): 1
('climate', 'change'): 1
('change', '.'): 1
('.', '</s>'): 3
('<s>', 'Experts'): 1
('Experts', 'say'): 1
('say', 'the'): 1
('the', 'impact'): 1
('impact', 'will'): 1
('will', 'be'): 1
('be', 'significant'): 1
('significant', '.'): 1
('<s>', 'Public'): 1
('Public', 'reaction'): 1
('reaction', 'has'): 1
('has', 'been'): 1
('been', 'mixed'): 1
('mixed', 'so'): 1
('so', 'far'): 1
('far', '.'): 1


ایمیل‌ها پر از ضمایر شخصی مثل "I" و "you" هستن.

متن خبری بیشتر شامل اسم‌های رسمی و موضوعی مثل "government"، "policies"، "climate" هست.

بی‌گرام‌های ایمیل بیشتر حالت دستوری و تعاملی دارن: "Please find", "Let me", "you have".

بی‌گرام‌های خبری بیشتر ترکیب‌های موضوعی هستن: "climate change", "impact will", "Public reaction".

# **ELIZA Chatbot**

In [7]:
import re
import random

# ---------- reflection ----------
# First-person <-> second-person swaps applied to echoed text.
# Keys are upper-case only; lower-case words are left untouched.
REFLECTIONS = {
    "I": "YOU",
    "ME": "YOU",
    "MY": "YOUR",
    "MINE": "YOURS",
    "AM": "ARE",
    "I'M": "YOU ARE",
    "YOU": "I",
    "YOUR": "MY",
    "YOURS": "MINE",
    "ARE": "AM",
}

# Whole-word matcher for every key above ("I'M" listed before "I").
_reflex_pattern = re.compile(r"\b(I'M|I|ME|MY|MINE|YOU|YOUR|YOURS|AM|ARE)\b")

def reflect(text: str) -> str:
    """Swap first- and second-person words in ``text`` in a single pass.

    Each matched word is replaced once via ``REFLECTIONS``; because the
    substitution is done in one sweep, a replacement is never swapped back.
    """
    return _reflex_pattern.sub(
        lambda hit: REFLECTIONS.get(hit.group(1), hit.group(1)),
        text,
    )

# ---------- rule engine ----------
class Rule:
    """A regular-expression pattern paired with candidate response templates.

    Templates may contain numbered placeholders ``{1}``, ``{2}``, ... that are
    replaced by the corresponding capture groups of the pattern.

    Args:
        pattern: regex source; it is applied with ``match`` (anchored at the
            start of the text).
        responses: sequence of template strings; one is chosen at random for
            each successful match.
        reflect_groups: optional iterable of 1-based group numbers whose
            captured text is passed through ``reflect`` before insertion.
    """

    # Matches a numbered placeholder such as "{1}" inside a template.
    _PLACEHOLDER = re.compile(r"\{(\d+)\}")

    def __init__(self, pattern, responses, reflect_groups=None):
        self.pattern = re.compile(pattern)
        self.responses = responses
        self.reflect_groups = set(reflect_groups or [])

    def try_apply(self, text: str):
        """Return a filled-in response if ``text`` matches, else ``None``."""
        m = self.pattern.match(text)
        if not m:
            return None

        template = random.choice(self.responses)

        # Prepare every group's replacement text up front, keyed by the exact
        # placeholder number as written in the template.
        values = {}
        for i, captured in enumerate(m.groups(), start=1):
            captured = (captured or "").strip()  # unmatched optional group -> ""
            if i in self.reflect_groups:
                captured = reflect(captured)
            values[str(i)] = captured

        # Substitute in a single pass so that placeholder-looking text inside
        # a captured group (e.g. user input containing "{2}") is not expanded
        # again.  Placeholders without a matching group are left as written.
        return self._PLACEHOLDER.sub(
            lambda ph: values.get(ph.group(1), ph.group(0)),
            template,
        )

# ---------- rule sets ----------
# Rogerian-therapist rules.  They are tried top to bottom and the first
# matching pattern wins, so specific rules must come before general ones.
# Patterns are matched against the upper-cased but otherwise raw user text.
ROGERIAN_RULES = [
    # Sympathetic reply to an explicit statement of low mood.  The pattern is
    # written in the first person ("I AM" / "I'M") because the input is not
    # reflected before matching; it must precede the generic "I AM" rule.
    Rule(r".*\bI(?: AM|'M) (DEPRESSED|SAD)\b.*", [
        "I AM SORRY TO HEAR YOU ARE {1}",
        "WHY DO YOU THINK YOU ARE {1}"
    ]),
    Rule(r".*\bI AM (.*)", [
        "HOW LONG HAVE YOU BEEN {1}?",
        "WHY ARE YOU {1}?"
    ], reflect_groups={1}),
    Rule(r".*\bI FEEL (.*)", [
        "DO YOU OFTEN FEEL {1}?",
        "WHAT MAKES YOU FEEL {1}?"
    ], reflect_groups={1}),
    # "TO" is optional and excluded from the echoed text.
    Rule(r".*\bI WANT (?:TO )?(.*)", [
        "WHAT WOULD IT MEAN IF YOU GOT {1}?",
        "WHY DO YOU WANT {1}?"
    ], reflect_groups={1}),
    Rule(r".*\bMY (.*)", [
        "TELL ME MORE ABOUT YOUR {1}.",
        "HOW DOES YOUR {1} MAKE YOU FEEL?"
    ], reflect_groups={1}),
    # Generalisations: ask for specifics instead of echoing.
    Rule(r".*\bALWAYS\b.*", ["CAN YOU THINK OF A SPECIFIC EXAMPLE?"]),
    Rule(r".*\bALL\b.*", ["IN WHAT WAY?"]),
]

# Tech-support rules, tried top to bottom; the first matching pattern wins.
TECH_SUPPORT_RULES = [
    # Echo the numeric error code back to the user.
    Rule(r".*\bERROR (\d+)\b.*", [
        "I SEE ERROR {1}. WHEN DID THIS START?",
        "ERROR {1} OCCURRED. DID YOU CHANGE ANY SETTINGS RECENTLY?"
    ]),
    Rule(r".*\bDOES NOT WORK\b.*", [
        "WHAT EXACTLY DOES NOT WORK?",
        "DO YOU SEE ANY ERROR MESSAGE?"
    ]),
    # Capture only the base verb so the templates stay grammatical
    # ("DOES IT HANG?", not "DOES IT HANGS?"); common inflections such as
    # CRASHES / CRASHED / HANGS / HANGING are accepted but not echoed.
    Rule(r".*\b(CRASH|FREEZE|HANG)(?:E?S|ED|ING)?\b.*", [
        "HOW OFTEN DOES IT {1}?",
        "WHAT WERE YOU DOING WHEN IT STARTED TO {1}?"
    ]),
    # The suffix group is only used for matching; the templates have no placeholder.
    Rule(r".*\bINSTALL(ING|ATION)?\b.*", [
        "ARE YOU INSTALLING WITH ADMIN RIGHTS?",
        "WHICH VERSION ARE YOU TRYING TO INSTALL?"
    ]),
    Rule(r".*\bNETWORK\b.*", [
        "ARE YOU ON WIFI OR ETHERNET?",
        "CAN YOU RESTART YOUR ROUTER AND TRY AGAIN?"
    ]),
]

# Content-free prompts used when no rule of the active domain matches.
DEFAULTS = [
    "PLEASE GO ON.", "TELL ME MORE.",
    "CAN YOU ELABORATE?", "WHY DO YOU SAY THAT?",
]

# Rule set for each supported domain name.
DOMAINS = dict(
    rogerian=ROGERIAN_RULES,
    tech=TECH_SUPPORT_RULES,
)

def preprocess(user_text: str) -> str:
    """Normalise raw user input: strip outer whitespace, then upper-case.

    Punctuation and inner spacing are left exactly as typed.
    """
    trimmed = user_text.strip()
    return trimmed.upper()

def eliza_reply(user_text: str, domain="rogerian") -> str:
    """Produce ELIZA's reply to one line of user input.

    Empty or whitespace-only input yields the opening greeting.  Otherwise the
    text is normalised with ``preprocess`` and offered to the rules of
    ``domain`` in order (unknown domains fall back to the Rogerian rules); the
    first rule that yields a non-empty response wins.  If none does, a random
    entry of ``DEFAULTS`` is returned.
    """
    nothing_said = not user_text or user_text.isspace()
    if nothing_said:
        return "HELLO. HOW ARE YOU FEELING TODAY?"

    normalized = preprocess(user_text)
    rule_set = DOMAINS.get(domain, ROGERIAN_RULES)

    # No pronoun reflection is applied to the whole input here; only the
    # capture groups a rule names in its reflect_groups are reflected.
    # The generator is lazy, so rules after the first hit are never tried.
    candidates = (rule.try_apply(normalized) for rule in rule_set)
    first_hit = next((reply for reply in candidates if reply), None)
    if first_hit is not None:
        return first_hit
    return random.choice(DEFAULTS)

# ---- quick RE examples for backreferences (not used directly above) ----
# pattern_backref = re.compile(r"the (.*)er they were, the \1er they will be")
# pattern_noncapture = re.compile(r"(?:SOME|A FEW) (PEOPLE|CATS) LIKE SOME \1")


In [8]:
def main():
    """Run an interactive ELIZA session on standard input/output.

    The user first picks a domain (anything other than ``rogerian`` or
    ``tech`` falls back to ``rogerian``), then chats until typing ``quit``,
    ``exit`` or ``bye`` (case and surrounding whitespace ignored) or until
    input ends.
    """
    # Ask for the domain before greeting: when the greeting came first, users
    # tended to answer it at the domain prompt.
    domain = input("Choose domain (rogerian/tech): ").strip().lower()
    if domain not in ["rogerian", "tech"]:
        print("Using default domain: rogerian")
        domain = "rogerian"

    print("ELIZA: HELLO. HOW ARE YOU FEELING TODAY?")

    while True:
        try:
            user_input = input("YOU: ")
        except EOFError:
            # Input stream closed: finish the prompt line and end politely.
            print("\nELIZA: GOODBYE. TAKE CARE!")
            break
        # Strip so that "exit " or " bye" also end the session.
        if user_input.strip().lower() in ["quit", "exit", "bye"]:
            print("ELIZA: GOODBYE. TAKE CARE!")
            break
        response = eliza_reply(user_input, domain=domain)
        print(f"ELIZA: {response}")


In [9]:
# Start the interactive session; this cell blocks on input() until the user
# types quit, exit or bye.
main()

ELIZA: HELLO. HOW ARE YOU FEELING TODAY?
Choose domain (rogerian/tech): i like speek with you
Using default domain: rogerian
YOU: no
ELIZA: TELL ME MORE.
YOU: always i want speek with you
ELIZA: WHY DO YOU WANT SPEEK WITH I?
YOU: about my problems
ELIZA: HOW DOES YOUR PROBLEMS MAKE YOU FEEL?
YOU: nothing
ELIZA: WHY DO YOU SAY THAT?
YOU: exit
ELIZA: GOODBYE. TAKE CARE!
