# Lesson L1 (Week 2) – Visualising Naïve Bayes Likelihoods 📈🔵🔴
Welcome to **Week 2, Lesson L1**!

### What’s new vs. the original Coursera lab
* We **generate the features on‑the‑fly** instead of loading `bayes_features.csv` so you don’t need any external files.
* Function and variable names are kept **compatible with the Coursera lecture** – so you can still follow along there.
* We **added** a toy walk‑through and an interactive Gradio explorer.

_If you’re looking at the Coursera notebook you’ll see a CSV – here we recreate the same two columns programmatically._

In [None]:
# 🍀 Setup
!pip -q install --upgrade nltk wordcloud gradio>=4.27.0 numpy>=1.26,<2.1 scikit-learn<1.7 websockets>=13,<15 --progress-bar off
import nltk, ssl, warnings; warnings.filterwarnings('ignore')
try:
    ssl._create_default_https_context = ssl._create_unverified_context
except AttributeError:
    pass
for res in ['stopwords','punkt','twitter_samples']:
    nltk.download(res, quiet=True)
print('✅ Environment ready')

## 1️⃣ Toy example – six tiny tweets

In [None]:
import numpy as np, re, matplotlib.pyplot as plt
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stemmer, stop_words = PorterStemmer(), set(stopwords.words('english'))

def clean(t):
    """Return the stemmed, stop-word-free tokens of tweet *t*.

    The text is lower-cased, every character other than a-z and whitespace
    is removed, and the result is split on whitespace. Tokens found in the
    module-level ``stop_words`` are dropped; the rest are stemmed with the
    module-level ``stemmer``.
    """
    letters_only = re.sub(r'[^a-z\s]', '', t.lower())
    stems = []
    for word in letters_only.split():
        if word in stop_words:
            continue
        stems.append(stemmer.stem(word))
    return stems

# Toy corpus: three positive tweets followed by three negative ones.
t_pos = ["love it", "so happy", "great day"]
t_neg = ["hate it", "so sad", "bad day"]
tweets = [*t_pos, *t_neg]
# Labels aligned with `tweets`: 1 = positive, 0 = negative.
ys = np.array([1, 1, 1, 0, 0, 0])

from collections import Counter
def word_likelihoods(pos, neg, alpha=1):
    """Compute additively smoothed per-word log-likelihoods for both classes.

    Args:
        pos: iterable of positive tweets (raw strings, tokenised with ``clean``).
        neg: iterable of negative tweets (raw strings, tokenised with ``clean``).
        alpha: additive smoothing constant added to every word count.

    Returns:
        ``(lp, ln)``: two dicts keyed by every word seen in either class,
        holding ``log((count + alpha) / (class_total + alpha * V))`` for the
        positive and the negative class respectively, where ``V`` is the
        vocabulary size.
    """
    def count_tokens(corpus):
        # Token frequencies over a whole list of tweets.
        counts = Counter()
        for tweet in corpus:
            counts.update(clean(tweet))
        return counts

    pos_counts = count_tokens(pos)
    neg_counts = count_tokens(neg)

    vocab = set(pos_counts) | set(neg_counts)
    V = len(vocab)
    total_pos = sum(pos_counts.values()) + alpha * V
    total_neg = sum(neg_counts.values()) + alpha * V

    lp, ln = {}, {}
    for word in vocab:
        # Counter returns 0 for a word absent from one class.
        lp[word] = np.log((pos_counts[word] + alpha) / total_pos)
        ln[word] = np.log((neg_counts[word] + alpha) / total_neg)
    return lp, ln

# Fit the toy model once; tweet_ll reads these two lookup tables.
lp, ln = word_likelihoods(t_pos, t_neg)
def tweet_ll(t):
    """Return ``(log P(t|pos), log P(t|neg))`` for tweet *t*.

    Each value is the sum of the per-token log-likelihoods taken from the
    module-level ``lp`` / ``ln`` tables; tokens missing from a table add 0.
    """
    lpos = 0
    lneg = 0
    for tok in clean(t):
        lpos += lp.get(tok, 0)
        lneg += ln.get(tok, 0)
    return lpos, lneg
# One row per toy tweet: column 0 = log P(t|pos), column 1 = log P(t|neg).
XY = np.array([tweet_ll(t) for t in tweets])
plt.figure(figsize=(4, 4))
# Pick each class through the label vector `ys` instead of a hard-coded split
# index, so the colours stay correct if the toy lists change length.
plt.scatter(XY[ys == 1, 0], XY[ys == 1, 1], c='green', label='pos')
plt.scatter(XY[ys == 0, 0], XY[ys == 0, 1], c='red', label='neg')
# Diagonal y = x: the two class log-likelihoods are equal along it.
plt.axline((0, 0), (1, 1), ls='--', c='grey')
plt.xlabel('log P(t|pos)')
plt.ylabel('log P(t|neg)')
plt.title('Toy likelihood space')
plt.legend()
plt.show()

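The dashed line indicates where **log P(t∣pos) = log P(t∣neg)**. Points below it have log P(t∣pos) > log P(t∣neg) and are therefore more likely positive; points above it are more likely negative.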

## 2️⃣ Helper functions for full corpus

In [None]:
import re, numpy as np
from nltk.corpus import twitter_samples
from collections import Counter
# Shared by process_tweet below: a Porter stemmer and the English stop-word set.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def process_tweet(t):
    """Return the stemmed, stop-word-free tokens of tweet *t*.

    Steps, in order: lower-case the text, delete http(s) URLs, delete every
    character other than a-z and whitespace, split on whitespace, drop
    tokens found in ``stop_words`` and stem the remainder.
    """
    text = t.lower()
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    kept = (word for word in text.split() if word not in stop_words)
    return [stemmer.stem(word) for word in kept]

def build_likelihoods(tweets, ys, alpha=1):
    """Compute additively smoothed per-word log-likelihoods from a labelled corpus.

    Args:
        tweets: iterable of raw tweet strings.
        ys: iterable of labels aligned with ``tweets``; 1 marks a positive
            tweet, 0 a negative one. Tweets with any other label are ignored.
        alpha: additive smoothing constant added to every word count.

    Returns:
        ``(lp, ln)``: two dicts keyed by every word seen in either class,
        holding ``log((count + alpha) / (class_total + alpha * V))`` for the
        positive and the negative class respectively, where ``V`` is the
        vocabulary size.
    """
    pos_c, neg_c = Counter(), Counter()
    # Single pass over the pairs: zipping twice would leave the negative
    # class empty whenever `tweets` or `ys` is a one-shot iterator.
    for tw, y in zip(tweets, ys):
        if y == 1:
            pos_c.update(process_tweet(tw))
        elif y == 0:
            neg_c.update(process_tweet(tw))

    vocab = set(pos_c) | set(neg_c)
    V = len(vocab)
    tot_p = sum(pos_c.values()) + alpha * V
    tot_n = sum(neg_c.values()) + alpha * V

    # Counter returns 0 for a word absent from one class, so every
    # vocabulary word gets a finite log-likelihood when alpha > 0.
    lp = {w: np.log((pos_c[w] + alpha) / tot_p) for w in vocab}
    ln = {w: np.log((neg_c[w] + alpha) / tot_n) for w in vocab}
    return lp, ln

## 3️⃣ Real corpus likelihood scatter & confidence ellipses

In [None]:
# Load the two halves of the NLTK twitter_samples corpus.
tweets_pos = twitter_samples.strings('positive_tweets.json')
tweets_neg = twitter_samples.strings('negative_tweets.json')
# Positives first, then negatives; labels follow the same order (1 = pos, 0 = neg).
tweets = tweets_pos + tweets_neg
ys = np.array([1] * len(tweets_pos) + [0] * len(tweets_neg))
# Fit the per-word log-likelihood tables on the whole corpus.
lp, ln = build_likelihoods(tweets, ys)
def tweet_ll_real(t):
    """Return ``(log P(t|pos), log P(t|neg))`` for tweet *t*.

    Tokens come from ``process_tweet``; each value is the sum of the
    per-token entries in the module-level ``lp`` / ``ln`` tables, with
    tokens missing from a table adding 0.
    """
    toks = process_tweet(t)
    log_pos = sum(lp.get(w, 0) for w in toks)
    log_neg = sum(ln.get(w, 0) for w in toks)
    return log_pos, log_neg
# Likelihood coordinates for every tweet: column 0 = log P(t|pos), column 1 = log P(t|neg).
XY_full = np.array([tweet_ll_real(t) for t in tweets])
# Fixed seed so the same 2000 tweets are drawn (without replacement) on every run.
np.random.seed(0)
idx = np.random.choice(len(tweets), 2000, replace=False)
XY = XY_full[idx]
y_s = ys[idx]
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
fig, ax = plt.subplots(figsize=(6, 6))
# One semi-transparent point cloud per class.
ax.scatter(XY[y_s == 1, 0], XY[y_s == 1, 1], s=10, c='limegreen', label='pos', alpha=0.5)
ax.scatter(XY[y_s == 0, 0], XY[y_s == 0, 1], s=10, c='crimson', label='neg', alpha=0.5)
# Diagonal y = x: the two class log-likelihoods are equal along it.
ax.axline((0, 0), (1, 1), ls='--', c='grey')
def ellipse(data,color):
    """Add the 95% covariance ellipse of *data* to the module-level ``ax``.

    Args:
        data: array with one 2-D point per row.
        color: edge colour of the (unfilled) ellipse.
    """
    import numpy.linalg as LA
    centre = data.mean(axis=0)
    eigvals, eigvecs = LA.eigh(np.cov(data, rowvar=False))
    # Reorder so the largest eigenvalue (major axis) comes first.
    order = eigvals.argsort()[::-1]
    eigvals = eigvals[order]
    eigvecs = eigvecs[:, order]
    # Orientation of the major axis: arctan2(y, x) of the leading eigenvector.
    major = eigvecs[:, 0]
    theta = np.degrees(np.arctan2(*major[::-1]))
    # 5.991 is the 0.95 quantile of chi-square with 2 degrees of freedom;
    # full axis length = 2 * sqrt(eigenvalue * quantile).
    width, height = 2 * np.sqrt(eigvals * 5.991)  # 95% CI
    patch = Ellipse(
        xy=centre,
        width=width,
        height=height,
        angle=theta,
        edgecolor=color,
        facecolor='none',
        lw=2,
    )
    ax.add_patch(patch)
# Overlay one ellipse per class on the sampled points.
ellipse(XY[y_s == 1], 'green')
ellipse(XY[y_s == 0], 'red')
# Label the axes and render the figure.
ax.set_xlabel('log P(t|pos)')
ax.set_ylabel('log P(t|neg)')
ax.set_title('Likelihood space (2 k sample)')
ax.legend()
plt.show()

## 4️⃣ Interactive likelihood explorer

In [None]:
import gradio as gr
def explorer(txt):
    """Score free text with the fitted likelihood tables.

    Returns a dict with both class log-likelihoods, the positive-class
    probability and a text label. The probability is the logistic function
    of ``log P(t|pos) - log P(t|neg)``; no class-prior term is included.
    """
    lpv, lnv = tweet_ll_real(txt)
    prob = 1 / (1 + np.exp(lnv - lpv))
    if prob >= 0.5:
        label = "Positive 😊"
    else:
        label = "Negative 😞"
    return {
        "log P(t|pos)": round(lpv, 3),
        "log P(t|neg)": round(lnv, 3),
        "Posterior P(pos)": round(prob, 3),
        "Prediction": label,
    }
# Build the UI: a heading, a text box, a JSON output panel and a Run button.
with gr.Blocks() as demo:
    gr.Markdown('### 🕵️ Likelihood explorer (Naïve Bayes)')
    inp = gr.Textbox(lines=3, label='Tweet text')
    out = gr.JSON()
    # Both submitting the text box and clicking the button run `explorer`.
    inp.submit(fn=explorer, inputs=inp, outputs=out)
    gr.Button('Run').click(fn=explorer, inputs=inp, outputs=out)
# Left commented out so the cell only builds the UI; uncomment to serve it.
# demo.launch()

---
🎉 **You visualised Naïve Bayes likelihoods and class uncertainty!**