In [None]:
import string
import numpy as np
import pandas as pd
import warnings
import textwrap
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from csv_trans import translate
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Silence every warning category so library notices do not clutter the cell outputs below.
warnings.filterwarnings('ignore')
print('Warnings ignored!!')

In [None]:
# nltk.download('stopwords', '/home/sang/Practice/machinelearning')

In [None]:
# Read the spam/ham CSV into a DataFrame; the bare `df` renders it as the cell output.
df = pd.read_csv('../data/spam_ham_dataset.csv')
df

In [None]:
# Replace Windows line endings with a space so every message is a single line of text.
df['text'] = df['text'].map(lambda message: message.replace('\r\n', ' '))
df

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Show the first message, wrapped to 85 columns for readability.
print(textwrap.fill(df.text.iloc[0], width=85))

In [None]:
# Text-normalisation tools. `stemer` and `stopwords_set` keep their original
# names because later cells reuse them.
stemer = PorterStemmer()

# NOTE(review): machine-specific location of the NLTK corpora; consider making it configurable.
NLTK_DATA_DIR = "/home/sang/Practice/machinelearning"
if NLTK_DATA_DIR not in nltk.data.path:  # re-running the cell must not append duplicates
    nltk.data.path.append(NLTK_DATA_DIR)
stopwords_set = set(stopwords.words('english'))

# Built once instead of once per message: deletes every character in string.punctuation.
PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(raw_text):
    """Normalise one message for the bag-of-words model.

    Steps, in order: lower-case, delete punctuation, split on whitespace,
    drop English stop words, Porter-stem the remaining tokens, and join the
    stems back together with single spaces.

    Parameters
    ----------
    raw_text : str
        The message text.

    Returns
    -------
    str
        Space-separated stems.
    """
    tokens = raw_text.lower().translate(PUNCT_TABLE).split()
    return ' '.join(stemer.stem(token) for token in tokens if token not in stopwords_set)


# One normalised string per row of df['text'], in row order.
corpus = [preprocess_text(message) for message in df['text']]

In [None]:
# Same first message after normalisation, wrapped to 85 columns.
print(textwrap.fill(corpus[0], width=85))

In [None]:
# Fixed seed so the train/test split (and therefore every metric below) is
# the same on each Restart & Run All.
RANDOM_STATE = 42

# Bag-of-words counts; terms found in fewer than 2 documents or in more than
# 90% of documents are dropped from the vocabulary.
vectorizer = CountVectorizer(min_df=2, max_df=0.9)

# fit_transform returns a sparse matrix; .toarray() turns it into a dense array.
X = vectorizer.fit_transform(corpus).toarray()
y = df.label_num

# Hold out 30% for testing; stratify keeps the class proportions of `y`
# the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE, stratify=y
)

In [None]:
# n_jobs=-1 uses all available cores; random_state pins the forest's
# randomness so the fitted model is reproducible between runs.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)

clf.fit(X_train, y_train)

In [None]:
# Mean accuracy of the fitted classifier on the held-out split.
test_accuracy = clf.score(X_test, y_test)
print('Accuracy score: ', test_accuracy)

In [None]:
# Predict the held-out split once; `y_pred` is reused by the confusion-matrix cell.
y_pred = clf.predict(X_test)
report_text = classification_report(y_test, y_pred)
print('Classification report:\n', report_text)

In [None]:
# Rows/columns are ordered as label 0 then label 1, displayed as 'ham' and 'spam'.
class_ids = [0, 1]
class_names = ['ham', 'spam']

cm = confusion_matrix(y_test, y_pred, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

disp.plot(cmap='Blues')
plt.show()

## Test model

In [None]:
# Pick one message from the dataset (position 10) to classify by hand.
sample_position = 10
email_to_classify = df['text'].values[sample_position]
wrapped_email = textwrap.fill(email_to_classify, width=85)
print(wrapped_email)

In [None]:
# Apply the same steps used to build `corpus` (lower-case, strip punctuation,
# drop stop words, stem) before handing the text to the fitted vectorizer.
strip_punct = str.maketrans('', '', string.punctuation)
email_tokens = email_to_classify.lower().translate(strip_punct).split()
email_text = ' '.join([stemer.stem(token) for token in email_tokens if token not in stopwords_set])

# transform() expects an iterable of documents, hence the one-element list.
email_corpus = [email_text]
X_email = vectorizer.transform(email_corpus)

In [None]:
# Predicted class for the selected message (one-element array).
clf.predict(X_email)

In [None]:
# Label stored in the dataset for that same row, for comparison with the prediction.
df.label_num.iloc[10]

In [None]:
# Debug check: shows the Python type of the `text` column object.
print(type(df.text))

In [None]:
# Hand-written message (not taken from the dataset) used to try the model on new text.
test_email =   """Subject: Meeting Reminder – Project Update

Dear Team,

This is a friendly reminder that we have a project update meeting scheduled for tomorrow at 10:00 AM in the main conference room. Please come prepared with your progress reports and any questions you might have.

Looking forward to seeing everyone there.

Best regards,
Alice Johnson
Project Manager"""


In [None]:
# Normalise the hand-written message with the same steps used to build
# `corpus`, vectorise it, and show the predicted class.
strip_punct = str.maketrans('', '', string.punctuation)
test_tokens = test_email.lower().translate(strip_punct).split()
email_text = ' '.join([stemer.stem(token) for token in test_tokens if token not in stopwords_set])

# transform() expects an iterable of documents, hence the one-element list.
email_corpus = [email_text]
X_email = vectorizer.transform(email_corpus)

clf.predict(X_email)

## Save model

In [None]:
import joblib
import os

# Persist the fitted classifier and its vectorizer in the same folder so they can be reloaded together.
os.makedirs('out_email_spam', exist_ok=True)

joblib.dump(clf, 'out_email_spam/clf_model.pkl')
joblib.dump(vectorizer, 'out_email_spam/vectorizer.pkl')

*Create a Google app password for the Gmail account (it is used below to log in over IMAP):*

<https://myaccount.google.com/apppasswords>

In [None]:
# import os
# import re
# import sys
# import email
# import joblib
# import imaplib
# import argparse
# from datetime import datetime
# from email.header import decode_header

# #==================================================
# #           Load model and vectorizer
# #==================================================
# def load_model_vectorizer(model_path, vect_path):
#     model = joblib.load(model_path)
#     vectorizer = joblib.load(vect_path)
#     return model, vectorizer

# #==================================================
# #           Predict spam probability
# #==================================================     
# def predict_spam_prob(model, vectorizer, texts):
#     X = vectorizer.transform(texts)
#     probs = model.predict_proba(X)[:, 1]
#     return probs.tolist()

# #==================================================
# #           Decode mime header
# #==================================================
# def decode_mime_words(s):
#     decoded = decode_header(s)
#     return "".join([
#         t[0].decode(t[1] or "utf-8") if isinstance(t[0], bytes) else str(t[0])
#         for t in decoded
#     ])

# #==================================================
# #                   CLI
# #==================================================
# def get_args():
#     parser = argparse.ArgumentParser(description='Email spam')
#     parser.add_argument('--email', type=str, default='abc@gmail.com', help='Gmail address')
#     parser.add_argument('--app_password', type=str, default='fkwi gqqv hhqq exti', help='App password (Google)')
#     parser.add_argument('--search', type=str, default='UNSEEN', help='IMAP search filter (default UNSEEN, use ALL to scan all)')
#     parser.add_argument('--model', type=str, default='/home/sang/Practice/machinelearning/out_email_spam/clf_model.pkl', help='Model path (*.pkl)')
#     parser.add_argument('--vectorizer', type=str, default='/home/sang/Practice/machinelearning/out_email_spam/vectorizer.pkl', help='Vectorizer path (*.pkl)')
#     parser.add_argument('--backup_dir', type=str, default=None, help='Backup .eml messages (default: None)')
#     parser.add_argument('--max_mail', type=int, default=100, help='Max emails per run')
#     parser.add_argument('--threshold', type=float, default=0.8, help='Spam threshold')
#     parser.add_argument('--dry_run', action='store_true', help='Do not actually move emails, just simulate')
#     return parser.parse_args()

# #==================================================
# #           Main mailbox processor
# #==================================================  
# def process_mailbox():
#     args = get_args()

#     # ------------------ Login ------------------
#     mail = imaplib.IMAP4_SSL('imap.gmail.com')
#     mail.login(args.email, args.app_password)
#     mail.select('inbox')

#     search_crit = args.search if args.search else 'UNSEEN'
#     type, data = mail.uid('search', None, search_crit)

#     if type != 'OK':
#         print('Search failed !!!')
#         return
    
#     uids = data[0].split()
#     print(f'Found {len(uids)} messages for search: {search_crit}')

#     # ------------------ Load model ------------------
#     model, vectorizer = load_model_vectorizer(args.model, args.vectorizer)
    
#     if args.backup_dir:
#         os.makedirs(args.backup_dir, exist_ok=True)

#     processed = 0
#     flagged = 0

#     for uid in uids[::-1]:
#         if args.max_mail and processed >= args.max_mail:
#             break

#         uid_str = uid.decode() if isinstance(uid, bytes) else str(uid)

#         type, msg_data = mail.uid('fetch', uid_str, 'RFC822')
#         if type != 'OK':
#             continue
        
#         raw_email = msg_data[0][1]
#         msg = email.message_from_bytes(raw_email)
        
#         subject = decode_mime_words(msg["Subject"]) if msg["Subject"] else ""
#         print("="*40)
#         print(f"UID: {uid_str}")
#         print(f"Subject: {subject}")

#         body = ""
#         if msg.is_multipart():
#             for part in msg.walk():
#                 if part.get_content_type() == "text/plain":
#                     try:
#                         body += part.get_payload(decode=True).decode("utf-8", errors="ignore")
#                     except Exception:
#                         pass
#         else:
#             try:
#                 body = msg.get_payload(decode=True).decode("utf-8", errors="ignore")
#             except Exception:
#                 pass

#         snippet = body[:200].replace("\n", " ").replace("\r", " ")
#         print(f"Snippet: {snippet}")

#         # backup raw eml
#         if args.backup_dir:
#             fn = f"{datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')}_{uid_str}.eml"
#             path = os.path.join(args.backup_dir, fn)
#             with open(path, "wb") as f:
#                 f.write(raw_email)

#         # ------------------ Predict spam ------------------
#         text_for_model = re.sub(r"\s+", " ", (subject + " " + body).lower())
#         prob_spam = predict_spam_prob(model, vectorizer, [text_for_model])[0]
#         print(f"Predicted spam probability: {prob_spam:.3f}")

#         if prob_spam >= args.threshold:
#             flagged += 1
#             print("Spam")  
#             if args.dry_run:
#                 print(f"[DRY_RUN] WOULD move UID {uid_str} to Trash")
#             else:
#                 try:
#                     # Gmail-specific move to Trash
#                     mail.uid("store", uid_str, "+X-GM-LABELS", "\\Trash")
#                     print("Moved to Trash")
#                 except Exception as e:
#                     print("Label failed, fallback delete:", e)
#                     mail.uid("store", uid_str, "+FLAGS", "(\\Deleted)")
#                     mail.expunge()
#         else:
#             print("Not flagged as spam.")

#         processed += 1

#     print("="*40)
#     print(f"Processed: {processed}, Flagged as spam: {flagged}")
#     mail.logout()

# if __name__ == '__main__':
#     process_mailbox()

In [None]:
import os
import re
import email
import imaplib
import joblib
from datetime import datetime
from email.header import decode_header

# --- Load model + vectorizer ---
def load_model_and_vectorizer(model_path, vec_path):
    """Load the persisted classifier and its vectorizer.

    Parameters
    ----------
    model_path : path of the joblib file holding the classifier.
    vec_path : path of the joblib file holding the vectorizer.

    Returns
    -------
    tuple
        ``(model, vectorizer)``, in that order.
    """
    # NOTE(review): joblib files are pickle-based, so only load artefacts you produced yourself.
    return joblib.load(model_path), joblib.load(vec_path)

# --- Predict spam probability ---
def predict_spam_prob(model, vectorizer, texts):
    """Return the spam probability of each text.

    Parameters
    ----------
    model : object exposing ``predict_proba``.
    vectorizer : object exposing ``transform``.
    texts : iterable of documents passed to ``vectorizer.transform``.

    Returns
    -------
    list
        One value per text, taken from column 1 of ``predict_proba``.
    """
    features = vectorizer.transform(texts)
    class_probabilities = model.predict_proba(features)
    # Column 1 = spam probability.
    spam_column = class_probabilities[:, 1]
    return spam_column.tolist()

# --- Helper: decode header ---
def decode_mime_words(s):
    """Decode an RFC 2047 encoded header value into a plain string.

    Parameters
    ----------
    s : str or None
        Raw header value, e.g. ``"=?utf-8?b?SGVsbG8=?="``.

    Returns
    -------
    str
        The decoded pieces concatenated in order; ``""`` for an empty or
        missing header.

    Notes
    -----
    Decoding never raises: bytes that are invalid for the declared charset
    are replaced with U+FFFD, and a charset label Python does not know is
    treated as UTF-8. One malformed header therefore cannot abort a whole
    mailbox run.
    """
    if not s:
        return ""

    pieces = []
    for raw, charset in decode_header(s):
        if not isinstance(raw, bytes):
            # decode_header returns plain str when nothing was encoded.
            pieces.append(str(raw))
            continue
        try:
            pieces.append(raw.decode(charset or "utf-8", errors="replace"))
        except LookupError:
            # Unknown charset label (e.g. "unknown-8bit").
            pieces.append(raw.decode("utf-8", errors="replace"))
    return "".join(pieces)

# --- Helpers used by the mailbox processor ---
def _decode_part(part):
    """Return one MIME part's payload as text ('' when it has no payload)."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return ""
    # Honour the charset declared by the part; default to UTF-8 when absent.
    charset = part.get_content_charset() or "utf-8"
    try:
        return payload.decode(charset, errors="ignore")
    except LookupError:
        # The declared charset is not a codec Python knows.
        return payload.decode("utf-8", errors="ignore")


def _extract_body(msg):
    """Collect the text to classify: every text/plain part of a multipart
    message, or the single payload of a non-multipart message."""
    if msg.is_multipart():
        return "".join(
            _decode_part(part)
            for part in msg.walk()
            if part.get_content_type() == "text/plain"
        )
    return _decode_part(msg)


def _move_to_trash(mail, uid_str):
    """Label one message as Trash (Gmail); fall back to delete + expunge on failure."""
    try:
        status, response = mail.uid("store", uid_str, "+X-GM-LABELS", "\\Trash")
        if status != "OK":
            # imaplib raises only for BAD replies; a NO reply is returned as a status,
            # so it has to be checked explicitly before reporting success.
            raise imaplib.IMAP4.error(f"STORE returned {status}: {response}")
        print("Moved to Trash")
    except Exception as e:
        print("Label failed, fallback delete:", e)
        mail.uid("store", uid_str, "+FLAGS", "(\\Deleted)")
        mail.expunge()


# --- Main mailbox processor ---
def process_mailbox(args):
    """Scan a Gmail inbox and move messages classified as spam to Trash.

    Parameters
    ----------
    args : object
        Settings holder with these attributes:
        ``email``, ``app_password`` - IMAP login credentials;
        ``search`` - IMAP search criterion (falls back to ``"UNSEEN"`` when empty);
        ``model``, ``vectorizer`` - paths of the persisted classifier and vectorizer;
        ``backup``, ``backup_dir`` - whether and where to save raw ``.eml`` copies;
        ``max_mail`` - maximum number of messages handled per run (falsy = no limit);
        ``threshold`` - spam probability at or above which a message is flagged;
        ``dry_run`` - when true, only report what would be moved.

    Returns
    -------
    None
        Progress is reported with ``print``.
    """
    # Local imports so the function works even if only this cell has been run.
    import string
    from datetime import timezone
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer

    search_crit = args.search if args.search else "UNSEEN"

    # The context manager logs out on every exit path (early return or exception).
    with imaplib.IMAP4_SSL("imap.gmail.com") as mail:
        mail.login(args.email, args.app_password)

        typ, _ = mail.select("inbox")
        if typ != "OK":
            print("Could not open inbox.")
            return

        typ, data = mail.uid("search", None, search_crit)
        if typ != "OK":
            print("Search failed.")
            return

        uids = data[0].split() if data and data[0] else []
        print(f"Found {len(uids)} messages for search: {search_crit}")

        # load model
        model, vectorizer = load_model_and_vectorizer(args.model, args.vectorizer)

        # The training corpus was lower-cased, stripped of punctuation, filtered
        # for English stop words and Porter-stemmed before vectorising; incoming
        # mail must get the same treatment or its tokens will not match the
        # vocabulary the model was fitted on.
        stemmer = PorterStemmer()
        stop_words = set(stopwords.words("english"))
        punct_table = str.maketrans("", "", string.punctuation)

        # Only touch the filesystem when backups were actually requested.
        if args.backup:
            os.makedirs(args.backup_dir, exist_ok=True)

        processed = 0
        flagged = 0

        # Walk the search result from its last UID to its first.
        for uid in reversed(uids):
            if args.max_mail and processed >= args.max_mail:
                break
            uid_str = uid.decode() if isinstance(uid, bytes) else str(uid)

            typ, msg_data = mail.uid("fetch", uid_str, "(RFC822)")
            # A fetch can answer OK without a message tuple; skip instead of crashing.
            if typ != "OK" or not msg_data or not isinstance(msg_data[0], tuple):
                continue

            raw_email = msg_data[0][1]
            msg = email.message_from_bytes(raw_email)

            subject = decode_mime_words(msg["Subject"]) if msg["Subject"] else ""
            print("="*40)
            print(f"UID: {uid_str}")
            print(f"Subject: {subject}")

            # body text
            body = _extract_body(msg)

            snippet = body[:200].replace("\n", " ").replace("\r", " ")
            print(f"Snippet: {snippet}")

            # backup raw eml
            if args.backup:
                stamp = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
                path = os.path.join(args.backup_dir, f"{stamp}_{uid_str}.eml")
                with open(path, "wb") as f:
                    f.write(raw_email)

            # --- Predict spam ---
            tokens = (subject + " " + body).lower().translate(punct_table).split()
            text_for_model = " ".join(
                stemmer.stem(token) for token in tokens if token not in stop_words
            )
            prob_spam = predict_spam_prob(model, vectorizer, [text_for_model])[0]
            print(f"Predicted spam probability: {prob_spam:.3f}")

            if prob_spam >= args.threshold:
                flagged += 1
                print("have spam")
                if args.dry_run:
                    print(f"[DRY_RUN] WOULD move UID {uid_str} to Trash")
                else:
                    _move_to_trash(mail, uid_str)
            else:
                print("Not flagged as spam.")

            processed += 1

        print("="*40)
        print(f"Processed: {processed}, Flagged as spam: {flagged}")


def main():
    """Run the mailbox scan with this notebook's settings.

    Credentials are no longer stored in the source: the address is read from
    the ``GMAIL_ADDRESS`` environment variable (default ``abc@gmail.com``)
    and the app password from ``GMAIL_APP_PASSWORD``. When the password
    variable is unset or empty, it is requested interactively without echo.
    """
    import getpass

    gmail_address = os.environ.get("GMAIL_ADDRESS", "abc@gmail.com")
    gmail_app_password = os.environ.get("GMAIL_APP_PASSWORD") or getpass.getpass(
        f"App password for {gmail_address}: "
    )

    class Args:
        # Plain attribute holder standing in for parsed command-line arguments.
        email = gmail_address
        app_password = gmail_app_password
        model = "out_email_spam/clf_model.pkl"
        vectorizer = "out_email_spam/vectorizer.pkl"
        search = "UNSEEN"
        backup_dir = "out_email_spam/backup"
        max_mail = 10
        threshold = 0.8
        backup = True
        dry_run = False  # False means flagged messages really are moved to Trash

    args = Args()
    process_mailbox(args)


if __name__ == "__main__":
    main()
