<a href="https://colab.research.google.com/github/Navyasree17-J/Mini-Projects/blob/main/Smart_Email_Classifier_with_Gmail_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib nltk gradio


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.2/13.2 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.9/46.9 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m322.2/322.2 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.3/62.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import base64
import re
import gradio as gr
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from email import message_from_bytes

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
creds = flow.run_local_server(port=0)
service = build('gmail', 'v1', credentials=creds)


FileNotFoundError: [Errno 2] No such file or directory: 'credentials.json'

In [None]:
def get_email_bodies(n=10):
    messages = service.users().messages().list(userId='me', maxResults=n).execute().get('messages', [])
    emails = []
    for msg in messages:
        msg_data = service.users().messages().get(userId='me', id=msg['id'], format='raw').execute()
        msg_str = base64.urlsafe_b64decode(msg_data['raw'].encode('ASCII'))
        mime_msg = message_from_bytes(msg_str)
        if mime_msg.is_multipart():
            body = mime_msg.get_payload(0).get_payload(decode=True)
        else:
            body = mime_msg.get_payload(decode=True)
        if body:
            text = body.decode(errors='ignore')
            emails.append(text)
    return emails


In [None]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = nltk.word_tokenize(text)
    cleaned = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(cleaned)


In [None]:
url = "https://raw.githubusercontent.com/Navyasree17-J/Mini-Projects/main/Email_classification1.csv"
df = pd.read_csv(url)

df.rename(columns={"text": "Email", "spam": "Label"}, inplace=True)
df['Label'] = df['Label'].map({1: 'Spam', 0: 'Important'})

promo_sample = df[df['Label'] == 'Important'].sample(frac=0.2, random_state=42).index
df.loc[promo_sample, 'Label'] = 'Promotional'

df['CleanText'] = df['Email'].apply(preprocess)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['CleanText'])
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)


In [None]:
def classify_emails_from_gmail(n=5):
    emails = get_email_bodies(n)
    results = []
    for email in emails:
        cleaned = preprocess(email)
        vect = vectorizer.transform([cleaned])
        pred = model.predict(vect)[0]
        results.append((email[:200] + "...", pred))
    return results


In [None]:
gr.Interface(
    fn=classify_emails_from_gmail,
    inputs=gr.Slider(minimum=1, maximum=10, step=1, label="Number of Emails"),
    outputs=gr.Dataframe(headers=["Email Snippet", "Category"]),
    title="📬 Smart Email Classifier (Gmail Auto-Fetch)",
    description="Fetches your recent Gmail emails and classifies them as Spam, Important, or Promotional."
).launch(share=True)
