In [1]:
# Jupyter cell: run as a shell command (start with !)
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib tqdm beautifulsoup4




In [2]:
import os
import json
import base64
from pathlib import Path
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from bs4 import BeautifulSoup
from tqdm import tqdm

# Project paths
PROJECT_ROOT = Path("C:/Users/navya/Desktop/IBY")  
CREDS_FILE = PROJECT_ROOT / "credentials.json"
TOKEN_FILE = PROJECT_ROOT / "token.json"
DATA_DIR = PROJECT_ROOT / "data"
OUT_JSONL = DATA_DIR / "emails_raw.jsonl"

# Gmail read-only scope
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

# Create data directory if it doesn't exist
DATA_DIR.mkdir(exist_ok=True)

print("Project root:", PROJECT_ROOT.resolve())
print("Credentials file exists:", CREDS_FILE.exists())


Project root: C:\Users\navya\Desktop\IBY
Credentials file exists: True


In [3]:
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build

SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]  # read-only Gmail access

def get_gmail_service(creds_file=CREDS_FILE, token_file=TOKEN_FILE, scopes=SCOPES):
    creds = None

    # 1) Load existing token.json if it exists
    if token_file.exists():
        creds = Credentials.from_authorized_user_file(str(token_file), scopes)
    
    # 2) If no valid creds, either refresh or run OAuth flow
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
            print("Refreshed expired token.")
        else:
            flow = InstalledAppFlow.from_client_secrets_file(str(creds_file), scopes)
            creds = flow.run_local_server(port=0)  # opens browser to log in and approve
            print("Completed OAuth flow in browser.")
        
        # Save token.json for future use
        token_file.write_text(creds.to_json())
        print("Saved token.json to:", token_file)

    # 3) Build Gmail API service object
    service = build("gmail", "v1", credentials=creds)
    return service

# Run this cell — the first time it will open a browser to log in
service = get_gmail_service()
print("Gmail API service ready:", bool(service))


Refreshed expired token.
Saved token.json to: C:\Users\navya\Desktop\IBY\token.json
Gmail API service ready: True


In [4]:
import base64
from bs4 import BeautifulSoup
from tqdm import tqdm
import json

# Function to list message IDs
def list_message_ids(service, max_results=100, user_id="me"):
    """Return a list of Gmail message IDs"""
    try:
        response = service.users().messages().list(userId=user_id, maxResults=max_results).execute()
        return response.get("messages", [])
    except Exception as e:
        print("Error listing messages:", e)
        return []

# Function to fetch full message
def get_message(service, msg_id, user_id="me"):
    try:
        msg = service.users().messages().get(userId=user_id, id=msg_id, format="full").execute()
        return msg
    except Exception as e:
        print("Error fetching message:", e)
        return None

# Function to parse message
def parse_message(msg):
    """Extract subject, sender, date, body"""
    headers = {h['name']: h['value'] for h in msg['payload']['headers']}
    subject = headers.get('Subject', '')
    sender = headers.get('From', '')
    date = headers.get('Date', '')
    
    # Extract body
    body = ""
    parts = msg['payload'].get('parts', [])
    if parts:
        for p in parts:
            if p['mimeType'] == 'text/plain' and 'data' in p['body']:
                body = base64.urlsafe_b64decode(p['body']['data']).decode()
                break
            elif p['mimeType'] == 'text/html' and 'data' in p['body']:
                html = base64.urlsafe_b64decode(p['body']['data']).decode()
                body = BeautifulSoup(html, "html.parser").get_text()
                break
    else:
        if 'body' in msg['payload'] and 'data' in msg['payload']['body']:
            body = base64.urlsafe_b64decode(msg['payload']['body']['data']).decode()
    
    return {
        "subject": subject,
        "from": sender,
        "date": date,
        "body": body
    }

# Fetch emails
message_ids = list_message_ids(service, max_results=1000)  # change 50 to higher number if needed
emails = []

for m in tqdm(message_ids):
    msg = get_message(service, m['id'])
    if msg:
        emails.append(parse_message(msg))

# Save to JSONL
OUT_JSONL.parent.mkdir(exist_ok=True)  # ensure data/ exists
with open(OUT_JSONL, "w", encoding="utf-8") as f:
    for email in emails:
        f.write(json.dumps(email, ensure_ascii=False) + "\n")

print(f"Saved {len(emails)} emails to {OUT_JSONL}")


100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [05:59<00:00,  1.39it/s]

Saved 500 emails to C:\Users\navya\Desktop\IBY\data\emails_raw.jsonl



