In [1]:
import os.path
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

## 1. Clone email từ gmail

In [109]:
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

def main():

    creds = None
    messages_data = []

    if os.path.exists("token.json"):
        creds = Credentials.from_authorized_user_file("token.json", SCOPES)

    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
            "credentials.json", SCOPES
        )
            creds = flow.run_local_server(port=0)

    with open("token.json", "w") as token:
        token.write(creds.to_json())    
    try:
        service = build("gmail", "v1", credentials=creds)
        results = (
            service.users().messages().list(userId="me", labelIds=["INBOX"], maxResults=30).execute()
        )
        messages = results.get("messages", [])    
        if not messages:
            print("No messages found.")
            return    
        print("Messages:")
        i = 1
        for message in messages:
            print(f'Message {i} ID: {message["id"]}')
            msg = (
                service.users().messages().get(userId="me", id=message["id"]).execute()
            )
            print(f'  Subject: {msg["payload"]}') 
            i += 1
            messages_data.append([message["id"],msg["payload"]])
    except HttpError as error:
        print(f"An error occurred: {error}")    
    return messages_data
if __name__ == "__main__":
  messages = main()

Messages:
Message 1 ID: 1994e18622ac91c9
  Subject: {'partId': '', 'mimeType': 'multipart/alternative', 'filename': '', 'headers': [{'name': 'Delivered-To', 'value': 'ming2005hn1@gmail.com'}, {'name': 'Received', 'value': 'by 2002:a98:b884:0:b0:243:9be2:e923 with SMTP id z4csp331954eix;        Mon, 15 Sep 2025 08:57:25 -0700 (PDT)'}, {'name': 'X-Forwarded-Encrypted', 'value': 'i=2; AJvYcCWN1SaA3p2VLnaOAJif4Qwm714p2eWqAjSLmab0Y/uqIgKdAMnOlzz2GoR2cW5rh1nhpKNfxsuZ+hNiCQ==@gmail.com'}, {'name': 'X-Received', 'value': 'by 2002:a17:90b:1802:b0:32e:2b6e:78d1 with SMTP id 98e67ed59e1d1-32e2b6e7b51mr8547691a91.25.1757951845321;        Mon, 15 Sep 2025 08:57:25 -0700 (PDT)'}, {'name': 'ARC-Seal', 'value': 'i=1; a=rsa-sha256; t=1757951845; cv=none;        d=google.com; s=arc-20240605;        b=OlOhED3WrdUq9gpX1fUObUq6w4ep46EWPsf0rYKXWnYDwUvro9on9bqymoS0eDWwpc         0hYDMPfH6SGN0B1HxlJJ1REisedOkpi4jo6n++oBVeo0+TD5zskVxjCwC6OEE41WZId/         iDCtqosUL9lsmK8+4IttrlUE/5PIWywf5BgrcuIav+2bYuft9T/NV3

## 2. Lấy email người gửi

In [98]:
import re
# hàm trích xuất các gmail có trong text
def extract_emails(text):
    emails = re.findall(email_pattern, text)
    return emails

def get_from_header(headers):
    for header in headers:
        if header.get("name") == "From":
            return header.get("value")
    return None  

# tạo định dạng để bắt email trong chuỗi
email_pattern = r'[a-zA-Za-z0-9._%+-]+@[a-zA-Za-z0-9.-]+\.[a-zA-Za-z]{2,}'


print(messages[0][1]['headers'][19]['value'])
headers = messages[0][1]['headers']
from_value = get_from_header(headers)
email = extract_emails(from_value)
print(f'Email from: {email}')

"K17, Long Thi Thanh Hieu" <hieultths173311@fpt.edu.vn>
Email from: ['hieultths173311@fpt.edu.vn']


## 3. Decode nội dung thư

In [None]:
import base64


print(messages[0][1]['parts'][0]['body']['data'])
print('-'*80)
encoded_string = messages[0][1]['parts'][0]['body']['data']
decoded_bytes = base64.urlsafe_b64decode(encoded_string)
decoded_string = decoded_bytes.decode("utf-8")

print(decoded_string)  

QuG6oW4gxJFhbmcgZ-G6t3Aga2jDsyBraMSDbiB24bubaSBjw6FjIG3DtG4gaOG7jWMgdHLDqm4gQ291cnNlcmE_IELhuqFuIGxvIGzhuq9uZyB24buBIGPhu50gxJHhuqFvDQp2xINuLCBkZWFkbGluZSBj4bqtbiBr4buBIGhheSBjaMawYSBiaeG6v3QgY8OhY2ggaG_DoG4gdGjDoG5oIGLDoGkgxJHhu4MgxJHhuqF0IGNo4bupbmcgY2jhu4kNCsSRw7puZyBjaHXhuqluPw0KDQpIw6N5IMSR4buDIGNow7puZyB0w7RpIGdpw7pwIGLhuqFuIQ0KDQoqROG7i2NoIHbhu6UgaOG7lyB0cuG7oyBo4buNYyBDb3Vyc2VyYSBj4bunYSBjaMO6bmcgdMO0aSBtYW5nIMSR4bq_biBnaeG6o2kgcGjDoXAgdG_DoG4gZGnhu4duIGNobw0KaOG7jWMgdmnDqm4sIGNhbSBr4bq_dDoqDQoNCuKchSBIb8OgbiB0aMOgbmggdHLhu41uIGfDs2kgdOG6pXQgY-G6oyBjw6FjIG3DtG4gaOG7jWMgQ291cnNlcmEgKFNwZWNpYWxpemF0aW9uLA0KUHJvZmVzc2lvbmFsIENlcnRpZmljYXRl4oCmKQ0K4pyFIEJhbyBjaGVjayDEkeG6oW8gdsSDbiDigJMgMTAwJSBz4bqhY2ggVHVybml0aW4vR3JhbW1hcmx5DQrinIUgSOG7lyB0cuG7oyBn4buhIGPhu50gxJHhuqFvIHbEg24g4oCTIGvhu4MgY-G6oyB04burIDEgxJHhur9uIDEwIGPhu50NCuKchSBIb8OgbiB0aMOgbmggc-G7m20gxJHhu4MgYuG6oW4gxJHGsOG7o2MgY-G7mW5nIMSRaeG7g20gdHJvbmcga-G7syB0aGkNCuKchSBD4bupdSBo4buZIGPDoWMgY2Eg4oCca2jDsyBuaOG6sW7igJ0gbmjh

## 4. Tiến hành trích xuất 30 email đầu tiên

In [110]:
import pandas as pd
data = {
    'email_from': [],
    'data': [],
    'label': []
}
for item in range(30):
    headers = messages[item][1]['headers']
    from_value = get_from_header(headers)
    email = extract_emails(from_value)
    msg_payload = messages[item][1]['payload'] if 'payload' in messages[item][1] else messages[item][1]
    encoded_data = None
    if 'parts' in msg_payload and msg_payload['parts']:
        for part in msg_payload['parts']:
            if 'body' in part and 'data' in part['body']:
                encoded_data = part['body']['data']
                break
    if not encoded_data and 'body' in msg_payload and 'data' in msg_payload['body']:
        encoded_data = msg_payload['body']['data']
    if encoded_data:
        decoded_bytes = base64.urlsafe_b64decode(encoded_data)
        decoded_data = decoded_bytes.decode("utf-8", errors="replace")
    else:
        decoded_data = ""
    data['email_from'].append(email)
    data['data'].append(decoded_data)
    data['label'].append('None')

df = pd.DataFrame(data)

print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   email_from  30 non-null     object
 1   data        30 non-null     object
 2   label       30 non-null     object
dtypes: object(3)
memory usage: 848.0+ bytes
None


## 6. Lưu lại data đã trích xuất được

In [111]:
df.to_csv('30_emails.csv')