In [4]:
import os
import re
import pandas as pd
from datetime import datetime
from base64 import urlsafe_b64decode
from bs4 import BeautifulSoup

from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# ============================================================
# CONFIG
# ============================================================

# OAuth scope requested when authenticating: the Gmail read-only scope.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

# Sender address whose messages are collected.
SOURCE_EMAIL = "jobs-noreply@linkedin.com"

# Excel workbook that receives the extracted rows.
OUTPUT_FILE = "data_to_label_linkedin.xlsx"

# Gmail search query passed to messages.list; filters on the sender address.
QUERY = f"from:{SOURCE_EMAIL}"

# Subject fragment to exclude (application-sent confirmation emails).
# Kept in lowercase because it is matched against the lowercased subject.
EXCLUDED_SUBJECT_PATTERN = "julien, votre candidature a été envoyée à"

# ============================================================
# AUTHENTICATION
# ============================================================

def get_authenticated_service():
    """Return a Gmail API v1 client built with credentials for ``SCOPES``.

    Credentials are loaded from ``token.json`` when that file exists. If they
    are absent or not valid, they are either refreshed (when expired and a
    refresh token is present) or obtained through the local-server OAuth flow
    configured by ``gmail_credentials.json``. In both of those cases the
    resulting credentials are written back to ``token.json``.
    """
    token_path = "token.json"

    creds = (
        Credentials.from_authorized_user_file(token_path, SCOPES)
        if os.path.exists(token_path)
        else None
    )

    # Fast path: the cached credentials are usable as they are.
    if creds and creds.valid:
        return build("gmail", "v1", credentials=creds)

    refreshable = creds and creds.expired and creds.refresh_token
    if refreshable:
        creds.refresh(Request())
    else:
        oauth_flow = InstalledAppFlow.from_client_secrets_file(
            "gmail_credentials.json", SCOPES
        )
        creds = oauth_flow.run_local_server(port=0)

    # Persist the new or refreshed credentials for the next run.
    with open(token_path, "w") as token_file:
        token_file.write(creds.to_json())

    return build("gmail", "v1", credentials=creds)

# ============================================================
# UTILS
# ============================================================

def decode_body(data):
    """Decode a base64url-encoded message body into text.

    Args:
        data: Base64url payload as ``str`` or ``bytes``. May be empty or
            ``None``.

    Returns:
        The decoded text, with bytes that are not valid UTF-8 dropped.
        An empty string when ``data`` is falsy.
    """
    if not data:
        return ""

    # Base64url data may arrive without its trailing "=" padding, which the
    # decoder rejects with "Incorrect padding". Restore it before decoding.
    missing = -len(data) % 4
    if missing:
        pad_char = "=" if isinstance(data, str) else b"="
        data = data + pad_char * missing

    return urlsafe_b64decode(data).decode("utf-8", errors="ignore")

def clean_html(html):
    """Return the visible text of an HTML document, one fragment per line.

    ``script``, ``style``, ``img``, ``footer``, ``header`` and ``svg``
    elements are removed before the text is extracted. Each remaining text
    fragment is stripped and the fragments are joined with newlines.
    """
    removed_tags = ["script", "style", "img", "footer", "header", "svg"]

    document = BeautifulSoup(html, "html.parser")
    for element in document.find_all(removed_tags):
        element.decompose()

    return document.get_text(separator="\n", strip=True)

def extract_message_text(payload):
    """Return the text carried by a message payload, or ``""`` if none.

    A ``text/plain`` payload is decoded as is. A ``text/html`` payload is
    decoded and then reduced to text with ``clean_html``. A ``multipart/*``
    payload is searched part by part, recursively, and the first non-empty
    result is returned. Any other MIME type yields an empty string.
    """
    kind = payload.get("mimeType", "")

    if kind in ("text/plain", "text/html"):
        decoded = decode_body(payload.get("body", {}).get("data", ""))
        return decoded if kind == "text/plain" else clean_html(decoded)

    if not kind.startswith("multipart/"):
        return ""

    # Lazy on purpose: parts after the first one that yields text are never
    # decoded.
    part_texts = (extract_message_text(child) for child in payload.get("parts", []))
    return next((found for found in part_texts if found), "")

def get_header(headers, name):
    """Return the value of the first header named ``name``, ignoring case.

    ``headers`` is an iterable of mappings with ``"name"`` and ``"value"``
    keys. An empty string is returned when no header matches.
    """
    for entry in headers:
        if entry["name"].lower() == name.lower():
            return entry["value"]
    return ""

def extract_email(from_header):
    """Extract the bare address from a ``From`` header value.

    Args:
        from_header: Header value such as ``"Name <user@example.com>"`` or a
            plain address. May be empty or ``None``.

    Returns:
        The text between the first ``<`` and ``>`` when present, otherwise
        the whole header. Surrounding whitespace is removed in both cases.
        An empty string when the header is empty or ``None``.
    """
    # Normalise once so the no-match fallback cannot call .strip() on None.
    raw = from_header or ""
    match = re.search(r"<([^>]+)>", raw)
    return match.group(1).strip() if match else raw.strip()

# ============================================================
# MAIN
# ============================================================

def _iter_message_ids(service, query, page_size=100):
    """Yield the id of every message matching ``query``, page by page."""
    page_token = None
    while True:
        response = service.users().messages().list(
            userId="me",
            q=query,
            maxResults=page_size,
            pageToken=page_token
        ).execute()

        messages = response.get("messages", [])
        if not messages:
            return

        for msg in messages:
            yield msg["id"]

        page_token = response.get("nextPageToken")
        if not page_token:
            return


def _message_to_row(msg_detail):
    """Build the output row for a full-format message, or ``None`` if excluded."""
    payload = msg_detail.get("payload", {})
    headers = payload.get("headers", [])

    subject = get_header(headers, "Subject")

    # Exclusion by subject: skip application-sent confirmations.
    if EXCLUDED_SUBJECT_PATTERN in subject.lower():
        return None

    # The code treats internalDate as epoch milliseconds; fromtimestamp
    # without a tz renders it in the machine's local timezone.
    received_ms = int(msg_detail.get("internalDate", 0))
    received_at = datetime.fromtimestamp(
        received_ms / 1000
    ).strftime("%Y-%m-%d %H:%M:%S")

    return {
        "subject": subject,
        "body": extract_message_text(payload),
        "from": extract_email(get_header(headers, "From")),
        "date": received_at,
        "label": ""  # filled in manually afterwards
    }


def main():
    """Export the messages matching ``QUERY`` to ``OUTPUT_FILE``.

    Every matching message is fetched in full. Messages whose subject
    contains ``EXCLUDED_SUBJECT_PATTERN`` are counted and skipped. The others
    become one row each (subject, body, from, date, empty label). The rows
    are written to an Excel workbook and a short summary is printed.

    The workbook always has the same header row, even when no message is
    kept, so code that later reads columns such as ``subject`` still works.
    """
    columns = ["subject", "body", "from", "date", "label"]

    service = get_authenticated_service()

    rows = []
    skipped = 0

    for message_id in _iter_message_ids(service, QUERY):
        msg_detail = service.users().messages().get(
            userId="me",
            id=message_id,
            format="full"
        ).execute()

        row = _message_to_row(msg_detail)
        if row is None:
            skipped += 1
        else:
            rows.append(row)

    # Explicit columns: an empty `rows` list would otherwise produce a frame
    # (and a workbook) with no columns at all.
    df = pd.DataFrame(rows, columns=columns)
    df.to_excel(OUTPUT_FILE, index=False)

    print("Dataset créé :", OUTPUT_FILE)
    print(f"Mails conservés : {len(df)}")
    print(f"Mails exclus (confirmation d'envoi) : {skipped}")

# ============================================================
# ENTRY POINT
# ============================================================

# Run the extraction only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Dataset créé : data_to_label_linkedin.xlsx
Mails conservés : 103
Mails exclus (confirmation d'envoi) : 92


In [10]:
import pandas as pd

# ============================================================
# LABEL POST-PROCESSING
# ============================================================

# Workbook to relabel; same file name as the extraction step's OUTPUT_FILE.
FILE = "data_to_label_linkedin.xlsx"

# Load the rows to label into a DataFrame.
df = pd.read_excel(FILE)

def assign_label(subject):
    """Map an email subject to a numeric label.

    Returns:
        4 when the subject contains "a été vue" (application viewed),
        5 when it otherwise starts with "votre candidature" (application
        confirmation), and 0 in every other case, including when ``subject``
        is not a string. Matching is case-insensitive.
    """
    label = 0

    if isinstance(subject, str):
        lowered = subject.lower()
        # The "viewed" rule is checked first, so it wins over the
        # confirmation rule when both match.
        if "a été vue" in lowered:
            label = 4
        elif lowered.startswith("votre candidature"):
            label = 5

    return label


# Compute a label for every row from its subject. This replaces the whole
# "label" column, including any value already present in the workbook.
df["label"] = df["subject"].apply(assign_label)

# Write the result back over the input file, without the index column.
df.to_excel(FILE, index=False)

print("Labels automatiquement assignés")
# Show how many rows received each label.
print(df["label"].value_counts())


Labels automatiquement assignés
0    68
5    23
4    12
Name: label, dtype: int64
