## Script de création du dataset LinkedIn

In [2]:
import os
import re
import pandas as pd
from datetime import datetime
from base64 import urlsafe_b64decode
from bs4 import BeautifulSoup

from google.oauth2.credentials import Credentials
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build

# ============================================================
# CONFIG
# ============================================================

# OAuth scope requested from Google; this one grants read-only Gmail access.
SCOPES = ["https://www.googleapis.com/auth/gmail.readonly"]

# Excel workbook that receives the exported rows.
OUTPUT_FILE = "data_labeled_linkedin.xlsx"

# Gmail search query selecting the messages to export (matched on the subject).
# NOTE(review): presumably the subject of LinkedIn's "application sent"
# confirmation e-mails in French -- confirm against the mailbox.
QUERY = 'subject:"votre candidature a été envoyée à"'

# ============================================================
# AUTHENTIFICATION GMAIL
# ============================================================

def get_authenticated_service():
    """Return an authorized Gmail API (v1) client.

    Credentials are loaded from ``token.json`` when that file exists. If they
    are missing or not valid, they are refreshed (when expired and a refresh
    token is available) or obtained through the local-server OAuth flow based
    on ``gmail_credentials.json``; the resulting credentials are then written
    back to ``token.json``.
    """
    token_path = "token.json"

    credentials = (
        Credentials.from_authorized_user_file(token_path, SCOPES)
        if os.path.exists(token_path)
        else None
    )

    must_renew = not credentials or not credentials.valid
    if must_renew:
        refreshable = (
            credentials and credentials.expired and credentials.refresh_token
        )
        if refreshable:
            credentials.refresh(Request())
        else:
            # No usable stored credentials: run the interactive consent flow.
            oauth_flow = InstalledAppFlow.from_client_secrets_file(
                "gmail_credentials.json", SCOPES
            )
            credentials = oauth_flow.run_local_server(port=0)

        # Persist the new/refreshed credentials for the next run.
        with open(token_path, "w") as token_file:
            token_file.write(credentials.to_json())

    return build("gmail", "v1", credentials=credentials)

# ============================================================
# UTILITAIRES
# ============================================================

def decode_body(data):
    """Decode a base64url-encoded message body into text.

    Args:
        data: Base64url payload (``str`` or ``bytes``), with or without its
            trailing ``=`` padding. Falsy values (``None``, ``""``) are
            accepted.

    Returns:
        The decoded text, or ``""`` when ``data`` is empty. Bytes that are not
        valid UTF-8 are dropped instead of raising.

    Raises:
        binascii.Error: If ``data`` still cannot be base64-decoded after the
            padding has been restored.
    """
    if not data:
        return ""
    # Base64url producers frequently strip the trailing "=" padding, which
    # urlsafe_b64decode rejects with "Incorrect padding". Restore it so the
    # length is a multiple of 4 (adds nothing when already padded).
    pad_char = b"=" if isinstance(data, (bytes, bytearray)) else "="
    padded = data + pad_char * (-len(data) % 4)
    return urlsafe_b64decode(padded).decode("utf-8", errors="ignore")

def clean_html(html):
    """Return the text content of an HTML string.

    ``script``, ``style``, ``img``, ``footer``, ``header`` and ``svg``
    elements are removed before the text is extracted. Text fragments are
    stripped and joined with newlines.
    """
    discarded_tags = ["script", "style", "img", "footer", "header", "svg"]

    document = BeautifulSoup(html, "html.parser")
    for element in document.find_all(discarded_tags):
        element.decompose()

    return document.get_text(separator="\n", strip=True)

def extract_message_text(payload):
    """Return the first non-empty text found in a message payload.

    ``text/plain`` parts are decoded as-is, ``text/html`` parts are decoded
    and then reduced to text via ``clean_html``, and ``multipart/*`` parts are
    searched recursively in order. Any other MIME type, or a multipart with no
    textual content, yields ``""``.
    """
    kind = payload.get("mimeType", "")

    if kind in ("text/plain", "text/html"):
        decoded = decode_body(payload.get("body", {}).get("data", ""))
        return decoded if kind == "text/plain" else clean_html(decoded)

    if not kind.startswith("multipart/"):
        return ""

    # Lazy generator: sub-parts are only visited until one yields text.
    candidates = (
        extract_message_text(sub_part) for sub_part in payload.get("parts", [])
    )
    return next((found for found in candidates if found), "")

def get_header(headers, name):
    """Return the value of the first header matching ``name``, ignoring case.

    Args:
        headers: Iterable of dicts carrying ``"name"`` and ``"value"`` keys.
        name: Header name to look up.

    Returns:
        The matching header's value, or ``""`` when no header matches.
    """
    for header in headers:
        if header["name"].lower() == name.lower():
            return header["value"]
    return ""

def extract_email(from_header):
    """Extract only the e-mail address from a ``From`` header.

    For example ``'LinkedIn Jobs <jobs-noreply@linkedin.com>'`` gives
    ``'jobs-noreply@linkedin.com'``. When the header has no ``<...>`` part,
    the whole header is returned with surrounding whitespace removed. An
    empty or missing header gives ``""``.
    """
    if not from_header:
        return ""

    bracketed = re.search(r"<([^>]+)>", from_header)
    return bracketed.group(1) if bracketed else from_header.strip()

# ============================================================
# MAIN
# ============================================================

def main():
    """Export every Gmail message matching ``QUERY`` to ``OUTPUT_FILE``.

    Matching messages are listed page by page (100 per page), each one is
    fetched in ``full`` format, and one row per message is collected with its
    subject, text body, sender address, date and a constant ``label`` of 1.
    The rows are written to an Excel file and the row count is printed.
    """
    gmail = get_authenticated_service()

    records = []
    page_token = None

    while True:
        listing = gmail.users().messages().list(
            userId="me",
            q=QUERY,
            maxResults=100,
            pageToken=page_token,
        ).execute()

        message_refs = listing.get("messages", [])

        for ref in message_refs:
            detail = gmail.users().messages().get(
                userId="me",
                id=ref["id"],
                format="full",
            ).execute()

            payload = detail.get("payload", {})
            header_list = payload.get("headers", [])

            subject_line = get_header(header_list, "Subject")
            sender_address = extract_email(get_header(header_list, "From"))

            # internalDate is divided by 1000 before conversion (treated as
            # milliseconds); the result is formatted as naive local time.
            epoch_ms = int(detail.get("internalDate", 0))
            received_at = datetime.fromtimestamp(epoch_ms / 1000)
            formatted_date = received_at.strftime("%d/%m/%Y %H:%M:%S")

            body_text = extract_message_text(payload)

            records.append({
                "subject": subject_line,
                "body": body_text,
                "from": sender_address,
                "date": formatted_date,
                "label": 1,
            })

        page_token = listing.get("nextPageToken")
        # Stop on an empty page or when Gmail reports no further page.
        if not message_refs or not page_token:
            break

    frame = pd.DataFrame(records)
    frame.to_excel(OUTPUT_FILE, index=False)

    print(f"{len(frame)} mails exportés vers {OUTPUT_FILE}")

# ============================================================
# ENTRY POINT
# ============================================================

# Run the export only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


92 mails exportés vers data_labeled_linkedin.xlsx
