In [None]:
import os
from flask import Flask, render_template, request, redirect, url_for, session, flash
from flask_sqlalchemy import SQLAlchemy
from werkzeug.security import generate_password_hash, check_password_hash
import pandas as pd
import re
import joblib
import requests
from bs4 import BeautifulSoup
import json
import logging

# ===== Flask setup =====
logging.basicConfig(level=logging.DEBUG)
app = Flask(__name__, template_folder='my_templates')

# Flask signs the session cookie with SECRET_KEY. A well-known literal fallback
# would let anyone forge a session (and therefore any `user_id`), so when the
# environment does not provide a key we generate a random one for this process.
# Consequence: sessions do not survive a restart unless SECRET_KEY is set.
_secret_key = os.environ.get('SECRET_KEY')
if not _secret_key:
    logging.warning(
        "SECRET_KEY is not set; using a random per-process key "
        "(sessions will be invalidated on restart)."
    )
    _secret_key = os.urandom(32).hex()
app.config['SECRET_KEY'] = _secret_key
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///job_app.db'
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False

db = SQLAlchemy(app)

# ===== Models =====
class User(db.Model):
    """Application account: login credentials plus links to per-user data."""

    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(
        db.String(150),
        nullable=False,
        unique=True,
    )
    # Holds the value produced by `generate_password_hash` at registration.
    password_hash = db.Column(
        db.String(200),
        nullable=False,
    )

    # `uselist=False` exposes the profile as a single object (or None)
    # instead of a list.
    profile = db.relationship(
        'Profile',
        uselist=False,
        backref='user',
    )
    alerts = db.relationship('Alert', backref='user')
    seen_offers = db.relationship('JobOffer', backref='user')

class Profile(db.Model):
    """Job-search profile attached to a user (reached through `User.profile`)."""

    id = db.Column(db.Integer, primary_key=True)
    user_id = db.Column(
        db.Integer,
        db.ForeignKey('user.id'),
        nullable=False,
    )

    # `job_title` and `location` are the two fields the job search builds
    # its query from.
    job_title = db.Column(db.String(200))
    location = db.Column(db.String(200))

    # Remaining fields are free text taken from the profile form.
    experience = db.Column(db.Text)
    education = db.Column(db.String(200))
    skills = db.Column(db.String(500))
    languages = db.Column(db.String(200))

class Alert(db.Model):
    """Warning recorded for a user about a company tied to a flagged offer."""

    id = db.Column(db.Integer, primary_key=True)
    user_id = db.Column(
        db.Integer,
        db.ForeignKey('user.id'),
        nullable=False,
    )
    company_name = db.Column(
        db.String(200),
        nullable=False,
    )
    # Filled in by the database at insert time.
    timestamp = db.Column(
        db.DateTime,
        server_default=db.func.now(),
    )

class JobOffer(db.Model):
    """A job posting already scraped and classified for a given user."""

    id = db.Column(db.Integer, primary_key=True)
    user_id = db.Column(
        db.Integer,
        db.ForeignKey('user.id'),
        nullable=False,
    )
    # Address of the scraped page; also used to skip offers already processed.
    url = db.Column(
        db.String(500),
        nullable=False,
    )
    # Scraped fields, keyed by the `name` of each roadmap entry.
    data = db.Column(db.JSON)
    # Classifier verdict as shown to the user.
    label = db.Column(db.String(50))
    # Model probability stored as a percentage rounded to two decimals.
    probability = db.Column(db.Float)
    # Filled in by the database at insert time.
    timestamp = db.Column(
        db.DateTime,
        server_default=db.func.now(),
    )

# ===== Load ML pipeline =====
# Locate the directory holding the model artifacts. `__file__` is undefined in
# interactive sessions (notebook/REPL); the working directory is used instead.
try:
    BASE_DIR = os.path.dirname(__file__)
except NameError:
    BASE_DIR = os.getcwd()
else:
    BASE_DIR = os.path.abspath(BASE_DIR)

PIPELINE_PATH = os.path.join(BASE_DIR, 'rf_pipeline.pkl')
if os.path.exists(PIPELINE_PATH):
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only load a pipeline file you produced or trust.
    model_pipeline = joblib.load(PIPELINE_PATH)
else:
    # Fail fast at import time: the app cannot classify offers without it.
    raise FileNotFoundError("rf_pipeline.pkl introuvable; entraînez et sauvegardez le modèle.")

# ===== Load roadmap mapping =====
# The roadmap is a JSON list of scraping rules; each entry is read later for
# its `name` and (optional) `cssSelector` keys.
ROADMAP_PATH = os.path.join(BASE_DIR, 'roadmap.json')
if os.path.exists(ROADMAP_PATH):
    with open(ROADMAP_PATH, encoding='utf-8') as f:
        ROADMAP = json.load(f)
else:
    # Fail fast at import time: scraping has nothing to extract without it.
    raise FileNotFoundError("roadmap.json introuvable; placez-le à côté du script.")

# ===== Text cleaning =====
def clean_text(text):
    """Normalize raw text before it is fed to the model pipeline.

    Falsy input (``None``, ``''``, ``0``...) yields an empty string. Otherwise
    the value is stringified, lower-cased, runs of whitespace are collapsed to
    a single space, every character outside ``a-z``, ``0-9`` and whitespace is
    removed, and leading/trailing whitespace is trimmed.

    Note that removing characters happens after whitespace collapsing, so a
    removed character that sat between two spaces leaves both spaces behind.
    """
    if not text:
        return ''
    lowered = str(text).lower()
    single_spaced = re.sub(r"\s+", ' ', lowered)
    alnum_only = re.sub(r"[^a-z0-9\s]", '', single_spaced)
    return alnum_only.strip()

# ===== Routes =====
@app.route('/')
def index():
    """Send visitors of the site root to the login page."""
    login_url = url_for('login')
    return redirect(login_url)

@app.route('/register', methods=['GET', 'POST'])
def register():
    """Show the sign-up form (GET) or create a new account (POST).

    On POST the submitted credentials are validated, the username is checked
    for uniqueness, and the password is stored only as a hash. Every failure
    flashes a message and redirects back to the form; success redirects to
    the login page.
    """
    if request.method == 'POST':
        username = request.form.get('username', '')
        password = request.form.get('password', '')

        # Refuse blank credentials: an empty password would create an account
        # anyone can log into, and a blank username is unusable.
        if not username.strip() or not password:
            flash('Nom d’utilisateur et mot de passe requis.')
            return redirect(url_for('register'))

        if User.query.filter_by(username=username).first():
            flash('Ce nom d’utilisateur existe déjà.')
            return redirect(url_for('register'))

        user = User(username=username, password_hash=generate_password_hash(password))
        db.session.add(user)
        db.session.commit()
        flash('Inscription réussie. Veuillez vous connecter.')
        return redirect(url_for('login'))
    return render_template('register.html')

@app.route('/login', methods=['GET', 'POST'])
def login():
    """Render the login form and authenticate submitted credentials.

    A successful POST stores the user's id in the session and redirects to
    the job search; a failed one flashes an error and shows the form again.
    """
    if request.method != 'POST':
        return render_template('login.html')

    submitted_name = request.form['username']
    submitted_password = request.form['password']
    account = User.query.filter_by(username=submitted_name).first()

    # Same message whether the user is unknown or the password is wrong.
    if not account or not check_password_hash(account.password_hash, submitted_password):
        flash('Identifiants invalides.')
        return render_template('login.html')

    session['user_id'] = account.id
    return redirect(url_for('search_jobs'))

@app.route('/profile', methods=['GET', 'POST'])
def profile():
    """Display (GET) or save (POST) the logged-in user's job-search profile.

    Anonymous visitors are redirected to the login page. On POST the profile
    row is created if missing, filled from the form and committed.
    """
    if 'user_id' not in session:
        return redirect(url_for('login'))

    # Session.get replaces the legacy Query.get API.
    user = db.session.get(User, session['user_id'])
    if user is None:
        # The cookie refers to a user that no longer exists: drop the stale
        # session instead of failing on `user.profile` below.
        session.clear()
        return redirect(url_for('login'))

    if request.method == 'POST':
        if user.profile is None:
            user.profile = Profile(user_id=user.id)
        user.profile.job_title = request.form['job_title']
        user.profile.location = request.form['location']
        user.profile.experience = request.form['experience']
        user.profile.education = request.form['education']
        # Skills and languages are optional form fields.
        user.profile.skills = request.form.get('skills', '')
        user.profile.languages = request.form.get('languages', '')
        db.session.add(user.profile)
        db.session.commit()
        flash('Profil enregistré.')
    return render_template('profile.html', profile=user.profile)

def _extract_result_urls(html, limit=15):
    """Return up to ``limit`` distinct target URLs from a Google results page.

    Result links are matched as ``/url?q=<target>&...`` redirects. The target
    is read from the ``q`` query parameter, which also percent-decodes it so
    the URL can be requested directly.
    """
    from urllib.parse import parse_qs, urlsplit  # stdlib; local to keep the block self-contained

    soup = BeautifulSoup(html, 'html.parser')
    urls = []
    for link in soup.select('a[href^="/url?q="]'):
        targets = parse_qs(urlsplit(link['href']).query).get('q')
        if not targets:
            continue
        real_url = targets[0]
        if real_url.startswith('http') and real_url not in urls:
            urls.append(real_url)
        if len(urls) >= limit:
            break
    return urls


def _scrape_offer_fields(url):
    """Fetch ``url`` and extract one string per ROADMAP entry.

    For each roadmap entry the element matching its ``cssSelector`` is looked
    up: an ``<img>`` contributes its ``src`` attribute, any other element its
    stripped text. Missing selectors, elements or attributes give ``''`` so
    every value is a string.
    """
    page = requests.get(url, timeout=5, headers={'User-Agent': 'Mozilla/5.0'})
    soup_page = BeautifulSoup(page.text, 'html.parser')
    data = {}
    for cfg in ROADMAP:
        sel = cfg.get('cssSelector')
        el = soup_page.select_one(sel) if sel else None
        if el is None:
            value = ''
        elif el.name.lower() == 'img':
            # An <img> without `src` would otherwise store None and break
            # the later ' '.join over the values.
            value = el.get('src') or ''
        else:
            value = el.get_text(strip=True)
        data[cfg['name']] = value
    return data


@app.route('/search_jobs')
def search_jobs():
    """Search Google for offers matching the user's profile and classify them.

    Steps: build a query from the profile's job title and location, collect
    result URLs from Google, scrape each URL not yet stored for this user,
    run the model pipeline on the cleaned text, persist the offer (plus an
    alert for the company when the offer is predicted fraudulent) and render
    the new offers.

    Anonymous visitors are redirected to login; users without a profile are
    redirected to the profile page.
    """
    if 'user_id' not in session:
        return redirect(url_for('login'))
    # Session.get replaces the legacy Query.get API.
    user = db.session.get(User, session['user_id'])
    if user is None:
        # Stale cookie pointing at a user that no longer exists.
        session.clear()
        return redirect(url_for('login'))
    if not user.profile:
        flash('Veuillez d’abord créer votre profil.')
        return redirect(url_for('profile'))

    # Google query
    query = f"{user.profile.job_title} jobs in {user.profile.location}"
    logging.debug("[DEBUG] Search query: %s", query)
    # Fuller browser-like headers to reduce the chance of being blocked.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
        'Accept-Language': 'fr-FR,fr;q=0.9'
    }
    try:
        resp = requests.get(
            f"https://www.google.com/search?q={requests.utils.quote(query)}&num=15",
            headers=headers,
            timeout=10
        )
        resp.raise_for_status()
    except requests.RequestException:
        # Network failure or HTTP error from Google: report it to the user
        # instead of answering with a 500.
        logging.exception("[ERROR] Google search request failed")
        flash('La recherche a échoué, veuillez réessayer plus tard.')
        return render_template('job_results.html', offers=[])

    urls = _extract_result_urls(resp.text, limit=15)
    logging.debug("[DEBUG] Retrieved URLs (%d): %s", len(urls), urls)

    results = []
    for url in urls:
        logging.debug("[DEBUG] Processing URL: %s", url)
        if JobOffer.query.filter_by(user_id=user.id, url=url).first():
            logging.debug("[DEBUG] URL already processed: %s", url)
            continue
        try:
            data = _scrape_offer_fields(url)
            logging.debug("[DEBUG] Extracted data: %s", data)

            title = data.get('Element 3') or data.get('Element 1')
            if not title:
                logging.debug("[DEBUG] No title found, skipping offer")
                continue

            combined = ' '.join(data.values())
            txt = clean_text(combined)
            df = pd.DataFrame([{'combined_text': txt}])
            # Column 1 of predict_proba is the probability of class index 1.
            proba = float(model_pipeline.predict_proba(df)[0][1])
            pred = model_pipeline.predict(df)[0]
            label = 'FRAUDULEUSE' if pred == 1 else 'LÉGITIME'
            logging.debug(f"[DEBUG] Prediction: {label} at {proba*100:.2f}%")

            probability = round(proba * 100, 2)
            job = JobOffer(user_id=user.id, url=url, data=data, label=label, probability=probability)
            db.session.add(job)
            # Record one alert per (user, company) for offers predicted as class 1.
            company = data.get('Element 2')
            if pred == 1 and company:
                if not Alert.query.filter_by(user_id=user.id, company_name=company).first():
                    db.session.add(Alert(user_id=user.id, company_name=company))
            results.append({**data, 'url': url, 'label': label, 'probability': probability})
        except Exception:
            # Best effort per URL: one broken page must not abort the whole
            # search, but keep the traceback in the log.
            logging.exception("[ERROR] Failed processing %s", url)
            continue

    if not results:
        flash('Aucune offre trouvée, élargissez votre recherche.')
    db.session.commit()
    return render_template('job_results.html', offers=results)

@app.route('/alerts')
def alerts():
    """List the alerts recorded for the logged-in user.

    Anonymous visitors are redirected to the login page, like the other
    protected routes. If the session refers to a user that no longer exists,
    an empty list is rendered.
    """
    if 'user_id' not in session:
        return redirect(url_for('login'))
    # Session.get replaces the legacy Query.get API.
    user = db.session.get(User, session['user_id'])
    return render_template('alerts.html', alerts=user.alerts if user else [])

@app.route('/logout')
def logout():
    """Drop everything stored in the session and return to the login page."""
    session.clear()
    login_url = url_for('login')
    return redirect(login_url)

if __name__ == '__main__':
    # Create any missing tables before serving; create_all needs an app context.
    with app.app_context():
        db.create_all()
    # Debug mode enables the interactive Werkzeug debugger, which can execute
    # arbitrary code. It stays on by default (previous behavior) but can be
    # switched off with FLASK_DEBUG=0/false/no/off.
    debug_enabled = os.environ.get('FLASK_DEBUG', '1').strip().lower() not in ('0', 'false', 'no', 'off')
    # NOTE(review): the reloader is disabled, presumably because the app is
    # started from a notebook cell — confirm before changing.
    app.run(debug=debug_enabled, use_reloader=False)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


 * Serving Flask app '__main__'
 * Debug mode: on


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
  user = User.query.get(session['user_id'])
DEBUG:root:[DEBUG] Search query: Data Scientist jobs in France
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.google.com:443
DEBUG:urllib3.connectionpool:https://www.google.com:443 "GET /search?q=Data%20Scientist%20jobs%20in%20France&num=15 HTTP/11" 200 None
DEBUG:root:[DEBUG] Retrieved URLs (0): []
INFO:werkzeug:127.0.0.1 - - [14/May/2025 21:16:15] "GET /search_jobs HTTP/1.1" 200 -
  user = User.query.get(session['user_id'])
INFO:werkzeug:127.0.0.1 - - [14/May/2025 21:16:16] "GET /profile HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [14/May/2025 21:16:18] "POST /profile HT