In [None]:
import pandas as pd
import numpy as np
import pickle # Pour sauvegarder le mod√®le

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

# ==========================================
# 1. LOADING & CLEANING
# ==========================================
# Baseline experiment: a TF-IDF + one-hot preprocessing stage feeding a
# RandomForestRegressor that predicts `salaire_avg` from job-offer fields.
csv_path = "C:/Users/Etu/Desktop/cleaned_dataset_domaine_info.csv"
df = pd.read_csv(csv_path)

print(f"üìä Donn√©es initiales : {len(df)} lignes")

# As a precaution, discard every row whose target (salary) is missing.
df = df.dropna(subset=['salaire_avg'])

# Replace missing text/categorical cells with "" so that the string
# concatenation below and the encoders never receive NaN.
text_cols = ['titre', 'description', 'competences', 'metier', 'experience', 'region']
df = df.assign(**{col: df[col].fillna("") for col in text_cols})

# Shuffle the rows with a fixed seed and rebuild a clean 0..n-1 index.
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)
print("‚úÖ Donn√©es m√©lang√©es al√©atoirement.")

# ==========================================
# 2. FEATURE ENGINEERING
# ==========================================
# The title is concatenated twice so that its tokens weigh more in TF-IDF.
title = df['titre']
df['text_features'] = title + " " + title + " " + df['description'] + " " + df['competences']

# Predictors (X) and target (y).
features = ['text_features', 'metier', 'experience', 'region']
X = df[features]
y = df['salaire_avg']

# ==========================================
# 3. MODEL PIPELINE
# ==========================================
# Text branch: unigrams + bigrams, vocabulary capped at 5000 terms,
# filtered with scikit-learn's built-in English stop-word list.
# NOTE(review): the literals in this notebook look French - confirm that an
# English stop-word list is what is wanted here.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))

# Categorical branch: one binary column per category value; values unseen
# during fit are encoded as all zeros instead of raising.
categorical = OneHotEncoder(handle_unknown='ignore')

# Route each column to its transformer.
preprocessor = ColumnTransformer([
    ('txt', tfidf, 'text_features'),
    ('cat', categorical, ['metier', 'experience', 'region']),
])

# Preprocessing followed by a 100-tree random forest using every CPU core.
regressor = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

# ==========================================
# 4. TRAINING
# ==========================================
# 80 % / 20 % train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f" Entra√Ænement sur {len(X_train)} offres... (Patientez)")
model.fit(X_train, y_train)

# ==========================================
# 5. VALIDATION & SCORES
# ==========================================
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

separator = "-" * 30
print(separator)
print(" R√âSULTATS DU MOD√àLE")
print(separator)
print(f" Erreur Moyenne (MAE) : {mae:.0f} ‚Ç¨ / an")
print(f"Pr√©cision (R¬≤)       : {r2:.3f}")

# 0.7 is the R2 threshold this notebook uses to call the model "good".
print("Le mod√®le est PERFORMANT !" if r2 > 0.7 else " Le mod√®le peut √™tre am√©lior√©.")

# ==========================================
# 6. SINGLE-OFFER PREDICTION (smoke test)
# ==========================================
print("\n Test R√©el :")
# One hand-written offer with the same four columns the pipeline was fitted on.
offre_test = pd.DataFrame([{
    'text_features': "D√©veloppeur Python Senior Django Flask API REST SQL",
    'metier': "D√©veloppeur Logiciel",
    'experience': "Senior (5+ ans)",
    'region': "√éle-de-France",
}])

pred = model.predict(offre_test)[0]
print("Offre : Dev Python Senior √† Paris")
print(f"Salaire estim√© : {pred:.0f} ‚Ç¨")

# ==========================================
# 7. MODEL PERSISTENCE
# ==========================================
# The whole fitted pipeline (preprocessing + forest) is pickled as one object.
with open("C:/Users/Etu/Desktop/salary_prediction_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

print("\nMod√®le sauvegard√© dans 'salary_prediction_model.pkl'")


In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score

# 1. LOADING
# Second experiment: same pipeline as the baseline, trained after removing
# salaries at or below 14 000 and with a 200-tree forest.
csv_path = "C:/Users/Etu/Desktop/cleaned_dataset_domaine_info.csv"
df = pd.read_csv(csv_path)

# Basic cleaning: rows without a target are dropped, missing text becomes "".
df = df.dropna(subset=['salaire_avg'])
text_cols = ['titre', 'description', 'competences', 'metier', 'experience', 'region']
df = df.assign(**{col: df[col].fillna("") for col in text_cols})

print(f"üìä Donn√©es avant filtre : {len(df)}")

# 2. SOFT FILTER
# Only the lower tail is removed (salary must be strictly above 14 000);
# there is deliberately no upper bound, so high salaries are kept.
salary_is_plausible = df['salaire_avg'].gt(14000)
df = df.loc[salary_is_plausible]

print(f" Donn√©es apr√®s filtre : {len(df)} (On a gard√© le maximum de mati√®re)")

# Seeded shuffle, then a fresh 0..n-1 index.
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)

# 3. FEATURES
# Title counted twice, followed by job family, skills and description.
title = df['titre']
df['text_features'] = (
    title + " " + title + " " + df['metier'] + " " + df['competences'] + " " + df['description']
)

X = df[['text_features', 'metier', 'experience', 'region']]
y = df['salaire_avg']

# 4. PIPELINE
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
)
categorical = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('txt', tfidf, 'text_features'),
    ('cat', categorical, ['metier', 'experience', 'region']),
])

# Random forest settings:
# - 200 trees, fixed seed, all CPU cores (n_jobs=-1);
# - `max_features` and `min_samples_leaf` are intentionally not passed, so the
#   library defaults apply.
regressor = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', regressor),
])

# 5. TRAINING (80 % / 20 % split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Entra√Ænement sur {len(X_train)} offres")
model.fit(X_train, y_train)

# 6. RESULTS on the held-out 20 %
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

separator = "-" * 30
print(separator)
print(f" MAE : {mae:.0f} ‚Ç¨")
print(f" R¬≤  : {r2:.4f}")
print(separator)

# Persist the fitted pipeline (preprocessing + forest) as a single pickle.
with open("C:/Users/Etu/Desktop/salary_prediction_model_final.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score


# 1. LOADING & CLEANING
# Third experiment: same features and preprocessing as the random-forest
# cells, with an XGBoost regressor as the final estimator.
csv_path = "C:/Users/Etu/Desktop/cleaned_dataset_domaine_info.csv"
df = pd.read_csv(csv_path)

# Keep only rows that have a salary strictly above 14 000.
df = df.dropna(subset=['salaire_avg'])
df = df.loc[df['salaire_avg'].gt(14000)]

# Missing text/categorical cells become empty strings.
text_cols = ['titre', 'description', 'competences', 'metier', 'experience', 'region']
df = df.assign(**{col: df[col].fillna("") for col in text_cols})

# Seeded shuffle, then a fresh 0..n-1 index.
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)

# 2. FEATURES
# Title counted twice, followed by job family, skills and description.
title = df['titre']
df['text_features'] = (
    title + " " + title + " " + df['metier'] + " " + df['competences'] + " " + df['description']
)

X = df[['text_features', 'metier', 'experience', 'region']]
y = df['salaire_avg']

# 3. XGBOOST PIPELINE
# TF-IDF on unigrams + bigrams, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))

# One-hot encoding; categories unseen at fit time are encoded as all zeros.
categorical = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer([
    ('txt', tfidf, 'text_features'),
    ('cat', categorical, ['metier', 'experience', 'region']),
])

# Booster hyper-parameters gathered in one place for readability.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**xgb_params)),
])

# 4. TRAINING (80 % / 20 % split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f" Entra√Ænement XGBoost sur {len(X_train)} offres...")
model.fit(X_train, y_train)

# 5. EVALUATION on the held-out 20 %
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(" RESULTATS XGBOOST")
print("-" * 30)
print(f" MAE : {mae:.0f} ‚Ç¨")
print(f" R¬≤  : {r2:.4f}")

# 6. PERSISTENCE: the fitted pipeline is pickled as a single object.
with open("C:/Users/Etu/Desktop/salary_model_xgboost2.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
print("\nMod√®le XGBoost sauvegard√©.")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Predicted-vs-actual scatter plot.
# Reads `y_test`, `y_pred`, `r2` and `mae`, which are not defined in this cell
# and must already exist in the kernel.
fig, ax = plt.subplots(figsize=(10, 7))

# One translucent marker per test offer: x = true salary, y = predicted salary.
ax.scatter(
    y_test, y_pred,
    color='#3b82f6', alpha=0.5, s=60, edgecolors='white',
    label='Pr√©dictions',
)

# Dashed red y = x diagonal spanning the overall min..max of both series;
# a perfect model would put every marker on it.
highest = max(max(y_pred), max(y_test))
lowest = min(min(y_pred), min(y_test))
diagonal = [highest, lowest]
ax.plot(diagonal, diagonal, 'r--', linewidth=2, label='Perfect (Id√©al)')

# Score summary shown in a box in the top-left corner.
texte_scores = "\n".join([
    "MODELE XGBOOST",
    "-----------------------",
    f"R¬≤ (regression metric) = {r2:.3f}",
    f"MAE (Erreur)   = {mae:.0f} ‚Ç¨",
])

# Axes-fraction coordinates keep the box in place whatever the data range is.
box_style = dict(boxstyle='round', facecolor='white', alpha=0.9, edgecolor='gray')
ax.text(
    0.05, 0.95, texte_scores,
    transform=ax.transAxes,
    fontsize=12,
    verticalalignment='top',
    bbox=box_style,
)

# Title, axis labels, legend and grid.
ax.set_title('Performance : Salaires R√©els vs Salaires Pr√©dits', fontsize=15, fontweight='bold')
ax.set_xlabel('Vrai Salaire Annuel (‚Ç¨)', fontsize=12)
ax.set_ylabel('Salaire Estim√© par le mod√®le (‚Ç¨)', fontsize=12)
ax.legend(loc='lower right')
ax.grid(True, linestyle='--', alpha=0.6)

# Render.
fig.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Learning curve: train on growing prefixes of the training set and score each
# fit on one fixed test set.
# Reads `X`, `y` and `model`, which are not defined in this cell and must
# already exist in the kernel (the pipeline cell has to be run first).

# 1. FIXED SPLIT (80 % / 20 %, fixed seed) so every step is scored on the same rows.
X_train_full, X_test_fixe, y_train_full, y_test_fixe = train_test_split(X, y, test_size=0.2, random_state=42)

train_sizes = np.linspace(0.1, 1.0, 6)  # 6 steps from 10 % to 100 % of the training rows
r2_scores = []
mae_scores = []
n_rows = []

# 2. PROGRESSIVE TRAINING LOOP
for fraction in train_sizes:
    # Number of training rows used at this step.
    size = int(len(X_train_full) * fraction)

    # First `size` rows of the training split.
    X_partial = X_train_full[:size]
    y_partial = y_train_full[:size]

    # Fit an unfitted copy of the pipeline so the shared `model` object keeps
    # whatever state the previous cells gave it.
    partial_model = clone(model)
    partial_model.fit(X_partial, y_partial)

    # Always score on the same fixed test set so the steps are comparable.
    y_pred_partial = partial_model.predict(X_test_fixe)

    # Loop-local names: the notebook-level `r2` / `mae` computed by earlier
    # cells are left untouched.
    r2_partial = r2_score(y_test_fixe, y_pred_partial)
    mae_partial = mean_absolute_error(y_test_fixe, y_pred_partial)

    r2_scores.append(r2_partial)
    mae_scores.append(mae_partial)
    n_rows.append(size)

    print(f" Entrainer sur {size} lignes -> R¬≤={r2_partial:.3f} | MAE={mae_partial:.0f}‚Ç¨")

# 3. VISUALISATION (two y-axes sharing one x-axis)
fig, ax1 = plt.subplots(figsize=(12, 6))

# Left y-axis: R2.
color = 'tab:blue'
ax1.set_xlabel('Nombre de lignes utilis√©es pour l\'entra√Ænement', fontsize=12)
ax1.set_ylabel('regresion metric ($R^2$)', color=color, fontsize=12)
r2_lines = ax1.plot(n_rows, r2_scores, marker='o', color=color, linewidth=3, label='$R^2$ (Pr√©cision)')
ax1.tick_params(axis='y', labelcolor=color)
ax1.grid(True, linestyle='--', alpha=0.5)

# Right y-axis: MAE.
ax2 = ax1.twinx()
color = 'tab:red'
ax2.set_ylabel('Mean error (MAE en ‚Ç¨)', color=color, fontsize=12)
mae_lines = ax2.plot(n_rows, mae_scores, marker='s', color=color, linewidth=3, linestyle='--', label='MAE Erreur')
ax2.tick_params(axis='y', labelcolor=color)

# The two curves live on different axes, so their handles are merged into a
# single legend (without this call the `label=` values are never displayed).
curve_handles = r2_lines + mae_lines
ax1.legend(curve_handles, [handle.get_label() for handle in curve_handles], loc='center right')

# Title and layout.
plt.title("Courbe D'apprentisage", fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

In [None]:
import pickle

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, r2_score

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable from one run to the next.
tf.keras.utils.set_random_seed(42)


# 1. LOADING & CLEANING
csv_path = "C:/Users/Etu/Desktop/cleaned_dataset_domaine_info.csv"
df = pd.read_csv(csv_path)

# Keep only rows that have a salary strictly above 14 000.
df = df.dropna(subset=['salaire_avg'])
df = df[df['salaire_avg'] > 14000]

# Missing text/categorical cells become empty strings.
text_cols = ['titre', 'description', 'competences', 'metier', 'experience', 'region']
for col in text_cols:
    df[col] = df[col].fillna("")

# Seeded shuffle, then a fresh 0..n-1 index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


# 2. FEATURES (X and y)
# Title counted twice, followed by job family, skills and description.
df['text_features'] = (
    df['titre'] + " " + df['titre'] + " " +
    df['metier'] + " " +
    df['competences'] + " " +
    df['description']
)

X = df[['text_features', 'metier', 'experience', 'region']]
y = df['salaire_avg'].values  # .values -> plain NumPy array for Keras


# 3. SPLIT, THEN VECTORISE
# The split happens BEFORE the preprocessor is fitted: the TF-IDF vocabulary,
# IDF weights and one-hot categories are learned from the training rows only,
# so nothing from the test rows leaks into the features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF on unigrams + bigrams, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))
# One-hot encoding; categories unseen at fit time are encoded as all zeros.
categorical = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('txt', tfidf, 'text_features'),
        ('cat', categorical, ['metier', 'experience', 'region'])
    ],
    verbose_feature_names_out=False
)

print(" Transformation des donn√©es pour le r√©seau de neurones...")
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# The ColumnTransformer may return a SciPy sparse matrix; convert to dense
# arrays before handing the data to Keras.
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()
if hasattr(X_test, "toarray"):
    X_test = X_test.toarray()

input_dim = X_train.shape[1]
print(f"üìä Entr√©es du r√©seau : {input_dim} neurones (Mots + Cat√©gories)")


# 4. "LIGHT" ARCHITECTURE
# Input -> Dense(64) -> Dropout(0.4) -> Dense(32) -> Dropout(0.2) -> Dense(1).
# The input shape is declared with an explicit Input object rather than the
# deprecated `input_dim=` argument on the first Dense layer.
model = Sequential([
    tf.keras.Input(shape=(input_dim,)),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='linear'),  # single linear unit: the predicted salary
])

# Training loss is MAE, the same metric that is reported at the end.
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_absolute_error')


# 5. TRAINING
# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

print("D√©marrage du Deep Learning...")
# Validation data is carved out of the TRAINING rows (last 20 %). The test set
# is never seen during training or early stopping, so the final score below is
# an honest held-out estimate.
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=100,       # upper bound; early stopping usually ends sooner
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)


# 6. FINAL EVALUATION on the untouched test set
y_pred = model.predict(X_test).flatten()  # (n, 1) -> (n,)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n" + "="*30)
print(" R√âSULTATS DEEP LEARNING (TENSORFLOW)")
print("="*30)
print(f" MAE : {mae:.0f} ‚Ç¨")
print(f" R¬≤  : {r2:.4f}")


# 7. PERSISTENCE
# The network is saved in the native .keras format.
model.save("C:/Users/Etu/Desktop/salary_model_deeplearning.keras")
print("\nüíæ Mod√®le Deep Learning sauvegard√© dans 'salary_model_deeplearning.keras'")

# The fitted preprocessor (TF-IDF + one-hot) must be saved too: it is needed
# to turn new offers into the feature vector the network expects.
with open("C:/Users/Etu/Desktop/preprocessor_dl.pkl", "wb") as f:
    pickle.dump(preprocessor, f)
print("Pr√©processeur sauvegard√© dans 'preprocessor_dl.pkl'")

In [None]:
import matplotlib.pyplot as plt

# Training-vs-validation loss curves.
# Reads `history`, which is not defined in this cell and must already exist in
# the kernel (it is the object returned by the Keras `fit` call).
history_dict = history.history

loss = history_dict['loss']
val_loss = history_dict['val_loss']
# Epochs are numbered from 1 on the x-axis.
epochs = range(1, len(loss) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
# Blue dots: training loss; red line: validation loss.
ax.plot(epochs, loss, 'bo', label='Perte Entra√Ænement')
ax.plot(epochs, val_loss, 'r', label='Perte Validation')
ax.set_title('Preuve de la convergence (Training vs Validation)')
ax.set_xlabel('√âpoques')
ax.set_ylabel('Erreur (MAE)')
ax.legend()
plt.show()