In [7]:
# =================================================================
# 1. INSTALLATION ET IMPORTATIONS
# =================================================================
# !pip install pandas sklearn gradio openpyxl

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import gradio as gr

# =================================================================
# 2. CHARGEMENT ET NETTOYAGE DE LA BDD
# =================================================================
# Chargement du fichier
df = pd.read_excel('rh_data.xlsx')

# --- NETTOYAGE ---
# A. Suppression des doublons
df = df.drop_duplicates()

# B. Correction du texte (Espaces et Majuscules)
df['Salaire_Saisie'] = df['Salaire_Saisie'].str.strip().str.capitalize()

# C. Gestion des valeurs manquantes (remplacement par la moyenne/m√©diane)
df['Satisfaction'] = df['Satisfaction'].fillna(df['Satisfaction'].mean())
df['Heures_Mensuelles'] = df['Heures_Mensuelles'].fillna(df['Heures_Mensuelles'].median())

# D. Suppression des valeurs aberrantes (Outliers)
df = df[(df['Satisfaction'] >= 0) & (df['Satisfaction'] <= 1)]
df = df[df['Heures_Mensuelles'] <= 744] # Max heures dans un mois

# =================================================================
# 3. TRAITEMENT POUR LE MACHINE LEARNING (PREPROCESSING)
# =================================================================
# A. Encodage : Texte -> Chiffres
df['Salaire_Saisie'] = df['Salaire_Saisie'].map({'Bas': 0, 'Moyen': 1, 'Haut': 2})

# B. S√©paration Caract√©ristiques (X) et Cible (y)
X = df.drop('Demission', axis=1)
y = df['Demission']

# C. Division Entra√Ænement (80%) et Test (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# D. Normalisation (Scaling)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =================================================================
# 4. ENTRA√éNEMENT DU MOD√àLE (RANDOM FOREST)
# =================================================================
model_rh = RandomForestClassifier(n_estimators=100, random_state=42)
model_rh.fit(X_train_scaled, y_train)

# Evaluation rapide dans la console
y_pred = model_rh.predict(X_test_scaled)
print("\n--- RAPPORT DE PERFORMANCE ---")
print(classification_report(y_test, y_pred))



# =================================================================
# 5. INTERFACE UTILISATEUR (GRADIO)
# =================================================================
def diagnostic_employe(satisfaction, eval, projets, heures, anciennete, salaire):
    # 1. Conversion du salaire de l'interface
    salaire_map = {'Bas': 0, 'Moyen': 1, 'Haut': 2}
    s_num = salaire_map[salaire]
    
    # 2. Cr√©ation du tableau pour le mod√®le
    entree = np.array([[satisfaction, eval, projets, heures, anciennete, s_num]])
    
    # 3. Application du m√™me Scaling que l'entra√Ænement
    entree_scaled = scaler.transform(entree)
    
    # 4. Pr√©diction
    prediction = model_rh.predict(entree_scaled)[0]
    probabilite = model_rh.predict_proba(entree_scaled)[0][1]
    
    if prediction == 1:
        return f"‚ö†Ô∏è RISQUE DE D√âMISSION √âLEV√â : {probabilite*100:.1f}%"
    else:
        return f"‚úÖ EMPLOY√â STABLE : Risque de seulement {probabilite*100:.1f}%"

# Cr√©ation de l'interface visuelle
demo = gr.Interface(
    fn=diagnostic_employe,
    inputs=[
        gr.Slider(0, 1, value=0.5, label="Taux de Satisfaction (0 √† 1)"),
        gr.Slider(0, 1, value=0.5, label="Derni√®re √âvaluation (0 √† 1)"),
        gr.Number(value=3, label="Nombre de Projets"),
        gr.Number(value=160, label="Heures travaill√©es / mois"),
        gr.Number(value=2, label="Anciennet√© (Ann√©es)"),
        gr.Dropdown(['Bas', 'Moyen', 'Haut'], value='Moyen', label="Niveau de Salaire")
    ],
    outputs="text",
    title="üè¢ IA de R√©tention RH",
    description="Entrez les param√®tres d'un employ√© pour pr√©dire s'il va quitter l'entreprise."
)

# Lancer l'interface
demo.launch()


--- RAPPORT DE PERFORMANCE ---
              precision    recall  f1-score   support

           0       0.81      0.95      0.87        62
           1       0.89      0.63      0.74        38

    accuracy                           0.83       100
   macro avg       0.85      0.79      0.81       100
weighted avg       0.84      0.83      0.82       100

* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.






In [5]:
!pip install openpyxl



In [6]:
!pip install pandas sklearn gradio openpyxl

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  python setup.py egg_info did not run successfully.
  exit code: 1
  
  [15 lines of output]
  The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  rather than 'sklearn' for pip commands.
  
  Here is how to fix this error in the main use cases:
  - use 'pip install scikit-learn' rather than 'pip install sklearn'
  - replace 'sklearn' by 'scikit-learn' in your pip requirements files
    (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  - if the 'sklearn' package is used by one of your dependencies,
    it would be great if you take some time to track which package uses
    'sklearn' instead of 'scikit-learn' and report it to their issue tracker
  - as a last resort, set the environment variable
    SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
  
  More information is available at
  https://github.com/scikit-learn/sklearn-pypi-package
  [end of output]
  
  note: This error originates from a subpr

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. R√©cup√©rer l'importance des variables depuis le mod√®le entra√Æn√©
importances = model_rh.feature_importances_
labels = X.columns

# 2. Cr√©er un tableau pour faciliter l'affichage
feature_df = pd.DataFrame({'Crit√®re': labels, 'Importance': importances})
feature_df = feature_df.sort_values(by='Importance', ascending=False)

# 3. Affichage du graphique
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Crit√®re', data=feature_df, hue='Crit√®re', palette='magma', legend=False)

plt.title('Classement des crit√®res influen√ßant la d√©mission')
plt.xlabel('Niveau d\'importance (0 √† 1)')
plt.ylabel('Crit√®res')
plt.show()

# Affichage textuel des r√©sultats
print("Classement d√©taill√© :")
print(feature_df)

<Figure size 1000x600 with 1 Axes>

Classement d√©taill√© :
             Crit√®re  Importance
0       Satisfaction    0.484652
1      Derniere_Eval    0.174075
3  Heures_Mensuelles    0.155814
4  Anciennete_Annees    0.085554
2         Nb_Projets    0.062809
5     Salaire_Saisie    0.037097
