In [1]:
#Bibliotheques fondamentales
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Bibliotheques de machine learning
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder

#Algorithmes de classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

#Metriques evaluation
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix,classification_report,recall_score, f1_score, roc_auc_score, auc

#Sauvegarde
import joblib
import os



In [2]:
# Load the candidate dataset from its CSV file (relative path, so it is
# resolved against the kernel's current working directory).
csv_path = "candidats_recrutement.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows (5 is DataFrame.head's default, spelled out here).
df.head(n=5)

Unnamed: 0,ID,Nom,Age,Experience,Diplome,Score_Technique,Score_Entretien,Competences,Statut
0,1,Fatou Tchou,48,27.9,Licence,100.0,24.4,Docker;ML,0
1,2,Mohamed Tchou,34,7.9,DUT,52.2,35.3,Angular,0
2,3,Fatou Momo,27,2.0,Doctorat,59.7,71.0,React,1
3,4,Mohamed Mbarga,40,21.2,BTS,81.3,76.0,React;Data Science;Spring;C++,0
4,5,Amina Momo,38,17.5,Licence,91.7,48.6,Docker;SQL,1


In [3]:
# Report the dataset's dimensions: unpack (rows, columns) from df.shape once
# rather than indexing the tuple twice.
n_rows, n_cols = df.shape
print("\n=== DIMENSIONS DU DATASET ===")
print(f"Nombre de lignes : {n_rows}")
print(f"Nombre de colonnes : {n_cols}")


=== DIMENSIONS DU DATASET ===
Nombre de lignes : 1000
Nombre de colonnes : 9


In [7]:
# General information and descriptive statistics.

# DataFrame.info() writes its report directly to stdout and returns None, so
# it is called bare: wrapping it in print() appends a stray "None" line.
print("=== INFORMATIONS GÉNÉRALES ===")
df.info()

# include="all" makes describe() summarise every column, not only the numeric
# ones. As the last expression of the cell, the result is rendered as output.
print("\n=== DESCRIPTION STATISTIQUE ===")
df.describe(include="all")

=== INFORMATIONS GÉNÉRALES ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               1000 non-null   int64  
 1   Nom              1000 non-null   object 
 2   Age              1000 non-null   int64  
 3   Experience       1000 non-null   float64
 4   Diplome          1000 non-null   object 
 5   Score_Technique  1000 non-null   float64
 6   Score_Entretien  1000 non-null   float64
 7   Competences      1000 non-null   object 
 8   Statut           1000 non-null   int64  
dtypes: float64(3), int64(3), object(3)
memory usage: 70.4+ KB
None

=== DESCRIPTION STATISTIQUE ===


Unnamed: 0,ID,Nom,Age,Experience,Diplome,Score_Technique,Score_Entretien,Competences,Statut
count,1000.0,1000,1000.0,1000.0,1000,1000.0,1000.0,1000,1000.0
unique,,120,,,5,,,574,
top,,Fatou Ngann,,,Licence,,,C++,
freq,,17,,,407,,,29,
mean,500.5,,37.398,16.7238,,79.4435,50.9725,,0.603
std,288.819436,,10.389588,10.583559,,19.766133,14.175716,,0.489521
min,1.0,,20.0,0.0,,25.2,10.3,,0.0
25%,250.75,,28.0,7.2,,64.2,41.1,,0.0
50%,500.5,,38.0,17.1,,83.6,51.3,,1.0
75%,750.25,,47.0,25.9,,100.0,60.6,,1.0


In [9]:
# Count the missing values in each column. isna() is the same operation as
# isnull() (pandas documents one as an alias of the other).
print("=== VALEURS MANQUANTES PAR COLONNE ===")
missing_per_column = df.isna().sum()
missing_per_column

=== VALEURS MANQUANTES PAR COLONNE ===


ID                 0
Nom                0
Age                0
Experience         0
Diplome            0
Score_Technique    0
Score_Entretien    0
Competences        0
Statut             0
dtype: int64

In [12]:
# Identify the categorical columns (those stored with the `object` dtype) and,
# for each one, print how many distinct values it holds plus a preview of at
# most five of them.
print("=== COLONNES CATÉGORIELLES ===")
categorical_columns = df.select_dtypes(include="object").columns
print("Colonnes catégorielles :")
for col in categorical_columns:
    unique_vals = df[col].unique()
    n_unique, preview = len(unique_vals), unique_vals[:5]
    print(f"  - {col} : {n_unique} valeurs uniques → {preview}")


=== COLONNES CATÉGORIELLES ===
Colonnes catégorielles :
  - Nom : 120 valeurs uniques → ['Fatou Tchou' 'Mohamed Tchou' 'Fatou Momo' 'Mohamed Mbarga' 'Amina Momo']
  - Diplome : 5 valeurs uniques → ['Licence' 'DUT' 'Doctorat' 'BTS' 'Master']
  - Competences : 574 valeurs uniques → ['Docker;ML' 'Angular' 'React' 'React;Data Science;Spring;C++'
 'Docker;SQL']


In [14]:
# Normalise letter case so that values such as "Master" and "master" are not
# treated as two different categories.
df['Diplome'] = df['Diplome'].str.lower()
df['Competences'] = df['Competences'].str.lower()


def _count_competences(raw):
    """Return the number of skills listed in one `Competences` cell.

    Skills are separated by ';'. A missing cell counts as 0. Blank fragments
    (an empty string, trailing or doubled separators, whitespace only) are
    ignored, because ``''.split(';')`` yields ``['']`` and would otherwise be
    counted as one skill.
    """
    if pd.isna(raw):
        return 0
    return sum(1 for skill in str(raw).split(';') if skill.strip())


# New feature column: how many skills each candidate lists.
df['Nb_Competences'] = df['Competences'].apply(_count_competences)

# Show the first five rows to check the result.
df[['Competences', 'Nb_Competences']].head()

Unnamed: 0,Competences,Nb_Competences
0,docker;ml,2
1,angular,1
2,react,1
3,react;data science;spring;c++,4
4,docker;sql,2
