In [1]:
%matplotlib inline
import pandas as pd
from scipy import stats
import seaborn as sns

import numpy as np
from collections import Counter

In [2]:
data=pd.read_csv("data/clean.csv")
data.head()

Unnamed: 0,sexe,date_naiss,age,sit_fam,job_ok,nbre_enf,nature,design,ann_dipl,ordre,...,division_af,service_af,direction_af,date_ps,dat_retraite,poste_act,job_act,dt_echeance,tot_afct,statut_act
0,MASCULIN,26/07/1974,44.0,MARIE(E),0,1,DIPLOME,CERTIFICAT D'ETUDES PRIMAIRES ELEMENTAIRES,,1,...,,SOC,DSE,,2034.0,22TP,ECOREUR,2019.0,3,EN FONCTION
1,MASCULIN,11/06/1979,39.0,MARIE(E),1,2,CERTIFICAT,GRAPHISTE ET MAQUETTISTE,,6,...,,SOC,DSE,,2039.0,22TP,CHEF DE SITE INFORMATIQUE,2013.0,1,EN FONCTION
2,MASCULIN,11/06/1979,39.0,MARIE(E),1,2,DIPLOME,DIPLOME D'INGENIEUR EN GENIE ELECTRONIQUE MANAGER,2008.0,1,...,,SOC,DSE,,2039.0,22TP,CHEF DE SITE INFORMATIQUE,2013.0,1,EN FONCTION
3,MASCULIN,11/06/1979,39.0,MARIE(E),1,2,DIPLOME,DIPLOME DE TECHNICIEN SUPERIEUR EN ELECTRONIQUE,2000.0,2,...,,SOC,DSE,,2039.0,22TP,CHEF DE SITE INFORMATIQUE,2013.0,1,EN FONCTION
4,MASCULIN,11/06/1979,39.0,MARIE(E),1,2,CERTIFICAT,ADMINISTRATION RESEAUX-AUF JREWS,,2,...,,SOC,DSE,,2039.0,22TP,CHEF DE SITE INFORMATIQUE,2013.0,1,EN FONCTION


In [3]:
df=data.copy()

In [4]:
numeric = []
qualitative= []

for variable in df.columns.tolist():
    if df.dtypes[variable] in ['float64','int64']:
        numeric.append(variable)
    else :
        qualitative.append(variable)

##### Test de normalité

In [5]:
def shapiro_test(x):
    try :
        res = stats.shapiro(x)
    except :
        return -1
    alpha = 0.05

    print("p = ",res.pvalue)

    if res.pvalue < alpha:  # null hypothesis: x comes from a normal distribution

        print("(shapiro)The null hypothesis can be rejected -> X ne possède pas une distribution normale")

    else:

        print("(shapiro) The null hypothesis cannot be rejected -> X possède éventuellement une distribution normale")

    return res

def omnibus_normaltest(x):
    #test D'Agostino-Pearson
    try :
        k2, p = stats.normaltest(x)
    except :
        return -1

    alpha = 0.05

    print("p = ",p)

    if p < alpha:  # null hypothesis: x comes from a normal distribution

        print("(normaltest) The null hypothesis can be rejected -> X ne possède pas une distribution normale")

    else:

        print("(normaltest) The null hypothesis cannot be rejected -> X possède éventuellement une distribution normale")

    return [k2,p]

## valeur missing

In [6]:
def missing(df):
    df_null = pd.DataFrame([df.isnull().sum(),round(100*df.isnull().sum()/ len(df), 2), df.dtypes]).transpose().reset_index()
    df_null.columns = ["variable", "valeur_NA", "Pourcentage_NA", "type"]
    df_null = df_null[df_null.valeur_NA != 0].sort_values("valeur_NA",ascending = False).reset_index(drop = True)
    return df_null
missing(df)

Unnamed: 0,variable,valeur_NA,Pourcentage_NA,type
0,division_af,14744,95.49,object
1,fonction_af,10368,67.15,object
2,ann_dipl,8727,56.52,float64
3,date_ps,8524,55.2,float64
4,direction_af,8299,53.75,object
5,motif_affect,7925,51.32,object
6,service_af,6910,44.75,object
7,job_act,3040,19.69,object
8,dt_echeance,2958,19.16,float64
9,sit_fam,320,2.07,object


In [7]:
list(data.columns[data.isnull().sum() / len(data) > 0.75])

['division_af']

In [8]:
d=data.copy()
df = d.drop('division_af',axis=1)

In [9]:
d=list(data.columns[data.isnull().sum() / len(data) < 0.05])
for i in list(d):
    df = df.dropna(subset=[i], axis=0)
missing(df)

Unnamed: 0,variable,valeur_NA,Pourcentage_NA,type
0,fonction_af,10060,67.15,object
1,ann_dipl,8394,56.03,float64
2,date_ps,8380,55.93,float64
3,direction_af,8194,54.69,object
4,motif_affect,7666,51.17,object
5,service_af,6794,45.35,object
6,job_act,2965,19.79,object
7,dt_echeance,2927,19.54,float64


In [10]:
df['fonction_af'] = df['fonction_af'].fillna(df['fonction_af'].mode()[0])
df['direction_af'] = df['direction_af'].fillna(df['direction_af'].mode()[0])
df['motif_affect'] = df['motif_affect'].fillna(df['motif_affect'].mode()[0])
df['service_af'] = df['service_af'].fillna(df['service_af'].mode()[0])
df['job_act'] = df['job_act'].fillna(df['job_act'].mode()[0])

d=list(df.columns[df.isnull().sum() / len(df) < 0.05])
for i in list(d):
    df = df.dropna(subset=[i], axis=0)

from sklearn.impute import KNNImputer
df[numeric]=KNNImputer(missing_values=np.nan, n_neighbors=3).fit_transform(df[numeric])
df[numeric] = np.round(df[numeric],decimals = 0)


In [11]:
missing(df)

Unnamed: 0,variable,valeur_NA,Pourcentage_NA,type


## Valeur aberrante

In [12]:
# Outlier detection

def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []

    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1

        # outlier step
        outlier_step = 1.5 * IQR

        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index

        # append the found outlier indices for col to the list of outlier indices
        outlier_indices.extend(outlier_list_col)

    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )

    return multiple_outliers

Outliers_to_drop = detect_outliers(df,1,numeric)
df.drop(Outliers_to_drop,0,inplace=True)
len(Outliers_to_drop)

  df.drop(Outliers_to_drop,0,inplace=True)


1656

## Teste de normalite

In [13]:
# Shapiro-Wilk Test
from scipy.stats import shapiro
def Shapiro(d):
    stat, p = shapiro(d)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    # interpret
    alpha = 0.05
    if p > alpha:
        print('Sample looks Gaussian (fail to reject H0)')
    else:
        print('Sample does not look Gaussian (reject H0)')

for i in numeric:
    print(i)
    Shapiro(df[i])

age
Statistics=0.972, p=0.000
Sample does not look Gaussian (reject H0)
job_ok
Statistics=0.096, p=0.000
Sample does not look Gaussian (reject H0)
nbre_enf
Statistics=0.886, p=0.000
Sample does not look Gaussian (reject H0)
ann_dipl
Statistics=0.083, p=0.000
Sample does not look Gaussian (reject H0)
ordre
Statistics=0.859, p=0.000
Sample does not look Gaussian (reject H0)
annee_ent
Statistics=0.918, p=0.000
Sample does not look Gaussian (reject H0)
date_ent
Statistics=0.918, p=0.000
Sample does not look Gaussian (reject H0)
date_ps
Statistics=0.909, p=0.000
Sample does not look Gaussian (reject H0)
dat_retraite
Statistics=0.972, p=0.000
Sample does not look Gaussian (reject H0)
dt_echeance
Statistics=0.791, p=0.000
Sample does not look Gaussian (reject H0)
tot_afct
Statistics=0.931, p=0.000
Sample does not look Gaussian (reject H0)




In [14]:
from scipy.stats import normaltest
def agostino_test(d):
    stat, p = normaltest(d)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    # interpret
    alpha = 0.05
    if p > alpha:
        print('Sample looks Gaussian (fail to reject H0)')
    else:
        print('Sample does not look Gaussian (reject H0)')

for i in numeric:
    agostino_test(df[i])

Statistics=588.311, p=0.000
Sample does not look Gaussian (reject H0)
Statistics=17929.545, p=0.000
Sample does not look Gaussian (reject H0)
Statistics=2508.015, p=0.000
Sample does not look Gaussian (reject H0)
Statistics=34215.770, p=0.000
Sample does not look Gaussian (reject H0)
Statistics=2940.494, p=0.000
Sample does not look Gaussian (reject H0)
Statistics=441.773, p=0.000
Sample does not look Gaussian (reject H0)
Statistics=441.894, p=0.000
Sample does not look Gaussian (reject H0)
Statistics=2140.178, p=0.000
Sample does not look Gaussian (reject H0)
Statistics=604.876, p=0.000
Sample does not look Gaussian (reject H0)
Statistics=4205.323, p=0.000
Sample does not look Gaussian (reject H0)
Statistics=2685.937, p=0.000
Sample does not look Gaussian (reject H0)


In [15]:
p_values = {}
for col in numeric:
    print("\n",col)
    Shapiro(np.log(df[col].dropna()))


 age
Statistics=0.987, p=0.000
Sample does not look Gaussian (reject H0)

 job_ok
Statistics=nan, p=1.000
Sample looks Gaussian (fail to reject H0)

 nbre_enf
Statistics=nan, p=1.000
Sample looks Gaussian (fail to reject H0)

 ann_dipl
Statistics=0.020, p=0.000
Sample does not look Gaussian (reject H0)

 ordre
Statistics=0.926, p=0.000
Sample does not look Gaussian (reject H0)

 annee_ent
Statistics=0.874, p=0.000
Sample does not look Gaussian (reject H0)

 date_ent
Statistics=0.874, p=0.000
Sample does not look Gaussian (reject H0)

 date_ps
Statistics=0.800, p=0.000
Sample does not look Gaussian (reject H0)

 dat_retraite
Statistics=0.957, p=0.000
Sample does not look Gaussian (reject H0)

 dt_echeance
Statistics=0.666, p=0.000
Sample does not look Gaussian (reject H0)

 tot_afct
Statistics=nan, p=1.000
Sample looks Gaussian (fail to reject H0)


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [16]:
len(df[qualitative[4]].unique())

1282

In [17]:
formation=['ECONOMIE CIRCULAIRE ET DEVELOPPEMENT DURABLE','CFEPENN','2EME REUNION DES EXPERTS DE','AGOA','FORMATION','SCANNER','RAPPROCHEMENT DES DONNEES COMPTABLES','RENFORCEMENT','SEMINAIRE','LUTTE','SURVEILLANCE','ATELIER','TECHNIQUE D\'ENQUETE','SCANNER','DACTYLOGRAPHIE','FOOD SAFETY INSPECTION TECHNOLOGY','COMMUNICATION INTERPERSONNELLE','ACCUEIL DES USAGERS','SURETE AEROPORTUAIRE','ATELIER SUR LES NEGOCIATIONS SUR LA ZLET TRIPARTITE','VALEUR TRANSACTIONNELLE','COMPTABILITE','BAE','MECANICIEN','CONTROLE A POSTERIORI','OPERATEUR IMAGE SCANNER','TABLEAU DE BORD','CORRESPONDANCE ADMINISTRATIVE','ATTACHE D\'ADMINISTRATION','REGLES D\'ORIGINE','SECRETARIAT','TEAM MANAGEMENT','DEVELOPPEMENT PERSONNEL','GESTION AXEE SUR LES RESULTATS','SECRETARIAT','MISE A NIVEAU','MECANIQUE AUTOMOBILE','MANAGEMENT ET LEADERSHIP','BUREAUTIQUE','LEADERSHIP','FORMATION']
MAITRISE=['DUES','4E ANNEE','BACC+4','BACC+5','SECOND CYCLE',"DIPLOME D'ETUDES PROFESSIONNELLES APPROFONDIES",'4EME ANNEE','MASTER','MAITRISE','ETHIQUE ET DEONTOLOGIE','DEA','MAGISTERE','INGENIORAT','INGENIEUR','2ND CYCLE','BACCALAUREAT+4']
LICENCE =['3EME ANNEE','BACC+3','LICENCE']
BREVET=['BREVET','CFEPCES','PREBAC','PRE BAC','BEPC']
PRIMAIRE=['CAE/EB','CEPE','CERTIFICAT D\'ETUDES PRIMAIRES ELEMENTAIRES']
DOUANE=['SYDONIA++','PROCEDURE DE DEDOUANEMENT','DOUAN','TRANSIT']
LANGUES=['B1-B2','ANGLAIS','FRANCAIS','DELF','ENGLISH','MTCP']
INFORMATIQUE=['ANALYSE D\'IMAGE','ATTESTATION','ELECTRONIQUE','ELECTROTECHNIQUE','PROGRAMMATION','DEVELOPPEUR','SOFTWARE','BASE DE DONNEE','ORACLE','EXCEL','WEB','LINUX','GRAPHISTE','RESEAU','INFORMATIQUE','PROGRAMMEUR']
PERMIS=['PERMIS','MAINTENANCE AUTOMOBILE','CHAUFFEUR','MECANIQUE AUTO','CONDUCTEUR']
BACC=['BACC','BACALAUREAT']
SUPERIEUR=['BT','BATIMENT','COMPTABLE','AUDIT INTERNE','DROIT','GESTION','DROIT / GEOGRAPHIE','MANAGEMENT','DIPLOME','INFA','IST-T','COMMUNICATION','SUPERIEUR','UNIVERSITAIRE','COACHING','CHEF','BANCAIRES','INTERNATIONALES','ETUDES APPROFONDIES','SERVICE NATIONAL HORS FORCES ARMEES','RADIOPROTECTION','ETUDES JUDICIAIRES','CONTROLE A LA CIRCULATION','MARCHE PUBLIC','ENVIRONNEMENT','OZONE','SURETE AEROPORTUAIRE ET FRAUDE DOCUMENTAIRE','SUBSTANCES APPAUVRISSANT LA COUCHE D\'OZONE','COMMERCE INTERNATIONAL','CITES','CONTROLE DES SUBSTANCES APPAUVRISSANT LA COUCHE D\'OZONE','DIPLOME D\'ATTACHE D\'ADMINISTRATION']
DTS=['2E ANNEE','DEUG','2EME ANNEE','BACC+2','DTS']
Autres=['FORMATION','MAITRISE','LICENCE','BREVET','PRIMAIRE','DOUANE','LANGUES','INFORMATIQUE','PERMIS','BACC','SUPERIEUR','DTS']
dat = list(df[qualitative[4]])
for index, value in enumerate(dat):
    for i in DTS:
        if i in str(value):
            dat[index]='DTS'
    for i in BACC:
        if i in str(value):
            dat[index]='BACC'
    for i in SUPERIEUR:
        if i in str(value):
            dat[index]='SUPERIEUR'
    for i in PERMIS:
        if i in str(value):
            dat[index]='PERMIS'
    for i in INFORMATIQUE:
        if i in str(value):
            dat[index]='INFORMATIQUE'
    for i in LANGUES:
        if i in str(value):
            dat[index]='LANGUES'
    for i in DOUANE:
        if i in str(value):
            dat[index]='DOUANE'
    for i in PRIMAIRE:
        if i in str(value):
            dat[index]='PRIMAIRE'
    for i in BREVET:
        if i in str(value):
            dat[index]='BREVET'
    for i in LICENCE:
        if i in str(value):
            dat[index]='LICENCE'
    for i in formation:
        if str(i) in str(value):
            dat[index]='FORMATION'
    for i in MAITRISE:
        if i in str(value):
            dat[index]='MAITRISE'
    if 'CERTIFICAT' in str(value):
        dat[index]='CERTIFICAT'
    if 'DOCTORAT' in str(value):
        dat[index]='DOCTORAT'

df[qualitative[4]] =  dat
d = list(df[qualitative[4]])
for index, value in enumerate(d):
    if  str(value) not in Autres:
        d[index]='Autres'
df[qualitative[4]] =  d


In [18]:
b=[ 'date_naiss','mode_ent',
 'corps_ent',
 'poste_ent',
 'lieu_af',
 'motif_affect',
 'fonction_af',
 'service_af',
 'direction_af',
 'poste_act',
 'job_act',
 'design',
  'date_ent']
d=df.copy()
df = d.drop(b,axis=1)

In [19]:
categorical_cols =['sexe', 'sit_fam', 'nature', 'statut_act']
data=df.copy()
df=pd.get_dummies(data, columns=categorical_cols)

## log

In [20]:
numeric

['age',
 'job_ok',
 'nbre_enf',
 'ann_dipl',
 'ordre',
 'annee_ent',
 'date_ent',
 'date_ps',
 'dat_retraite',
 'dt_echeance',
 'tot_afct']

In [21]:
df['log_age']=np.log(df['age']+1)
df['log_job_ok']=np.log(df['job_ok']+1)
df['log_nbre_enf']=np.log(df['nbre_enf']+1)
df['log_ordre']=np.log(df['ordre']+1)
df['log_annee_ent']=np.log(df['annee_ent']+1)
df['log_date_ps']=np.log(df['date_ps']+1)
df['log_dat_retraite']=np.log(df['dat_retraite']+1)
df['log_dt_echeance']=np.log(df['dt_echeance']+1)
df['log_tot_afct']=np.log(df['tot_afct']+1)

## Standardisation

In [22]:
df['std_age'] = (df['age'] - np.mean(df['age']))/np.std(df['age'])
df['std_job_ok'] = (df['job_ok'] - np.mean(df['job_ok']))/np.std(df['job_ok'])
df['std_nbre_enf'] = (df['nbre_enf'] - np.mean(df['nbre_enf']))/np.std(df['nbre_enf'])
df['std_ordre'] = (df['ordre'] - np.mean(df['ordre']))/np.std(df['ordre'])
df['std_annee_ent'] = (df['annee_ent'] - np.mean(df['annee_ent']))/np.std(df['annee_ent'])
df['std_date_ps'] = (df['date_ps'] - np.mean(df['date_ps']))/np.std(df['date_ps'])
df['std_dat_retraite'] = (df['dat_retraite'] - np.mean(df['dat_retraite']))/np.std(df['dat_retraite'])
df['std_dt_echeance'] = (df['dt_echeance'] - np.mean(df['dt_echeance']))/np.std(df['dt_echeance'])
df['std_tot_afct'] = (df['tot_afct'] - np.mean(df['tot_afct']))/np.std(df['tot_afct'])

In [23]:
df.to_csv(r'data/preproce.csv',index=False)