In [25]:
# Import the libraries used in this notebook
import pandas as pd
import numpy as np
import re

# Load the dataset from the CSV file and print the whole frame
data = pd.read_csv('personnes.csv')
print(data)

   prenom                                   email date_naissance  \
0   Leila                       leila@example.com     23/01/1990   
1  Samuel                  samuel_329@example.com     20/09/2001   
2   Radia                choupipoune@supermail.eu  12 sept. 1984   
3    Marc  marco23@example.com, mc23@supermail.eu     10/02/1978   
4    Heri                 helloworld@supermail.eu     05/03/2008   
5   Hanna                  hanna2019@supermail.eu     01/01/1970   
6  samuël                  samuel_329@example.com            NaN   

            pays taille  
0         France  1.49m  
1            NaN  1.67m  
2  Côte d'ivoire  153cm  
3         France  1.65m  
4     Madagascar  1.34m  
5             24  3.45m  
6          Bénin  1.45m  


## 1 - Valeurs manquantes

In [2]:
# Count the missing values in each column
missing_per_column = data.isna().sum()
print(missing_per_column)

prenom            0
email             0
date_naissance    1
pays              1
taille            0
dtype: int64


In [3]:
# Boolean view of the frame: True marks a missing cell
missing_cells = data.isna()
print(missing_cells)

   prenom  email  date_naissance   pays  taille
0   False  False           False  False   False
1   False  False           False   True   False
2   False  False           False  False   False
3   False  False           False  False   False
4   False  False           False  False   False
5   False  False           False  False   False
6   False  False            True  False   False


## 2 - Valeurs extrêmes

## 3 - Doublons 

In [5]:
# Show every row whose e-mail address appears more than once
is_shared_email = data['email'].duplicated(keep=False)
data.loc[is_shared_email, :]

Unnamed: 0,prenom,email,date_naissance,pays,taille
1,Samuel,samuel_329@example.com,20/09/2001,,1.67m
6,samuël,samuel_329@example.com,,Bénin,1.45m


In [6]:
# Flag every occurrence of a repeated e-mail, the first one included
email_col = data['email']
email_col.duplicated(keep=False)

0    False
1     True
2    False
3    False
4    False
5    False
6     True
Name: email, dtype: bool

In [7]:
# Flag repeated e-mails but leave the first occurrence unflagged
data['email'].duplicated(keep='first')

0    False
1    False
2    False
3    False
4    False
5    False
6     True
Name: email, dtype: bool

In [9]:
# List all rows that share their e-mail address with another row
shared_email_rows = data['email'].duplicated(keep=False)
data.loc[shared_email_rows]

Unnamed: 0,prenom,email,date_naissance,pays,taille
1,Samuel,samuel_329@example.com,20/09/2001,,1.67m
6,samuël,samuel_329@example.com,,Bénin,1.45m


## Modifier une variable (Colonne)

## 4 - Erreurs lexicales

In [26]:
# Whitelist of accepted country names; any other value in 'pays' is treated
# as a lexical error.
VALID_COUNTRIES = [
    'France',
    "Côte d'ivoire",
    'Madagascar',
    'Bénin',
    'Allemagne',
    'USA',
]

# Flag every row whose country is not in the whitelist. Missing values are
# not in the list either, so they are flagged too and simply stay missing.
mask = ~data['pays'].isin(VALID_COUNTRIES)

# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises
# AttributeError there.
data.loc[mask, 'pays'] = np.nan

In [27]:
# Display the frame after the invalid country values were replaced by NaN
data

Unnamed: 0,prenom,email,date_naissance,pays,taille
0,Leila,leila@example.com,23/01/1990,France,1.49m
1,Samuel,samuel_329@example.com,20/09/2001,,1.67m
2,Radia,choupipoune@supermail.eu,12 sept. 1984,Côte d'ivoire,153cm
3,Marc,"marco23@example.com, mc23@supermail.eu",10/02/1978,France,1.65m
4,Heri,helloworld@supermail.eu,05/03/2008,Madagascar,1.34m
5,Hanna,hanna2019@supermail.eu,01/01/1970,,3.45m
6,samuël,samuel_329@example.com,,Bénin,1.45m


## 5 - Erreurs de formatage

In [28]:
# Inspect the raw e-mail column before cleaning it
data['email'] 

0                         leila@example.com
1                    samuel_329@example.com
2                  choupipoune@supermail.eu
3    marco23@example.com, mc23@supermail.eu
4                   helloworld@supermail.eu
5                    hanna2019@supermail.eu
6                    samuel_329@example.com
Name: email, dtype: object

In [29]:
# Break each e-mail entry into a list of pieces at every comma
data['email'].str.split(pat=',')

0                          [leila@example.com]
1                     [samuel_329@example.com]
2                   [choupipoune@supermail.eu]
3    [marco23@example.com,  mc23@supermail.eu]
4                    [helloworld@supermail.eu]
5                     [hanna2019@supermail.eu]
6                     [samuel_329@example.com]
Name: email, dtype: object

In [30]:
# Split once at the first comma into separate columns, then preview
# column 0, i.e. the text before the first comma
email_parts = data['email'].str.split(',', n=1, expand=True)
email_parts[0]

0           leila@example.com
1      samuel_329@example.com
2    choupipoune@supermail.eu
3         marco23@example.com
4     helloworld@supermail.eu
5      hanna2019@supermail.eu
6      samuel_329@example.com
Name: 0, dtype: object

In [31]:
# Keep only the first address of each entry: split once at the first comma
# and store the left-hand part back into the column
first_email = data['email'].str.split(',', n=1, expand=True)[0]
data['email'] = first_email

In [32]:
# Display the frame to check that each row now holds a single e-mail address
data

Unnamed: 0,prenom,email,date_naissance,pays,taille
0,Leila,leila@example.com,23/01/1990,France,1.49m
1,Samuel,samuel_329@example.com,20/09/2001,,1.67m
2,Radia,choupipoune@supermail.eu,12 sept. 1984,Côte d'ivoire,153cm
3,Marc,marco23@example.com,10/02/1978,France,1.65m
4,Heri,helloworld@supermail.eu,05/03/2008,Madagascar,1.34m
5,Hanna,hanna2019@supermail.eu,01/01/1970,,3.45m
6,samuël,samuel_329@example.com,,Bénin,1.45m


## 6 - Erreur d'irrégularité

In [34]:
# Drop the last character of each height (the trailing 'm' unit) and convert
# the remainder to a number. Entries that still contain non-numeric text
# after losing one character (e.g. '153cm' -> '153c') become NaN because of
# errors='coerce'.
height_text = data['taille'].str.slice(stop=-1)
data['taille'] = pd.to_numeric(height_text, errors='coerce')

In [35]:
# Inspect the heights after the numeric conversion
data['taille'] 

0    1.49
1    1.67
2     NaN
3    1.65
4    1.34
5    3.45
6    1.45
Name: taille, dtype: float64

In [36]:
# Replace missing heights with the mean of the available heights
# (Series.mean skips NaN by default).
# NOTE(review): the mean is taken over every non-missing value, including
# any extreme ones - confirm this is intended.
mean_height = data['taille'].mean()
data['taille'] = data['taille'].fillna(mean_height)

In [37]:
# Inspect the heights after filling the missing value with the column mean
data['taille']

0    1.490000
1    1.670000
2    1.841667
3    1.650000
4    1.340000
5    3.450000
6    1.450000
Name: taille, dtype: float64

## 7 - Erreurs de formatage

In [38]:
# Inspect the raw birth dates before parsing them
data['date_naissance']

0       23/01/1990
1       20/09/2001
2    12 sept. 1984
3       10/02/1978
4       05/03/2008
5       01/01/1970
6              NaN
Name: date_naissance, dtype: object

In [39]:
# Parse birth dates written as day/month/year. Because of errors='coerce',
# any entry that does not match this format (e.g. '12 sept. 1984') or is
# missing becomes NaT.
birth_dates = pd.to_datetime(
    data['date_naissance'],
    format='%d/%m/%Y',
    errors='coerce',
)
data['date_naissance'] = birth_dates

In [40]:
# Inspect the parsed birth dates; NaT marks a missing or unparseable date
data['date_naissance']

0   1990-01-23
1   2001-09-20
2          NaT
3   1978-02-10
4   2008-03-05
5   1970-01-01
6          NaT
Name: date_naissance, dtype: datetime64[ns]