# Traiter les valeurs manquantes

In [1]:
import pandas as pd
titanic_survival = pd.read_csv("titanic_survival.csv")
titanic_survival.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### Trouver les valeurs manquantes

In [2]:
#pandas.isnull()

In [3]:
sex = titanic_survival['sex']
sex_is_null = pd.isnull(sex)
print(sex_is_null)

0       False
1       False
2       False
3       False
4       False
        ...  
1305    False
1306    False
1307    False
1308    False
1309     True
Name: sex, Length: 1310, dtype: bool


In [4]:
sex_null=sex[sex_is_null]
print(sex_null)

1309    NaN
Name: sex, dtype: object


In [5]:
age_is_null = pd.isnull(titanic_survival['age'])
val_age_manquantes = (titanic_survival['age'])[age_is_null]
val_age_manquantes

15     NaN
37     NaN
40     NaN
46     NaN
59     NaN
        ..
1297   NaN
1302   NaN
1303   NaN
1305   NaN
1309   NaN
Name: age, Length: 264, dtype: float64

In [6]:
len((titanic_survival['age'])[age_is_null])

264

### Probleme avec les valeur manquantes 

In [7]:
## On ne peut pas calculer la moyenne quand on a des valeurs manquantes 
mean_age = sum(titanic_survival['age'])/len(titanic_survival['age'])
mean_age

nan

In [8]:
## le vrai calcule de la moyenne de age :
age = titanic_survival['age']
age_filtre = []
age_is_null = pd.isnull(titanic_survival['age'])
for i in range(0,len(age_is_null)):
    if(age_is_null[i]== False):
        age_filtre.append(age[i])
        
print(sum(age_filtre)/len(age_filtre))
#mean_age = sum(vrai_val_age)/len(vrai_val_age)

29.8811345124283


Méthode plus rapide :

In [9]:
age_is_null = pd.isnull(titanic_survival['age'])
goode_ages = titanic_survival['age'][age_is_null==False]
print(sum(goode_ages)/len(goode_ages))

29.8811345124283


### Moyen plus simple pour calculer une moyenne : 

In [10]:
#Series.mean()
mean_age=titanic_survival['age'].mean()
mean_age
#Dans ce cas pas besoin de supprimer les valeur manquantes 

29.8811345124283

In [11]:
mean_fare=titanic_survival['fare'].mean()
mean_fare   

33.29547928134572

### Calculer des statistiques de prix 

In [12]:
#Ma solution :
fares_by_class = {}
passenger_classes=[1,2,3]
for i in passenger_classes:
    pclass_lignes =  titanic_survival[titanic_survival['pclass']== i]
    pclass_fares =  pclass_lignes['fare'] 
    pclass_fares_mean = pclass_fares.mean()
    fares_by_class[i]= pclass_fares_mean
    
            
print(fares_by_class)
            
 

{1: 87.50899164086687, 2: 21.1791963898917, 3: 13.302888700564957}


In [13]:
 #Solution prof
fares_by_class = {}
passenger_classes=[1,2,3]
for this_class in passenger_classes:
    pclass_rows = titanic_survival[titanic_survival['pclass']== this_class]
    pclass_fares = pclass_rows['fare']
    fare_for_class = pclass_fares.mean()
    fares_by_class[this_class]= fare_for_class
    
print(fares_by_class)


{1: 87.50899164086687, 2: 21.1791963898917, 3: 13.302888700564957}


### Introduction sur le Pivot de Table

In [14]:
# dataFrame.pivot_table()

In [15]:
import numpy
passenger_class_fares = titanic_survival.pivot_table(index="pclass", values="fare",aggfunc=numpy.mean)
#index => quelle  colonne on souhaite grouper
#values=> la colonne sur laquelle nous appliqons le calcule 
#aggfunc => précise le calcule qu'on souhaite appliquer (par defaut mean)

In [16]:
print(passenger_class_fares)

             fare
pclass           
1.0     87.508992
2.0     21.179196
3.0     13.302889


  Exercice :

calculer la moyenne d'age pour chaque classe : 

In [17]:
passenger_class_ages = titanic_survival.pivot_table(index="pclass", values="age",aggfunc=numpy.mean)
print(passenger_class_ages)

              age
pclass           
1.0     39.159918
2.0     29.506705
3.0     24.816367


In [18]:
passenger_class_survived = titanic_survival.pivot_table(index="pclass", values="survived",aggfunc=numpy.mean)
print(passenger_class_survived)

        survived
pclass          
1.0     0.619195
2.0     0.429603
3.0     0.255289


### Table Pivot Niveau 2

In [19]:
port_stats = titanic_survival.pivot_table(index="embarked", values=["fare","survived"] ,aggfunc=numpy.sum)
print(port_stats)

                fare  survived
embarked                      
C         16830.7922     150.0
Q          1526.3085      44.0
S         25033.3862     304.0


### Eliminer les valeurs manquantes 

In [20]:
#DataFrame.dropna(axis = 0 ou axis='index') 

In [21]:
drop_na_rows = titanic_survival.dropna(axis=0)
##supprimer toute lignes presantant au moins une valeur manquante 
drop_na_rows 

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest


In [22]:
drop_na_columns = titanic_survival.dropna(axis=1)
drop_na_columns
#exclut toute les colonnes de titanic_survival qui ont des valeurs nulls

0
1
2
3
4
...
1305
1306
1307
1308
1309


In [23]:
#dropna[axis=.., subset=["name"]]

In [24]:
titanic_survival.dropna(axis=0, subset=["name"] )

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3.0,0.0,"Zabour, Miss. Hileni",female,14.5000,1.0,0.0,2665,14.4542,,C,,328.0,
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,


In [25]:
titanic_survival.dropna(axis= 0, subset=["age" , "sex"] )

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1301,3.0,0.0,"Youseff, Mr. Gerious",male,45.5000,0.0,0.0,2628,7.2250,,C,,312.0,
1304,3.0,0.0,"Zabour, Miss. Hileni",female,14.5000,1.0,0.0,2665,14.4542,,C,,328.0,
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,


In [26]:
print(titanic_survival.shape)

(1310, 14)


### iloc pour accéder à des lignes

In [27]:
#DataFrame.loc[] => selectionne les lignes

In [28]:
new_titanic_survival = titanic_survival.sort_values("age", inplace=False, ascending=False)
new_titanic_survival

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
14,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0.0,0.0,27042,30.0000,A23,S,B,,"Hessle, Yorks"
61,1.0,1.0,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1.0,0.0,19877,78.8500,C46,S,6,,"Little Onn Hall, Staffs"
1235,3.0,0.0,"Svensson, Mr. Johan",male,74.0,0.0,0.0,347060,7.7750,,S,,,
135,1.0,0.0,"Goldschmidt, Mr. George B",male,71.0,0.0,0.0,PC 17754,34.6542,A5,C,,,"New York, NY"
9,1.0,0.0,"Artagaveytia, Mr. Ramon",male,71.0,0.0,0.0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,3.0,0.0,"Wiseman, Mr. Phillippe",male,,0.0,0.0,A/4. 34244,7.2500,,S,,,
1302,3.0,0.0,"Yousif, Mr. Wazli",male,,0.0,0.0,2647,7.2250,,C,,,
1303,3.0,0.0,"Yousseff, Mr. Gerious",male,,0.0,0.0,2627,14.4583,,C,,,
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,


In [29]:
new_titanic_survival.loc[0]
## ce n'est pas "Barkworth, Mr. Algernon Henry Wilson"

pclass                                   1
survived                                 1
name         Allen, Miss. Elisabeth Walton
sex                                 female
age                                     29
sibsp                                    0
parch                                    0
ticket                               24160
fare                               211.338
cabin                                   B5
embarked                                 S
boat                                     2
body                                   NaN
home.dest                     St Louis, MO
Name: 0, dtype: object

In [30]:
new_titanic_survival.iloc[0]

pclass                                          1
survived                                        1
name         Barkworth, Mr. Algernon Henry Wilson
sex                                          male
age                                            80
sibsp                                           0
parch                                           0
ticket                                      27042
fare                                           30
cabin                                         A23
embarked                                        S
boat                                            B
body                                          NaN
home.dest                           Hessle, Yorks
Name: 14, dtype: object

In [31]:
new_titanic_survival.iloc[0:5]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
14,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0.0,0.0,27042,30.0,A23,S,B,,"Hessle, Yorks"
61,1.0,1.0,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1.0,0.0,19877,78.85,C46,S,6,,"Little Onn Hall, Staffs"
1235,3.0,0.0,"Svensson, Mr. Johan",male,74.0,0.0,0.0,347060,7.775,,S,,,
135,1.0,0.0,"Goldschmidt, Mr. George B",male,71.0,0.0,0.0,PC 17754,34.6542,A5,C,,,"New York, NY"
9,1.0,0.0,"Artagaveytia, Mr. Ramon",male,71.0,0.0,0.0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


Exercice :

In [32]:
first_ten_rows = new_titanic_survival.iloc[0:5]
first_ten_rows

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
14,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0.0,0.0,27042,30.0,A23,S,B,,"Hessle, Yorks"
61,1.0,1.0,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1.0,0.0,19877,78.85,C46,S,6,,"Little Onn Hall, Staffs"
1235,3.0,0.0,"Svensson, Mr. Johan",male,74.0,0.0,0.0,347060,7.775,,S,,,
135,1.0,0.0,"Goldschmidt, Mr. George B",male,71.0,0.0,0.0,PC 17754,34.6542,A5,C,,,"New York, NY"
9,1.0,0.0,"Artagaveytia, Mr. Ramon",male,71.0,0.0,0.0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"


In [33]:
row_position_fifth = first_ten_rows.iloc[4]
row_position_fifth

pclass                             1
survived                           0
name         Artagaveytia, Mr. Ramon
sex                             male
age                               71
sibsp                              0
parch                              0
ticket                      PC 17609
fare                         49.5042
cabin                            NaN
embarked                           C
boat                             NaN
body                              22
home.dest        Montevideo, Uruguay
Name: 9, dtype: object

In [34]:
row_index_25= new_titanic_survival.iloc[24]
row_index_25

pclass                               2
survived                             0
name         Myles, Mr. Thomas Francis
sex                               male
age                                 62
sibsp                                0
parch                                0
ticket                          240276
fare                            9.6875
cabin                              NaN
embarked                             Q
boat                               NaN
body                               NaN
home.dest                Cambridge, MA
Name: 511, dtype: object

### Les index de colonne

In [35]:
new_titanic_survival.iloc[0,0]

1.0

In [36]:
new_titanic_survival.iloc[:,0:3]

Unnamed: 0,pclass,survived,name
14,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson"
61,1.0,1.0,"Cavendish, Mrs. Tyrell William (Julia Florence..."
1235,3.0,0.0,"Svensson, Mr. Johan"
135,1.0,0.0,"Goldschmidt, Mr. George B"
9,1.0,0.0,"Artagaveytia, Mr. Ramon"
...,...,...,...
1297,3.0,0.0,"Wiseman, Mr. Phillippe"
1302,3.0,0.0,"Yousif, Mr. Wazli"
1303,3.0,0.0,"Yousseff, Mr. Gerious"
1305,3.0,0.0,"Zabour, Miss. Thamine"


In [37]:
new_titanic_survival.loc[83,"age"]

64.0

In [38]:
new_titanic_survival.loc[766,"pclass"]

3.0

In [39]:
##Remarque: pas de "chaine de caractere " dans les crochets de loc 

Exercice

In [40]:
row_index_1100_age = new_titanic_survival.loc[1100,"age"]
row_25_survived = new_titanic_survival.loc[25,"survived"]
five_rows_three_cols = new_titanic_survival.iloc[0:5,0:3]
#iloc[0:5,0:3] 5 et 3 ne sont pas inclus 
 

In [41]:
 five_rows_three_cols

Unnamed: 0,pclass,survived,name
14,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson"
61,1.0,1.0,"Cavendish, Mrs. Tyrell William (Julia Florence..."
1235,3.0,0.0,"Svensson, Mr. Johan"
135,1.0,0.0,"Goldschmidt, Mr. George B"
9,1.0,0.0,"Artagaveytia, Mr. Ramon"


### Réindexer les lignes d'un DataFrame

In [42]:
new_titanic_survival

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
14,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0.0,0.0,27042,30.0000,A23,S,B,,"Hessle, Yorks"
61,1.0,1.0,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1.0,0.0,19877,78.8500,C46,S,6,,"Little Onn Hall, Staffs"
1235,3.0,0.0,"Svensson, Mr. Johan",male,74.0,0.0,0.0,347060,7.7750,,S,,,
135,1.0,0.0,"Goldschmidt, Mr. George B",male,71.0,0.0,0.0,PC 17754,34.6542,A5,C,,,"New York, NY"
9,1.0,0.0,"Artagaveytia, Mr. Ramon",male,71.0,0.0,0.0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,3.0,0.0,"Wiseman, Mr. Phillippe",male,,0.0,0.0,A/4. 34244,7.2500,,S,,,
1302,3.0,0.0,"Yousif, Mr. Wazli",male,,0.0,0.0,2647,7.2250,,C,,,
1303,3.0,0.0,"Yousseff, Mr. Gerious",male,,0.0,0.0,2627,14.4583,,C,,,
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,


In [43]:
#Dataframe.reset_index(drop = true)

In [54]:
new_titanic_survival.reset_index()

Unnamed: 0,index,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,14,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0.0,0.0,27042,30.0000,A23,S,B,,"Hessle, Yorks"
1,61,1.0,1.0,"Cavendish, Mrs. Tyrell William (Julia Florence...",female,76.0,1.0,0.0,19877,78.8500,C46,S,6,,"Little Onn Hall, Staffs"
2,1235,3.0,0.0,"Svensson, Mr. Johan",male,74.0,0.0,0.0,347060,7.7750,,S,,,
3,135,1.0,0.0,"Goldschmidt, Mr. George B",male,71.0,0.0,0.0,PC 17754,34.6542,A5,C,,,"New York, NY"
4,9,1.0,0.0,"Artagaveytia, Mr. Ramon",male,71.0,0.0,0.0,PC 17609,49.5042,,C,,22.0,"Montevideo, Uruguay"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,1297,3.0,0.0,"Wiseman, Mr. Phillippe",male,,0.0,0.0,A/4. 34244,7.2500,,S,,,
1306,1302,3.0,0.0,"Yousif, Mr. Wazli",male,,0.0,0.0,2647,7.2250,,C,,,
1307,1303,3.0,0.0,"Yousseff, Mr. Gerious",male,,0.0,0.0,2627,14.4583,,C,,,
1308,1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,


In [60]:
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
## drop=True => on supprime la colonne index 
# drop=False=> Ajoute la colonne d'index remplacée aux données si False

In [61]:
titanic_reindexed.iloc[0:5,0:3]

Unnamed: 0,pclass,survived,name
0,1.0,1.0,"Barkworth, Mr. Algernon Henry Wilson"
1,1.0,1.0,"Cavendish, Mrs. Tyrell William (Julia Florence..."
2,3.0,0.0,"Svensson, Mr. Johan"
3,1.0,0.0,"Goldschmidt, Mr. George B"
4,1.0,0.0,"Artagaveytia, Mr. Ramon"


### Appliquer des fonctions sur un DataFrame

In [62]:
#DataFrame.apply() par defaut applique la fonction sur chaque colonne 
#de notre data frame 

In [63]:
#soit une fonction qui retourne le 100e element 
def row_100(column):
    #extraire le 100e element d'une colonne
    item = column.iloc[99]
    return item

In [68]:
#retourne le 100e de chaque colonne 
row_100_var = titanic_survival.apply(row_100)
row_100_var

pclass                                                       1
survived                                                     1
name         Duff Gordon, Lady. (Lucille Christiana Sutherl...
sex                                                     female
age                                                         48
sibsp                                                        1
parch                                                        0
ticket                                                   11755
fare                                                      39.6
cabin                                                      A16
embarked                                                     C
boat                                                         1
body                                                       NaN
home.dest                                       London / Paris
dtype: object

Exercice

Une fonction qui calcule le nombre d'elements manquants : 

In [85]:
def nbr_element_manquant(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)

In [86]:
titanic_survival.apply(nbr_element_manquant)

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64

### Appliquer une fonction à une ligne

In [87]:
#DataFrame.apply(function, axis=1)

In [90]:
def is_minor(row):
    if row["age"] < 18:
        return True
    else:
        return False