# Méthodes de création de MultiIndex

In [1]:
import pandas as pd
import numpy as np

In [2]:
df1 = pd.DataFrame(np.random.rand(5,3),
                  index = [['A','A','A', 'B', 'B'], ['a1','a2','a3','b1','b2']],
                  columns = ['data1', 'data2', 'data3'])

In [3]:
df1

Unnamed: 0,Unnamed: 1,data1,data2,data3
A,a1,0.098412,0.92074,0.140058
A,a2,0.408318,0.571721,0.817417
A,a3,0.293296,0.97092,0.34449
B,b1,0.302856,0.312861,0.349908
B,b2,0.20448,0.827291,0.764828


In [4]:
groupe = ['A','A','A','B','B']
sous_groupe = ['a1','a2','a3','b1','b2']
couples = list(zip(groupe, sous_groupe))

In [5]:
couples

[('A', 'a1'), ('A', 'a2'), ('A', 'a3'), ('B', 'b1'), ('B', 'b2')]

In [6]:
couple_index = pd.MultiIndex.from_tuples(couples)

In [7]:
couple_index

MultiIndex([('A', 'a1'),
            ('A', 'a2'),
            ('A', 'a3'),
            ('B', 'b1'),
            ('B', 'b2')],
           )

In [8]:
df2 = pd.DataFrame(np.random.rand(5,3), couple_index, ['data1', 'data2', 'data3'])

In [9]:
df2

Unnamed: 0,Unnamed: 1,data1,data2,data3
A,a1,0.221677,0.805324,0.761171
A,a2,0.914111,0.58123,0.257671
A,a3,0.626013,0.243228,0.101027
B,b1,0.382557,0.812313,0.763641
B,b2,0.434851,0.488703,0.141115


In [10]:
df2.loc['A']

Unnamed: 0,data1,data2,data3
a1,0.221677,0.805324,0.761171
a2,0.914111,0.58123,0.257671
a3,0.626013,0.243228,0.101027


In [11]:
df2.loc['A'].loc['a1']

data1    0.221677
data2    0.805324
data3    0.761171
Name: a1, dtype: float64

In [12]:
df2.loc['A','a1']

data1    0.221677
data2    0.805324
data3    0.761171
Name: (A, a1), dtype: float64

In [13]:
df2.loc['A','a3']['data3']

0.10102665488446982

In [14]:
df2

Unnamed: 0,Unnamed: 1,data1,data2,data3
A,a1,0.221677,0.805324,0.761171
A,a2,0.914111,0.58123,0.257671
A,a3,0.626013,0.243228,0.101027
B,b1,0.382557,0.812313,0.763641
B,b2,0.434851,0.488703,0.141115


In [15]:
df2.index.names = ['Niveau1', 'Niveau2']

In [16]:
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2,data3
Niveau1,Niveau2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,a1,0.221677,0.805324,0.761171
A,a2,0.914111,0.58123,0.257671
A,a3,0.626013,0.243228,0.101027
B,b1,0.382557,0.812313,0.763641
B,b2,0.434851,0.488703,0.141115


# Exemple 2 de création de MultiIndex et accès aux données d'un dataframe multiindexé: 

In [17]:
data = {'Ville':['Paris','Paris','Paris', 'Marseille','Marseille','Marseille', 
                 'Lyon','Lyon','Lyon', 'Toulouse','Toulouse','Toulouse'],
        'Annee':[2018,2013,2008,2018,2013,2008,2018,2013,2008,2018,2013,2008],
        'Population':[2175601,229621,221197,868277,855393,851420,518635,500715,474946,486828,458298,439553]
       }

In [18]:
df = pd.DataFrame(data, columns = ['Ville', 'Annee', 'Population'])

In [19]:
df

Unnamed: 0,Ville,Annee,Population
0,Paris,2018,2175601
1,Paris,2013,229621
2,Paris,2008,221197
3,Marseille,2018,868277
4,Marseille,2013,855393
5,Marseille,2008,851420
6,Lyon,2018,518635
7,Lyon,2013,500715
8,Lyon,2008,474946
9,Toulouse,2018,486828


In [20]:
df.index

RangeIndex(start=0, stop=12, step=1)

In [21]:
df.set_index(['Ville','Annee'], inplace = True)

In [22]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Population
Ville,Annee,Unnamed: 2_level_1
Paris,2018,2175601
Paris,2013,229621
Paris,2008,221197
Marseille,2018,868277
Marseille,2013,855393
Marseille,2008,851420
Lyon,2018,518635
Lyon,2013,500715
Lyon,2008,474946
Toulouse,2018,486828


In [23]:
df.loc['Marseille', 2018]

Population    868277
Name: (Marseille, 2018), dtype: int64

In [24]:
df.loc['Paris']

Unnamed: 0_level_0,Population
Annee,Unnamed: 1_level_1
2018,2175601
2013,229621
2008,221197


In [25]:
df.loc[[('Paris',2018),('Marseille',2008)]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Population
Ville,Annee,Unnamed: 2_level_1
Paris,2018,2175601
Marseille,2008,851420


In [26]:
df.xs(key = 2018, level = 'Annee', drop_level = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Population
Ville,Annee,Unnamed: 2_level_1
Paris,2018,2175601
Marseille,2018,868277
Lyon,2018,518635
Toulouse,2018,486828


# Combinaison de dataframes

In [27]:
data1 = {'Paris':[75000,2175601,321,392,177,181], 
         'Marseille': [13000,868277,219,299,99,54],
         'Lyon':[69000,518635,78,90,26,17]}

In [28]:
df1 = pd.DataFrame(data1, index = ['code postal', 'population', 'ecole maternelle', 'ecole elementaire', 'college', 'lycee'])

In [29]:
df1

Unnamed: 0,Paris,Marseille,Lyon
code postal,75000,13000,69000
population,2175601,868277,518635
ecole maternelle,321,219,78
ecole elementaire,392,299,90
college,177,99,26
lycee,181,54,17


In [30]:
data2 = {'Avignon':[84000,92130,26,23037], 'Bordeaux': [33000,249212,59,75974]}
df2 = pd.DataFrame(data2, index = ['code postal', 'population','ecole maternelle', 'nombre eleves' ])

In [31]:
df2

Unnamed: 0,Avignon,Bordeaux
code postal,84000,33000
population,92130,249212
ecole maternelle,26,59
nombre eleves,23037,75974


In [32]:
df1_df2 = pd.concat([df1,df2])

In [33]:
df1_df2

Unnamed: 0,Paris,Marseille,Lyon,Avignon,Bordeaux
code postal,75000.0,13000.0,69000.0,,
population,2175601.0,868277.0,518635.0,,
ecole maternelle,321.0,219.0,78.0,,
ecole elementaire,392.0,299.0,90.0,,
college,177.0,99.0,26.0,,
lycee,181.0,54.0,17.0,,
code postal,,,,84000.0,33000.0
population,,,,92130.0,249212.0
ecole maternelle,,,,26.0,59.0
nombre eleves,,,,23037.0,75974.0


In [34]:
df1_df2 = pd.concat([df1,df2], axis = 1)

In [35]:
df1_df2

Unnamed: 0,Paris,Marseille,Lyon,Avignon,Bordeaux
code postal,75000.0,13000.0,69000.0,84000.0,33000.0
population,2175601.0,868277.0,518635.0,92130.0,249212.0
ecole maternelle,321.0,219.0,78.0,26.0,59.0
ecole elementaire,392.0,299.0,90.0,,
college,177.0,99.0,26.0,,
lycee,181.0,54.0,17.0,,
nombre eleves,,,,23037.0,75974.0


In [36]:
df1_df2 = pd.concat([df1,df2], axis = 1, join = 'inner')

In [37]:
df1_df2

Unnamed: 0,Paris,Marseille,Lyon,Avignon,Bordeaux
code postal,75000,13000,69000,84000,33000
population,2175601,868277,518635,92130,249212
ecole maternelle,321,219,78,26,59


In [38]:
data_exemple = {'cle':['A','B','C','A','B','C'], 'data': range(6)}

In [39]:
df1 = pd.DataFrame(data_exemple)

In [40]:
df1

Unnamed: 0,cle,data
0,A,0
1,B,1
2,C,2
3,A,3
4,B,4
5,C,5


In [41]:
df1.groupby('cle')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000018086CDF5B0>

In [42]:
group_par_cle = df1.groupby('cle')

In [43]:
group_par_cle.sum()

Unnamed: 0_level_0,data
cle,Unnamed: 1_level_1
A,3
B,5
C,7


In [44]:
group_par_cle.mean()

Unnamed: 0_level_0,data
cle,Unnamed: 1_level_1
A,1.5
B,2.5
C,3.5


In [45]:
data = {'Nom': ['Christophe','Clara','Lina','Amine','Amandine','Christophe','Clara','Lina','Amine','Amandine'],
    'Matieres':['Maths','Maths','Maths','Maths','Maths','Physique','Physique','Physique','Physique','Physique'],
        'Notes':[17,13,12,15,9,10,5,8,13,16]
       }

In [46]:
df = pd.DataFrame(data)

In [47]:
df

Unnamed: 0,Nom,Matieres,Notes
0,Christophe,Maths,17
1,Clara,Maths,13
2,Lina,Maths,12
3,Amine,Maths,15
4,Amandine,Maths,9
5,Christophe,Physique,10
6,Clara,Physique,5
7,Lina,Physique,8
8,Amine,Physique,13
9,Amandine,Physique,16


In [48]:
groupby_nom = df.groupby('Nom')

In [49]:
groupby_nom.mean()

Unnamed: 0_level_0,Notes
Nom,Unnamed: 1_level_1
Amandine,12.5
Amine,14.0
Christophe,13.5
Clara,9.0
Lina,10.0


In [50]:
groupby_nom.sum()

Unnamed: 0_level_0,Notes
Nom,Unnamed: 1_level_1
Amandine,25
Amine,28
Christophe,27
Clara,18
Lina,20


In [51]:
groupby_nom.std()

Unnamed: 0_level_0,Notes
Nom,Unnamed: 1_level_1
Amandine,4.949747
Amine,1.414214
Christophe,4.949747
Clara,5.656854
Lina,2.828427


In [52]:
groupby_nom.mean().loc['Amine']

Notes    14.0
Name: Amine, dtype: float64

In [53]:
groupby_nom.std().loc['Clara']

Notes    5.656854
Name: Clara, dtype: float64

In [54]:
df.groupby('Nom').mean().loc['Amandine']

Notes    12.5
Name: Amandine, dtype: float64

In [55]:
df.groupby('Nom').count()

Unnamed: 0_level_0,Matieres,Notes
Nom,Unnamed: 1_level_1,Unnamed: 2_level_1
Amandine,2,2
Amine,2,2
Christophe,2,2
Clara,2,2
Lina,2,2


In [56]:
df.groupby('Nom').max()

Unnamed: 0_level_0,Matieres,Notes
Nom,Unnamed: 1_level_1,Unnamed: 2_level_1
Amandine,Physique,16
Amine,Physique,15
Christophe,Physique,17
Clara,Physique,13
Lina,Physique,12


In [57]:
df

Unnamed: 0,Nom,Matieres,Notes
0,Christophe,Maths,17
1,Clara,Maths,13
2,Lina,Maths,12
3,Amine,Maths,15
4,Amandine,Maths,9
5,Christophe,Physique,10
6,Clara,Physique,5
7,Lina,Physique,8
8,Amine,Physique,13
9,Amandine,Physique,16


In [58]:
df.groupby('Nom').describe()

Unnamed: 0_level_0,Notes,Notes,Notes,Notes,Notes,Notes,Notes,Notes
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Nom,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Amandine,2.0,12.5,4.949747,9.0,10.75,12.5,14.25,16.0
Amine,2.0,14.0,1.414214,13.0,13.5,14.0,14.5,15.0
Christophe,2.0,13.5,4.949747,10.0,11.75,13.5,15.25,17.0
Clara,2.0,9.0,5.656854,5.0,7.0,9.0,11.0,13.0
Lina,2.0,10.0,2.828427,8.0,9.0,10.0,11.0,12.0


In [59]:
df.groupby('Nom').describe().loc['Lina']

Notes  count     2.000000
       mean     10.000000
       std       2.828427
       min       8.000000
       25%       9.000000
       50%      10.000000
       75%      11.000000
       max      12.000000
Name: Lina, dtype: float64