In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small toy dataset (five rows): class id, raw score, 0/1 result flag
# and a qualitative performance label.
data = {
    'class': [1, 1, 1, 2, 2],
    'score': [10, 21, 35, 11, 26],
    'result': [0, 1, 0, 1, 0],
    'performance': ['strong', 'weak', 'normal', 'weak', 'strong'],
}
df = pd.DataFrame(data)

In [3]:
# Display the toy DataFrame built in the previous cell.
df

Unnamed: 0,class,score,result,performance
0,1,10,0,strong
1,1,21,1,weak
2,1,35,0,normal
3,2,11,1,weak
4,2,26,0,strong


In [4]:
# Preview the 0/1 result codes translated to labels (the frame itself is not modified).
df['result'].map({0: 'fail', 1: 'pass'})

0    fail
1    pass
2    fail
3    pass
4    fail
Name: result, dtype: object

In [5]:
# Store the decoded result labels in a new column, then display the updated frame.
result_labels = {0: 'fail', 1: 'pass'}
df['pass or fail'] = df['result'].map(result_labels)
df

Unnamed: 0,class,score,result,performance,pass or fail
0,1,10,0,strong,fail
1,1,21,1,weak,pass
2,1,35,0,normal,fail
3,2,11,1,weak,pass
4,2,26,0,strong,fail


#### Applying functions

In [6]:
def stand(x, mean=None, std=None):
    """Standardize ``x`` as a z-score: ``(x - mean) / std``.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to standardize. A pandas Series or NumPy array is
        standardized element-wise in a single vectorized call.
    mean : float, optional
        Centre to subtract. Defaults to the mean of the global ``df.score``.
    std : float, optional
        Scale to divide by. Defaults to the standard deviation of the global
        ``df.score`` (pandas' default, i.e. the sample standard deviation).

    Returns
    -------
    Same kind as ``x``
        The standardized value(s).
    """
    # Fall back to the statistics of the notebook's `score` column only when the
    # caller did not supply them; passing them in avoids recomputing both for
    # every element when the function is used with Series.apply.
    if mean is None:
        mean = df.score.mean()
    if std is None:
        std = df.score.std()
    return (x - mean) / std

In [7]:
df['score standard'] = df['score'].apply(stand)  # apply the function defined above to each score

In [8]:
# Display the frame, now including the 'score standard' column.
df

Unnamed: 0,class,score,result,performance,pass or fail,score standard
0,1,10,0,strong,fail,-1.009295
1,1,21,1,weak,pass,0.038087
2,1,35,0,normal,fail,1.371118
3,2,11,1,weak,pass,-0.914078
4,2,26,0,strong,fail,0.514169


Let's now look at using the apply() function on an entire DataFrame. To do this, we will consider the DataFrame consisting of the numerical columns. We can select these by column label using the `.loc` indexer, as follows.

Alternatively, use `df.select_dtypes(include='number')` or `df.select_dtypes(include=np.number)`.

In [9]:
# Keep every row, but only the numeric columns, selected explicitly by label.
numeric_cols = ['class', 'result', 'score', 'score standard']
df_num = df.loc[:, numeric_cols]
df_num

Unnamed: 0,class,result,score,score standard
0,1,0,10,-1.009295
1,1,1,21,0.038087
2,1,0,35,1.371118
3,2,1,11,-0.914078
4,2,0,26,0.514169


For this example, let's use the max() function from Python to determine the maximum value in each column:

In [10]:
# axis=0 passes each column to max, giving one maximum per column.
df_num.apply(max, axis=0)

class              2.000000
result             1.000000
score             35.000000
score standard     1.371118
dtype: float64

In [11]:
# axis=1 passes each row to max, giving one maximum per row.
df_num.apply(max, axis=1)

0    10.0
1    21.0
2    35.0
3    11.0
4    26.0
dtype: float64

Finally, let's consider the **applymap()** function. This function is for DataFrames, and **it is used for applying a function to every element** of a DataFrame. This is in contrast to apply(), which operates on entire rows or columns.

In [12]:
# Convert every element to a string and prepend the two characters backslash + dollar.
# The raw string produces exactly what '\$' did, without relying on an unrecognized
# escape sequence (which triggers a SyntaxWarning on recent Python versions).
# Note: pandas >= 2.1 deprecates DataFrame.applymap in favour of DataFrame.map.
df_num.applymap(lambda x: r'\$' + str(x))

Unnamed: 0,class,result,score,score standard
0,\$1,\$0,\$10,\$-1.0092949703936966
1,\$1,\$1,\$21,\$0.03808660265636577
2,\$1,\$0,\$35,\$1.3711176956291724
3,\$2,\$1,\$11,\$-0.9140784637527819
4,\$2,\$0,\$26,\$0.5141691358609396


In [13]:
import pandas as pd
import numpy as np

# Load the cereal dataset from a CSV file located relative to the working directory.
# Note: this rebinds `df`, replacing the toy DataFrame used in the cells above.
df = pd.read_csv('cereal.csv')

In [14]:
# Preview the first five rows of the cereal data.
df.head(5)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


Create a new column called `Manufacturer` which takes the entries in the column `mfr` and maps each code to the manufacturer's full name, using the mapping shown in the cell below.

In [15]:
# One-letter manufacturer codes from the `mfr` column and their full names.
mfr_full_names = {
    'A': 'American Home Food Products',
    'G': 'General Mills',
    'K': 'Kelloggs',
    'N': 'Nabisco',
    'P': 'Post',
    'Q': 'Quaker Oats',
    'R': 'Ralston Purina',
}
df['Manufacturer'] = df['mfr'].map(mfr_full_names)
df.head(5)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,Manufacturer
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,Nabisco
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679,Quaker Oats
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505,Kelloggs
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912,Kelloggs
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843,Ralston Purina


Calories can be converted to kilojoules using the following formula: 1 calorie = 4.184 kilojoules. Find the amount of kilojoules per serving for each cereal, and store the results in a new column.

In [16]:
# Alternative to the lambda used in the next cell: a named conversion function
# (kept commented out; it is not defined or called anywhere in this notebook).
#def cal_to_kj(calories) :
#    kj = calories * 4.184
#    return kj

In [17]:
# 1 calorie = 4.184 kilojoules (conversion factor given in the exercise statement).
df['kilojoules'] = df['calories'].apply(lambda cal: cal * 4.184)
df.head(5)

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,Manufacturer,kilojoules
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,Nabisco,292.88
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679,Quaker Oats,502.08
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505,Kelloggs,292.88
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912,Kelloggs,209.2
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843,Ralston Purina,460.24


In [21]:
# Use the cereal name as the row index so that per-row results are labelled by name.
# The guard keeps the cell safe to re-run: once 'name' has become the index it is no
# longer a column, and calling set_index('name') again would raise a KeyError.
if 'name' in df.columns:
    df = df.set_index('name')

# Keep only the numeric columns (public alternative to the private _get_numeric_data()).
df_num = df.select_dtypes(include='number')
df_num.head(5)

Unnamed: 0_level_0,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating,kilojoules
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
100% Bran,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973,292.88
100% Natural Bran,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679,502.08
All-Bran,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505,292.88
All-Bran with Extra Fiber,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912,209.2
Almond Delight,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843,460.24


In [24]:
# axis=0 passes each numeric column to max, giving the largest value per column.
df_num.apply(max, axis=0)

calories      160.000000
protein         6.000000
fat             5.000000
sodium        320.000000
fiber          14.000000
carbo          23.000000
sugars         15.000000
potass        330.000000
vitamins      100.000000
shelf           3.000000
weight          1.500000
cups            1.500000
rating         93.704912
kilojoules    669.440000
dtype: float64

In [27]:
# For each column, return the index label (the cereal name) of the row holding the
# maximum value, rather than the value itself. idxmax is used instead of np.argmax
# because, on pandas >= 1.0, argmax returns the integer position and not the label.
df_num.idxmax()

calories             Mueslix Crispy Blend
protein                          Cheerios
fat                     100% Natural Bran
sodium                         Product 19
fiber           All-Bran with Extra Fiber
carbo                           Rice Chex
sugars                       Golden Crisp
potass          All-Bran with Extra Fiber
vitamins      Just Right Crunchy  Nuggets
shelf                           100% Bran
weight               Mueslix Crispy Blend
cups                                  Kix
rating          All-Bran with Extra Fiber
kilojoules           Mueslix Crispy Blend
dtype: object

In other words, while there are no true private methods in Python, a method whose name begins with an underscore should nevertheless be considered private. Concretely, what this means is that you can use the "private" methods if you want to - the language itself won't stop you from doing that - but don't expect your code to work when you upgrade a specific library where you used a private method, as they may change without notice.

Thus, for quick tests and scripts it's fine to use private methods, but in general, it should be avoided. There is usually another, non-private way to solve the problem at hand. For example, a line such as

`df_num = df._get_numeric_data()`

can be replaced by the non-private method `select_dtypes()`, as done in the solution above:

`df_num = df.select_dtypes([np.number])`