# **Basic aggregation methods:**

count()
mean()
median()
min()
max()
std()
var()
sum()
idxmin()
idxmax()
corr()

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Seed NumPy's legacy global RNG so the random integers below are reproducible.
np.random.seed(42)
random_values = np.random.randint(0, 100, size=(7, 5))
column_names = [f"x{i}" for i in range(1, 6)]  # "x1" .. "x5"
df = pd.DataFrame(random_values, columns=column_names)
df

Unnamed: 0,x1,x2,x3,x4,x5
0,51,92,14,71,60
1,20,82,86,74,74
2,87,99,23,2,21
3,52,1,87,29,37
4,1,63,59,20,32
5,75,57,21,88,48
6,90,58,41,91,59


In [None]:
df.count()

x1    7
x2    7
x3    7
x4    7
x5    7
dtype: int64

In [None]:
df.x1.count()

7

In [None]:
df.mean()

x1    53.714286
x2    64.571429
x3    47.285714
x4    53.571429
x5    47.285714
dtype: float64

In [None]:
df.median()

x1    52.0
x2    63.0
x3    41.0
x4    71.0
x5    48.0
dtype: float64

In [None]:
df.min()

x1     1
x2     1
x3    14
x4     2
x5    21
dtype: int64

In [None]:
df.x4.min()

2

In [None]:
df.idxmin()   #Return index of first occurrence of minimum over requested axis

x1    4
x2    3
x3    0
x4    2
x5    2
dtype: int64

In [None]:
# argmin() exists on Series but not on DataFrame, so this call raises AttributeError.
# The error is caught and printed so that "Restart & Run All" does not stop here.
try:
    df.argmin()
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: ignored

In [None]:
df.x5.idxmin()

2

In [None]:
df.x5.argmin()  #works on series

2

In [None]:
df.std()

x1    33.673502
x2    32.623392
x3    30.663302
x4    35.818325
x5    18.454577
dtype: float64

In [None]:
df[["x1","x2"]].std()

x1    33.673502
x2    32.623392
dtype: float64

In [None]:
df.var()

x1    1133.904762
x2    1064.285714
x3     940.238095
x4    1282.952381
x5     340.571429
dtype: float64

In [None]:
df.sum(axis=0)

x1    376
x2    452
x3    331
x4    375
x5    331
dtype: int64

In [None]:
df.sum(axis=1)

0    288
1    336
2    232
3    206
4    175
5    289
6    339
dtype: int64

In [None]:
df.x1.sum()

376

In [None]:
df.describe()

Unnamed: 0,x1,x2,x3,x4,x5
count,7.0,7.0,7.0,7.0,7.0
mean,53.714286,64.571429,47.285714,53.571429,47.285714
std,33.673502,32.623392,30.663302,35.818325,18.454577
min,1.0,1.0,14.0,2.0,21.0
25%,35.5,57.5,22.0,24.5,34.5
50%,52.0,63.0,41.0,71.0,48.0
75%,81.0,87.0,72.5,81.0,59.5
max,90.0,99.0,87.0,91.0,74.0


In [None]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
x1,7.0,53.714286,33.673502,1.0,35.5,52.0,81.0,90.0
x2,7.0,64.571429,32.623392,1.0,57.5,63.0,87.0,99.0
x3,7.0,47.285714,30.663302,14.0,22.0,41.0,72.5,87.0
x4,7.0,53.571429,35.818325,2.0,24.5,71.0,81.0,91.0
x5,7.0,47.285714,18.454577,21.0,34.5,48.0,59.5,74.0


# **Groupby & Aggregation**

**DataFrame.groupby()**

The groupby method allows you to group rows of data together and call aggregate functions

In [None]:
df=sns.load_dataset("iris")
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [None]:
df.groupby("species")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f3c25cba790>

In [None]:
df.groupby("species").mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [None]:
df.groupby("species").describe().T

Unnamed: 0,species,setosa,versicolor,virginica
sepal_length,count,50.0,50.0,50.0
sepal_length,mean,5.006,5.936,6.588
sepal_length,std,0.35249,0.516171,0.63588
sepal_length,min,4.3,4.9,4.9
sepal_length,25%,4.8,5.6,6.225
sepal_length,50%,5.0,5.9,6.5
sepal_length,75%,5.2,6.3,6.9
sepal_length,max,5.8,7.0,7.9
sepal_width,count,50.0,50.0,50.0
sepal_width,mean,3.428,2.77,2.974


In [None]:
df.groupby("species")["sepal_length"].sum()

species
setosa        250.3
versicolor    296.8
virginica     329.4
Name: sepal_length, dtype: float64

In [None]:
df.groupby("species")[["sepal_length"]].sum()

Unnamed: 0_level_0,sepal_length
species,Unnamed: 1_level_1
setosa,250.3
versicolor,296.8
virginica,329.4


In [None]:
df.groupby('species')[['sepal_length', "sepal_width"]].sum()

Unnamed: 0_level_0,sepal_length,sepal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1
setosa,250.3,171.4
versicolor,296.8,138.5
virginica,329.4,148.7


In [None]:
data = {'Company':['GOOG', 'GOOG', 'MSFT', 'MSFT', 'GOOG', 'MSFT', 'GOOG', 'MSFT'],
        'Department':['HR', 'IT', 'IT', 'HR', 'HR', 'IT', 'IT', 'HR'],
        'Person':['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah', 'Tom', 'Terry'],
        'Age':[30, 28, 35, 40, 42, 25, 32, 48],
        'Sales':[200, 120, 340, 124, 243, 350, 180, 220]}

In [None]:
df1=pd.DataFrame(data)
df1

Unnamed: 0,Company,Department,Person,Age,Sales
0,GOOG,HR,Sam,30,200
1,GOOG,IT,Charlie,28,120
2,MSFT,IT,Amy,35,340
3,MSFT,HR,Vanessa,40,124
4,GOOG,HR,Carl,42,243
5,MSFT,IT,Sarah,25,350
6,GOOG,IT,Tom,32,180
7,MSFT,HR,Terry,48,220


In [None]:
# numeric_only=True: df1 also holds the string columns Department and Person.
# pandas >= 2.0 no longer drops such columns silently and raises TypeError instead.
df1.groupby("Company").mean(numeric_only=True)

Unnamed: 0_level_0,Age,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
GOOG,33.0,185.75
MSFT,37.0,258.5


In [None]:
df1.groupby("Company")[["Sales"]].mean()

Unnamed: 0_level_0,Sales
Company,Unnamed: 1_level_1
GOOG,185.75
MSFT,258.5


In [None]:
# numeric_only=True: the remaining string column Person cannot be averaged;
# pandas >= 2.0 raises TypeError for it instead of dropping it silently.
df1.groupby(["Company", "Department"]).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Sales
Company,Department,Unnamed: 2_level_1,Unnamed: 3_level_1
GOOG,HR,36.0,221.5
GOOG,IT,30.0,150.0
MSFT,HR,44.0,172.0
MSFT,IT,30.0,345.0


In [None]:
df1.groupby(["Company","Department"])[["Sales"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales
Company,Department,Unnamed: 2_level_1
GOOG,HR,221.5
GOOG,IT,150.0
MSFT,HR,172.0
MSFT,IT,345.0


In [None]:
df1.groupby(['Company', "Department"])[["Sales"]].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
Company,Department,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
GOOG,HR,2.0,221.5,30.405592,200.0,210.75,221.5,232.25,243.0
GOOG,IT,2.0,150.0,42.426407,120.0,135.0,150.0,165.0,180.0
MSFT,HR,2.0,172.0,67.882251,124.0,148.0,172.0,196.0,220.0
MSFT,IT,2.0,345.0,7.071068,340.0,342.5,345.0,347.5,350.0


In [None]:
df1.groupby('Company')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f3c257e94d0>

In [None]:
by_comp=df1.groupby('Company')  #assign the object a value and work easily

In [None]:
# by_comp groups df1, which contains string columns (Department, Person);
# restrict the mean to numeric columns so this also runs on pandas >= 2.0.
by_comp.mean(numeric_only=True)

Unnamed: 0_level_0,Age,Sales
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
GOOG,33.0,185.75
MSFT,37.0,258.5


# **DataFrame/Series Operations**

.aggregate()/agg()
.filter()
.transform()
.apply()
.applymap()
.map()
.pivot() & .pivot_table()
.stack() & .unstack()

**.aggregate()/agg()**

In [None]:
# Toy frame: three groups (A, B, C), each repeated three times, with two numeric columns.
group_labels = ['A', 'B', 'C'] * 3
var1_values = [10, 23, 33, 22, 11, 99, 76, 84, 45]
var2_values = [100, 253, 333, 262, 111, 969, 405, 578, 760]
df2 = pd.DataFrame({'groups': group_labels, 'var1': var1_values, 'var2': var2_values})
df2

Unnamed: 0,groups,var1,var2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969
6,A,76,405
7,B,84,578
8,C,45,760


In [None]:
# Aggregations are named as strings: passing the builtins sum/min is deprecated in pandas,
# and future versions call the builtin directly instead of mapping it to DataFrame.sum/min.
df2.agg(["sum", "min"])

Unnamed: 0,groups,var1,var2
sum,ABCABCABC,403,3771
min,A,10,100


In [None]:
# Same aggregation on the numeric columns only; string names avoid the deprecated builtins.
df2[["var1", "var2"]].agg(["sum", "min"])

Unnamed: 0,var1,var2
sum,403,3771
min,10,100


In [None]:
# A dict maps each column to its own list of aggregations (given by name, not as builtins).
df2.agg({"var1": ["sum"], "var2": ["min"]})

Unnamed: 0,var1,var2
sum,403.0,
min,,100.0


In [None]:
# Different aggregations per column; string names replace the deprecated sum/np.mean/min/max callables.
df2.agg({"var1": ["sum", "mean"], "var2": ["min", "max"]})

Unnamed: 0,var1,var2
sum,403.0,
mean,44.777778,
min,,100.0
max,,969.0


 **.groupby.agg**

In [None]:
# Several aggregations per group; string names replace the deprecated np.mean/np.median/max callables.
df2.groupby("groups").agg(["mean", "median", "max"])

Unnamed: 0_level_0,var1,var1,var1,var2,var2,var2
Unnamed: 0_level_1,mean,median,max,mean,median,max
groups,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,36.0,22.0,76,255.666667,262.0,405
B,39.333333,23.0,84,314.0,253.0,578
C,59.0,45.0,99,687.333333,760.0,969


In [None]:
# Min and max of var1 per group, named as strings instead of the deprecated builtins.
df2.groupby("groups")["var1"].agg(["min", "max"])

Unnamed: 0_level_0,min,max
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
A,10,76
B,11,84
C,33,99


**.filter()**
DataFrame.filter()

DataFrame.filter(items=None, like=None, regex=None, axis=None)
Subset the dataframe rows or columns according to the specified index labels.
Note that this routine does not filter a dataframe on its contents. The filter is applied to the labels of the index.

In [None]:
df2

Unnamed: 0,groups,var1,var2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969
6,A,76,405
7,B,84,578
8,C,45,760


In [None]:
df2.filter(["groups","var1"])
#df2[["groups","var1"]]

Unnamed: 0,groups,var1
0,A,10
1,B,23
2,C,33
3,A,22
4,B,11
5,C,99
6,A,76
7,B,84
8,C,45


**regex()**

A regex pulls the wanted parts out of a string by matching a pattern.

Short for regular expression, a regex is a string of text that allows you to create patterns that help match, locate, and manage text.

In [None]:
df2.filter(regex="^var", axis=1)      # ^ means start with .. 

Unnamed: 0,var1,var2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969
6,76,405
7,84,578
8,45,760


In [None]:
df2.filter(regex="5", axis=0)

Unnamed: 0,groups,var1,var2
5,C,99,969


In [None]:
df2.filter(like="5", axis=0)

Unnamed: 0,groups,var1,var2
5,C,99,969


# **DataFrame.groupby().filter()**

DataFrameGroupBy.filter(func, dropna=True, *args, **kwargs)

Return a copy of a DataFrame excluding filtered elements.

Elements from groups are filtered if they do not satisfy the boolean criterion specified by func.

For comparison, Python's built-in filter() is a different function that works on any iterable:

filter(function or None, iterable) --> filter object

Return an iterator yielding those items of iterable for which function(item)
is true. If function is None, return the items that are true.

In [None]:
df2.groups.unique()

array(['A', 'B', 'C'], dtype=object)

In [None]:
df2.groupby("groups").mean()

Unnamed: 0_level_0,var1,var2
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
A,36.0,255.666667
B,39.333333,314.0
C,59.0,687.333333


In [None]:
def filter_func(x):
    """Return True when the mean of the group's ``var1`` column is above 39.

    ``x`` is one group's sub-DataFrame, as passed in by ``groupby(...).filter``.
    """
    var1_mean = x["var1"].mean()
    return var1_mean > 39

In [None]:
df2.groupby("groups").filter(filter_func)   #filter wants function 

Unnamed: 0,groups,var1,var2
1,B,23,253
2,C,33,333
4,B,11,111
5,C,99,969
7,B,84,578
8,C,45,760


In [None]:
df2.groupby("groups").filter(lambda x :x["var2"].sum()<800)

Unnamed: 0,groups,var1,var2
0,A,10,100
3,A,22,262
6,A,76,405


**.transform()**

**DataFrame.transform()**

**DataFrame.transform(func, axis=0, \*args, \*\*kwargs)**

DataFrame.transform() applies the function given as its parameter and returns a new DataFrame containing the transformed values.
The returned DataFrame has the same length as the DataFrame it was called on.

In [None]:
df2

Unnamed: 0,groups,var1,var2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969
6,A,76,405
7,B,84,578
8,C,45,760


In [None]:
df_num = df2.iloc[:,1:]

In [None]:
df_num

Unnamed: 0,var1,var2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969
6,76,405
7,84,578
8,45,760


In [None]:
df_num.transform(lambda x : x+10)
#df_num + 10



Unnamed: 0,var1,var2
0,20,110
1,33,263
2,43,343
3,32,272
4,21,121
5,109,979
6,86,415
7,94,588
8,55,770


In [None]:
# transform() must return a result with the same length as its input.
# This lambda standardizes each column: subtract the column mean, then divide by the column std.
# (Min-max normalization to the 0-1 range would need a different formula.)
df_num.transform(lambda x : (x-x.mean())/x.std())
# Equivalent vectorized form: (df_num-df_num.mean())/df_num.std()

Unnamed: 0,var1,var2
0,-1.040605,-1.078044
1,-0.651625,-0.560989
2,-0.352409,-0.290633
3,-0.681547,-0.530573
4,-1.010684,-1.04087
5,1.622413,1.858697
6,0.934218,-0.047312
7,1.17359,0.537332
8,0.006649,1.152392


In [None]:
df_num.var1.transform([np.sqrt, np.exp])   # for only var1 series


Unnamed: 0,sqrt,exp
0,3.162278,22026.47
1,4.795832,9744803000.0
2,5.744563,214643600000000.0
3,4.690416,3584913000.0
4,3.316625,59874.14
5,9.949874,9.889030000000001e+42
6,8.717798,1.0148e+33
7,9.165151,3.025077e+36
8,6.708204,3.493427e+19


In [None]:
df2.groupby("groups")["var1"].mean()

groups
A    36.000000
B    39.333333
C    59.000000
Name: var1, dtype: float64

In [None]:
df2

Unnamed: 0,groups,var1,var2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969
6,A,76,405
7,B,84,578
8,C,45,760


In [None]:
df2.groupby("groups")["var1"].transform("mean")   # an aggregation name also works in transform(); each row gets its group's mean

0    36.000000
1    39.333333
2    59.000000
3    36.000000
4    39.333333
5    59.000000
6    36.000000
7    39.333333
8    59.000000
Name: var1, dtype: float64

In [None]:
# assign() returns a new frame with the extra column and leaves df2 itself unchanged,
# so later cells that display df2 with only groups/var1/var2 stay reproducible on Run All.
df2.assign(var1_mean_transform=df2.groupby("groups")["var1"].transform("mean"))

Unnamed: 0,groups,var1,var2,var1_mean_transform
0,A,10,100,36.0
1,B,23,253,39.333333
2,C,33,333,59.0
3,A,22,262,36.0
4,B,11,111,39.333333
5,C,99,969,59.0
6,A,76,405,36.0
7,B,84,578,39.333333
8,C,45,760,59.0


In [None]:
# Both per-group helper columns are built on a new frame via assign(); df2 is not modified,
# which keeps the later df2 cells (groups/var1/var2 only) reproducible on Run All.
df2.assign(
    var1_mean_transform=df2.groupby("groups")["var1"].transform("mean"),
    var2_median_transform=df2.groupby("groups")["var2"].transform("median"),
)

Unnamed: 0,groups,var1,var2,var1_mean_transform,var2_median_transform
0,A,10,100,36.0,262.0
1,B,23,253,39.333333,253.0
2,C,33,333,59.0,760.0
3,A,22,262,36.0,262.0
4,B,11,111,39.333333,253.0
5,C,99,969,59.0,760.0
6,A,76,405,36.0,262.0
7,B,84,578,39.333333,253.0
8,C,45,760,59.0,760.0


**.apply()**

**Series.apply() - df["col"].apply()**

Series.apply(func, convert_dtype=True, args=(), **kwargs)

Invoke function on values of Series.

Can be ufunc (a NumPy function that applies to the entire Series) or a Python function that only works on single values.

In [None]:
df3 = pd.DataFrame({'col1':[1,2,3,4],'col2':[444,555,666,444],'col3':['abc','def','ghi','xyz']})

df3

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [None]:
def squared(x):
    """Return ``x`` raised to the power of two."""
    return pow(x, 2)

In [None]:
df3["col1"].apply(squared)     #df3["col1"]**2

0     1
1     4
2     9
3    16
Name: col1, dtype: int64

In [None]:
df3["col2"].apply(np.log)

0    6.095825
1    6.318968
2    6.501290
3    6.095825
Name: col2, dtype: float64

In [None]:
df3["col3"].apply(len)

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

In [None]:
df3["col3"].apply(lambda x : x[0]*3)

0    aaa
1    ddd
2    ggg
3    xxx
Name: col3, dtype: object

In [None]:
df3["col2"].apply(lambda x : "high" if x > 500 else "low")

0     low
1    high
2    high
3     low
Name: col2, dtype: object

**DataFrame.apply()**

DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), **kwargs)
Apply a function along an axis of the DataFrame.

In [None]:
df2

Unnamed: 0,groups,var1,var2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969
6,A,76,405
7,B,84,578
8,C,45,760


In [None]:
df2.apply(np.sum)

groups    ABCABCABC
var1            403
var2           3771
dtype: object

In [None]:
# A row-wise sum fails because every row mixes the string column "groups" with numbers.
# The error is caught and printed so that "Restart & Run All" does not stop at this cell.
try:
    df2.apply(np.sum, axis=1)
except TypeError as err:
    print(f"TypeError: {err}")

In [None]:
df_num

Unnamed: 0,var1,var2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969
6,76,405
7,84,578
8,45,760


In [None]:
df_num.apply(np.sum)
#df_num.sum()

var1     403
var2    3771
dtype: int64

In [None]:
df_num.apply(np.sum, axis=1)  # sums each row
#df_num.sum(axis=1)

0     110
1     276
2     366
3     284
4     122
5    1068
6     481
7     662
8     805
dtype: int64

In [None]:
df_num.apply(lambda x : x + 10)

Unnamed: 0,var1,var2
0,20,110
1,33,263
2,43,343
3,32,272
4,21,121
5,109,979
6,86,415
7,94,588
8,55,770


In [None]:
# Per-group column means through apply(); same result as df2.groupby("groups")[["var1", "var2"]].mean().
# The numeric columns are selected explicitly and DataFrame.mean() is called per group:
# np.mean would forward axis=None to pandas (the source of the warning in the saved output),
# which in newer pandas reduces over both axes instead of giving one mean per column.
df2.groupby("groups")[["var1", "var2"]].apply(lambda g: g.mean())

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


Unnamed: 0_level_0,var1,var2
groups,Unnamed: 1_level_1,Unnamed: 2_level_1
A,36.0,255.666667
B,39.333333,314.0
C,59.0,687.333333


**DataFrame.applymap()**

DataFrame.applymap(func, na_action=None, **kwargs)

Apply a function to a Dataframe elementwise.

This method applies a function that accepts and returns a scalar to every element of a DataFrame.

applymap() is only available on DataFrame and is used for element-wise operations across the whole DataFrame. It has been optimized and in some cases works much faster than apply().

The applymap() method works only on a pandas DataFrame, where the function is applied to every element individually. The apply() method can be used on both Series and DataFrames, and the function may operate on a whole Series or on individual elements, depending on the type of function provided.

In [None]:
df_num

Unnamed: 0,var1,var2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969
6,76,405
7,84,578
8,45,760


In [None]:
df_num.applymap(lambda x:x*5)
#df_num.apply(lambda x: x*5)
#df_num*5

Unnamed: 0,var1,var2
0,50,500
1,115,1265
2,165,1665
3,110,1310
4,55,555
5,495,4845
6,380,2025
7,420,2890
8,225,3800


In [None]:
df_num.applymap(lambda x: len(str(x*5)))

Unnamed: 0,var1,var2
0,2,3
1,3,4
2,3,4
3,3,4
4,2,3
5,3,4
6,3,4
7,3,4
8,3,4


In [None]:
df_num.apply(lambda x: len(str(x*5)))  # apply() hands the lambda a whole column, so this is the length of the column's printed text, not a per-element result

var1    105
var2    114
dtype: int64

**.map()**

**Series.map() - df["col"].map()**

Python's map() is a built-in function that allows you to process and transform all the items in an iterable without using an explicit for loop, a technique commonly known as mapping.

map() is useful when you need to apply a transformation function to each item in an iterable and transform them into a new iterable.
The map() function is used to map values of Series according to input correspondence. Used for substituting each value in a Series with another value, that may be derived from a function, a dict or a Series.

map() accepts a dict or a Series. Values that are not found in the dict are converted to NaN, unless the dict has a default value.

In [None]:
df3

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [None]:
df3.col1.map({1:"A",2:"B"})

0      A
1      B
2    NaN
3    NaN
Name: col1, dtype: object

In [None]:
s = pd.Series(['fox', 'cow', np.nan, 'dog'])
s

0    fox
1    cow
2    NaN
3    dog
dtype: object

In [None]:
s.map("I m a {}".format)

0    I m a fox
1    I m a cow
2    I m a nan
3    I m a dog
dtype: object

In [None]:
s.map('I am a {}'.format, na_action='ignore')

0    I am a fox
1    I am a cow
2           NaN
3    I am a dog
dtype: object

 
*   apply() is used to apply a function along an axis of the DataFrame or on values of Series.
*   applymap() is used to apply a function to a DataFrame elementwise.
*   map() is used to substitute each value in a Series with another value.








**df.transform() vs df.apply()**

**Similarities**

Both apply() and transform() can be used to manipulate the entire DataFrame.

Both apply() and transform() support lambda expression.

Both apply() and transform() can be used for manipulating a single column.

In [None]:
df5 = pd.DataFrame({'A': [1,2,3], 'B': [10,20,30] })
df5

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


In [None]:
df5['B_ap'] = df5['B'].apply(lambda x: x+10)
df5['B_tr'] = df5['B'].transform(lambda x: x+10)
df5

Unnamed: 0,A,B,B_ap,B_tr
0,1,10,20,20
1,2,20,30,30
2,3,30,40,40


**Differences between .apply() and .transform() when manipulating data**

transform() works with function, a string function, a list of functions, and a dict. However, apply() is only allowed with function.

transform() cannot produce aggregated results.

apply() works with multiple Series at a time. But, transform() is only allowed to work with a single Series at a time.

In [None]:
df5 = df5[["A","B"]]
df5

Unnamed: 0,A,B
0,1,10
1,2,20
2,3,30


In [None]:
df5.transform("sqrt") 
#df5.apply('sqrt') # gives an error

Unnamed: 0,A,B
0,1.0,3.162278
1,1.414214,4.472136
2,1.732051,5.477226


In [None]:
df5.transform(np.sqrt)
#df5.apply(np.sqrt)

Unnamed: 0,A,B
0,1.0,3.162278
1,1.414214,4.472136
2,1.732051,5.477226


In [None]:
df5.transform(["sqrt","exp"])
#df5.apply([np.sqrt, np.exp])

Unnamed: 0_level_0,A,A,B,B
Unnamed: 0_level_1,sqrt,exp,sqrt,exp
0,1.0,2.718282,3.162278,22026.47
1,1.414214,7.389056,4.472136,485165200.0
2,1.732051,20.085537,5.477226,10686470000000.0


In [None]:
df5.transform({'A': np.sqrt, 'B': np.exp})
#df5.apply({'A': np.sqrt, 'B': np.exp})

Unnamed: 0,A,B
0,1.0,22026.47
1,1.414214,485165200.0
2,1.732051,10686470000000.0


In [None]:
df5.apply(lambda x: x.sum())
#df5.transform(lambda x:x.sum()) # gives an error

A     6
B    60
dtype: int64

In [None]:
df5.apply(lambda x: x["B"]-x["A"], axis=1)
# df5.transform(lambda x: x["B"]-x["A"], axis=1) # gives an error

0     9
1    18
2    27
dtype: int64

**Differences Between .apply() and .transform() when using them in conjunction with groupby()**

transform() returns a DataFrame that has the same length as the input

apply() works with multiple Series at a time. But, transform() is only allowed to work with a single Series at a time.

In [None]:
df6 = pd.DataFrame({'key': ['a','b','c'] * 3,
                    'A': np.arange(9),
                    'B': [1,2,3] * 3})
df6

Unnamed: 0,key,A,B
0,a,0,1
1,b,1,2
2,c,2,3
3,a,3,1
4,b,4,2
5,c,5,3
6,a,6,1
7,b,7,2
8,c,8,3


In [None]:
df6.groupby("key")["A"].sum()

key
a     9
b    12
c    15
Name: A, dtype: int64

In [None]:
df6.groupby("key")["A"].apply(lambda x : x.sum())

key
a     9
b    12
c    15
Name: A, dtype: int64

In [None]:
df6.groupby("key")["A"].transform(lambda x : x.sum())

0     9
1    12
2    15
3     9
4    12
5    15
6     9
7    12
8    15
Name: A, dtype: int64

In [None]:
df6.groupby("key").apply(lambda x : x["B"]-x["A"])
# df6.groupby('key').transform(lambda x: x["B"]-x["A"]) # gives an error because multiple series name in lambda

key   
a    0    1
     3   -2
     6   -5
b    1    1
     4   -2
     7   -5
c    2    1
     5   -2
     8   -5
dtype: int64

# **pivot() vs pivot_table()**

**pivot_table**

pandas.pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All', observed=False, sort=True).

Create a spreadsheet-style pivot table as a DataFrame.

The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame.

**pivot**

DataFrame.pivot(index=None, columns=None, values=None).

Return reshaped DataFrame organized by given index / column values.

Reshape data (produce a “pivot” table) based on column values. Uses unique values from specified index / columns to form axes of the resulting DataFrame.

This function does not support data aggregation, multiple values will result in a MultiIndex in the columns.

**Differences**

Pivot_table is a generalization of pivot that can handle duplicate values for one pivoted index/column pair.

Pivot_table will only allow numeric types as "values=", whereas pivot will take string types as "values=".

In [None]:
data = {'gender':['male', 'female', 'female', 'male', 'female', 'male'],
        'sport':['tennis', 'tennis', 'basketball', 'football', 'voleyball', 'basketball'],
        'status':["professional","professional","professional","amateur","amateur","amateur"],
        'age':[20, 24, 26, 23, 22, 21],
        'height':[185, 172, 175, 178, 182, 196],
        'weight':[83, 58, 62, 80, 65, 90]}

df7 = pd.DataFrame(data)

df7

Unnamed: 0,gender,sport,status,age,height,weight
0,male,tennis,professional,20,185,83
1,female,tennis,professional,24,172,58
2,female,basketball,professional,26,175,62
3,male,football,amateur,23,178,80
4,female,voleyball,amateur,22,182,65
5,male,basketball,amateur,21,196,90


In [None]:
df7.pivot_table(index='gender',
                columns='sport',
                values=['age'],
                aggfunc='mean')

Unnamed: 0_level_0,age,age,age,age
sport,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
female,26.0,,24.0,22.0
male,21.0,23.0,20.0,


In [None]:
df7.pivot(index='gender',
          columns='sport',
          values='status')

sport,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,professional,,professional,amateur
male,amateur,amateur,professional,


In [None]:
df7.pivot_table(index='gender',   # pivot_table cannot aggregate the string column 'status' with the default aggfunc
          columns='sport',
          values='status')

  return self._try_aggregate_string_function(obj, f, *self.args, **self.kwargs)


sport
gender
female
male


In [None]:
df7

Unnamed: 0,gender,sport,status,age,height,weight
0,male,tennis,professional,20,185,83
1,female,tennis,professional,24,172,58
2,female,basketball,professional,26,175,62
3,male,football,amateur,23,178,80
4,female,voleyball,amateur,22,182,65
5,male,basketball,amateur,21,196,90


In [None]:
df7.loc[2,"sport"] = "tennis"
df7

Unnamed: 0,gender,sport,status,age,height,weight
0,male,tennis,professional,20,185,83
1,female,tennis,professional,24,172,58
2,female,tennis,professional,26,175,62
3,male,football,amateur,23,178,80
4,female,voleyball,amateur,22,182,65
5,male,basketball,amateur,21,196,90


In [None]:
df7.pivot_table(index='gender',
                columns='sport',
                values=['age','height','weight'],
                aggfunc='mean')

Unnamed: 0_level_0,age,age,age,age,height,height,height,height,weight,weight,weight,weight
sport,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
female,,,25.0,22.0,,,173.5,182.0,,,60.0,65.0
male,21.0,23.0,20.0,,196.0,178.0,185.0,,90.0,80.0,83.0,


In [None]:
# pivot() cannot aggregate: the pair (female, tennis) now occurs twice in df7, so the call
# raises "Index contains duplicate entries, cannot reshape". The error is caught and printed
# so that "Restart & Run All" does not stop at this cell.
try:
    df7.pivot(index='gender',
              columns='sport',
              values=['age','height','weight'])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: ignored

# **.stack() & .unstack()**

**stack()**

DataFrame.stack(level=-1, dropna=True)

Stack the prescribed level(s) from columns to index.

Return a reshaped DataFrame or Series having a multi-level index with one or more new inner-most levels compared to the current DataFrame.

The new inner-most levels are created by pivoting the columns of the current dataframe:

      *  if the columns have a single level, the output is a Series;
      *  if the columns have multiple levels, the new index level(s) is (are) 
         taken from the prescribed level(s) and the output is a DataFrame.

In [161]:
df7

Unnamed: 0,gender,sport,status,age,height,weight
0,male,tennis,professional,20,185,83
1,female,tennis,professional,24,172,58
2,female,tennis,professional,26,175,62
3,male,football,amateur,23,178,80
4,female,voleyball,amateur,22,182,65
5,male,basketball,amateur,21,196,90


In [162]:
df7["level"] = ["high", "high", "low", "high", "low", "low"]
df7

Unnamed: 0,gender,sport,status,age,height,weight,level
0,male,tennis,professional,20,185,83,high
1,female,tennis,professional,24,172,58,high
2,female,tennis,professional,26,175,62,low
3,male,football,amateur,23,178,80,high
4,female,voleyball,amateur,22,182,65,low
5,male,basketball,amateur,21,196,90,low


In [163]:
df8 = df7.pivot_table(index=['gender','sport'],
                columns=["status","level"], 
                values=['age','height','weight'],
                aggfunc='mean')
df8

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,age,age,height,height,height,height,weight,weight,weight,weight
Unnamed: 0_level_1,status,amateur,amateur,professional,professional,amateur,amateur,professional,professional,amateur,amateur,professional,professional
Unnamed: 0_level_2,level,high,low,high,low,high,low,high,low,high,low,high,low
gender,sport,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
female,tennis,,,24.0,26.0,,,172.0,175.0,,,58.0,62.0
female,voleyball,,22.0,,,,182.0,,,,65.0,,
male,basketball,,21.0,,,,196.0,,,,90.0,,
male,football,23.0,,,,178.0,,,,80.0,,,
male,tennis,,,20.0,,,,185.0,,,,83.0,


In [164]:
df8.stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,age,height,height,weight,weight
Unnamed: 0_level_1,Unnamed: 1_level_1,status,amateur,professional,amateur,professional,amateur,professional
gender,sport,level,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
female,tennis,high,,24.0,,172.0,,58.0
female,tennis,low,,26.0,,175.0,,62.0
female,voleyball,low,22.0,,182.0,,65.0,
male,basketball,low,21.0,,196.0,,90.0,
male,football,high,23.0,,178.0,,80.0,
male,tennis,high,,20.0,,185.0,,83.0


In [168]:
df8.stack(level=1)   #default level = -1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,age,height,height,weight,weight
Unnamed: 0_level_1,Unnamed: 1_level_1,level,high,low,high,low,high,low
gender,sport,status,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
female,tennis,professional,24.0,26.0,172.0,175.0,58.0,62.0
female,voleyball,amateur,,22.0,,182.0,,65.0
male,basketball,amateur,,21.0,,196.0,,90.0
male,football,amateur,23.0,,178.0,,80.0,
male,tennis,professional,20.0,,185.0,,83.0,


In [169]:
df8.stack(level=2)     # try "-1,-2,0,1,2" as level parameter
                       


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,age,age,height,height,weight,weight
Unnamed: 0_level_1,Unnamed: 1_level_1,status,amateur,professional,amateur,professional,amateur,professional
gender,sport,level,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
female,tennis,high,,24.0,,172.0,,58.0
female,tennis,low,,26.0,,175.0,,62.0
female,voleyball,low,22.0,,182.0,,65.0,
male,basketball,low,21.0,,196.0,,90.0,
male,football,high,23.0,,178.0,,80.0,
male,tennis,high,,20.0,,185.0,,83.0


**unstack()**

DataFrame.unstack(level=-1, fill_value=None)

Pivot a level of the (necessarily hierarchical) index labels.

Returns a DataFrame having a new level of column labels whose inner-most level consists of the pivoted index labels.

If the index is not a MultiIndex, the output will be a Series.

In [170]:
df8

Unnamed: 0_level_0,Unnamed: 1_level_0,age,age,age,age,height,height,height,height,weight,weight,weight,weight
Unnamed: 0_level_1,status,amateur,amateur,professional,professional,amateur,amateur,professional,professional,amateur,amateur,professional,professional
Unnamed: 0_level_2,level,high,low,high,low,high,low,high,low,high,low,high,low
gender,sport,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
female,tennis,,,24.0,26.0,,,172.0,175.0,,,58.0,62.0
female,voleyball,,22.0,,,,182.0,,,,65.0,,
male,basketball,,21.0,,,,196.0,,,,90.0,,
male,football,23.0,,,,178.0,,,,80.0,,,
male,tennis,,,20.0,,,,185.0,,,,83.0,


In [171]:
df8.unstack()

Unnamed: 0_level_0,age,age,age,age,age,age,age,age,age,age,...,weight,weight,weight,weight,weight,weight,weight,weight,weight,weight
status,amateur,amateur,amateur,amateur,amateur,amateur,amateur,amateur,professional,professional,...,amateur,amateur,professional,professional,professional,professional,professional,professional,professional,professional
level,high,high,high,high,low,low,low,low,high,high,...,low,low,high,high,high,high,low,low,low,low
sport,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,...,tennis,voleyball,basketball,football,tennis,voleyball,basketball,football,tennis,voleyball
gender,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4
female,,,,,,,,22.0,,,...,,65.0,,,58.0,,,,62.0,
male,,23.0,,,21.0,,,,,,...,,,,,83.0,,,,,
