# İleri Toplulaştırma İşlemleri (Aggregate, filter, transform, apply)

In [2]:
import pandas as pd
import numpy as np
# Small example frame: one grouping key ("gruplar") and two numeric variables.
df = pd.DataFrame(
    {
        "gruplar": ["A", "B", "C", "A", "B", "C"],
        "degisken1": [10, 23, 33, 22, 11, 99],
        "degisken2": [100, 253, 333, 262, 111, 969],
    }
)
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


## Aggregate

In [3]:
# Mean of each numeric column within every "gruplar" group.
df.groupby("gruplar").mean()

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16.0,181.0
B,17.0,182.0
C,66.0,651.0


In [4]:
# groupby on its own yields a DataFrameGroupBy object, not a table;
# an aggregation has to be applied to it to obtain values.
df.groupby("gruplar")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001BED6F39F40>

In [4]:
# Display the source frame again for reference.
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [5]:
# Several statistics per column in one call. Every statistic is given as a string
# alias: passing the builtin max or NumPy callables (np.median, np.sum) to aggregate
# is deprecated in recent pandas, and the aliases produce the same column labels
# (min, median, max, sum).
df.groupby("gruplar").aggregate(["min", "median", "max", "sum"])

Unnamed: 0_level_0,degisken1,degisken1,degisken1,degisken1,degisken2,degisken2,degisken2,degisken2
Unnamed: 0_level_1,min,median,max,sum,min,median,max,sum
gruplar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
A,10,16.0,22,32,100,181.0,262,362
B,11,17.0,23,34,111,182.0,253,364
C,33,66.0,99,132,333,651.0,969,1302


In [6]:
# A dict maps each column to its own statistic:
# the minimum of degisken1 and the maximum of degisken2 per group.
df.groupby("gruplar").agg({"degisken1": "min", "degisken2": "max"})

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,10,262
B,11,253
C,33,969


## Filter

In [7]:
def filter_func(x, column="degisken1", threshold=9):
    """Decide whether a group should be kept by ``groupby(...).filter``.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of a single group.
    column : str, default "degisken1"
        Column whose standard deviation is inspected.
    threshold : float, default 9
        The group is kept only when the standard deviation is strictly
        greater than this value.

    Returns
    -------
    bool-like
        Result of ``x[column].std() > threshold``. A group whose standard
        deviation is NaN (e.g. a single row) compares as False.

    Notes
    -----
    With the defaults this behaves exactly like the original one-argument
    version. Pass overrides by keyword, e.g.
    ``df.groupby("gruplar").filter(filter_func, threshold=20)``.
    """
    return x[column].std() > threshold

In [8]:
# Per-group standard deviations, to see which groups exceed the
# degisken1 threshold (> 9) used by filter_func.
df.groupby("gruplar").std()

Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,8.485281,114.551299
B,8.485281,100.409163
C,46.669048,449.719913


In [9]:
# Keep only the rows of groups for which filter_func returns True.
df.groupby("gruplar").filter(filter_func)

Unnamed: 0,gruplar,degisken1,degisken2
2,C,33,333
5,C,99,969


## Transform

In [10]:
# Display the source frame again before transforming it.
df

Unnamed: 0,gruplar,degisken1,degisken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [15]:
# Keep only the numeric columns. Without this step the transforms below raise an
# error, because the categorical "gruplar" column would still be in the frame.
# The columns are picked by label rather than by position so the selection keeps
# working if the column order of df ever changes.
df_a = df[["degisken1", "degisken2"]]

In [17]:
# Centre every column on its own mean.
df_a.transform(lambda col: col - col.mean())

Unnamed: 0,degisken1,degisken2
0,-23.0,-238.0
1,-10.0,-85.0
2,0.0,-5.0
3,-11.0,-76.0
4,-22.0,-227.0
5,66.0,631.0


In [18]:
# Standardise every column: subtract its mean, then divide by its standard deviation.
df_a.transform(lambda col: (col - col.mean()) / col.std())

Unnamed: 0,degisken1,degisken2
0,-0.687871,-0.738461
1,-0.299074,-0.263736
2,0.0,-0.015514
3,-0.328982,-0.235811
4,-0.657963,-0.704331
5,1.97389,1.957853


## Apply

In [19]:
import pandas as pd
import numpy as np
# Numeric-only version of the example frame (no grouping column).
df = pd.DataFrame(
    {
        "degisken1": [10, 23, 33, 22, 11, 99],
        "degisken2": [100, 253, 333, 262, 111, 969],
    }
)
df

Unnamed: 0,degisken1,degisken2
0,10,100
1,23,253
2,33,333
3,22,262
4,11,111
5,99,969


In [20]:
# apply runs the function once per column, giving one total per column.
df.apply(np.sum)

degisken1     198
degisken2    2028
dtype: int64

In [21]:
# Same pattern with the mean: one average per column.
df.apply(np.mean)

degisken1     33.0
degisken2    338.0
dtype: float64

In [26]:
import pandas as pd
import numpy as np
# Recreate the frame that includes the "gruplar" key for the grouped apply examples.
df = pd.DataFrame(
    {
        "gruplar": ["A", "B", "C", "A", "B", "C"],
        "degisken1": [10, 23, 33, 22, 11, 99],
        "degisken2": [100, 253, 333, 262, 111, 969],
    }
)

In [27]:
# Group-wise mean through apply. The numeric columns are selected explicitly and the
# reduction is written as g.mean() (column-wise) instead of np.mean: np.mean hands
# axis=None on to DataFrame.mean, which pandas warns about and interprets differently
# across versions, and the string column "gruplar" cannot be averaged.
df.groupby("gruplar")[["degisken1", "degisken2"]].apply(lambda g: g.mean())

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)
  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


Unnamed: 0_level_0,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16.0,181.0
B,17.0,182.0
C,66.0,651.0


In [28]:
# Group-wise sum through apply. The reduction is written as g.sum(axis=0) so the axis
# is stated explicitly instead of being left to whatever np.sum forwards to
# DataFrame.sum. The "gruplar" column is still part of each group here, so its strings
# are concatenated (e.g. "AA"); select [["degisken1", "degisken2"]] after groupby to
# sum only the numeric columns.
df.groupby("gruplar").apply(lambda g: g.sum(axis=0))

Unnamed: 0_level_0,gruplar,degisken1,degisken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,AA,32,362
B,BB,34,364
C,CC,132,1302


## Pivot Tablolar

In [30]:
import seaborn as sns
# Load the "titanic" example dataset through seaborn and preview its first rows.
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [45]:
titanic.groupby("sex")[["survived"]].mean() #selecting with double square brackets gives a neater table

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [43]:
titanic.groupby(["sex","class"])[["survived"]].aggregate("mean").unstack()  #unstack only changes the layout (moves "class" into the columns); it can be left out

Unnamed: 0_level_0,survived,survived,survived
class,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [46]:
# The same table built with pivot_table

In [47]:
# Mean survival rate with "sex" on the rows and "class" on the columns.
titanic.pivot_table(values="survived", index="sex", columns="class")

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [53]:
# Bin the ages into two right-closed intervals, (0, 18] and (18, 90];
# missing ages stay NaN.
age = pd.cut(titanic["age"], bins=[0, 18, 90])
age.head(10)

0    (18.0, 90.0]
1    (18.0, 90.0]
2    (18.0, 90.0]
3    (18.0, 90.0]
4    (18.0, 90.0]
5             NaN
6    (18.0, 90.0]
7     (0.0, 18.0]
8    (18.0, 90.0]
9     (0.0, 18.0]
Name: age, dtype: category
Categories (2, interval[int64, right]): [(0, 18] < (18, 90]]

In [51]:
# Mean survival rate by sex and age bin (rows) against passenger class (columns).
# Arguments are passed by keyword for readability, and observed=False is stated
# explicitly because `age` is categorical: this pins the current "show every
# category" behaviour instead of relying on the pandas default.
titanic.pivot_table(
    values="survived",
    index=["sex", age],
    columns="class",
    observed=False,
)

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 90]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 90]",0.375,0.071429,0.133663
