## Aggregation

Агрегирование данных — это сбор информации из баз данных с целью подготовки комбинированных наборов данных для последующей обработки.

In [1]:
import numpy as np
import pandas as pd

### Built-in aggregation methods in Pandas
An aggregation method takes a Series of values and returns a single value.

In [2]:
# Five random integers (bounds low=1, high=100) wrapped in a Series.
# No seed is set here, so the values differ on every run.
numbers = pd.Series(data=np.random.randint(1, 100, size=5))
numbers

0    85
1    45
2    94
3    18
4    63
dtype: int32

Some of the aggregation methods pandas provides are:

In [9]:
# Each f-string uses the `=` specifier, so the expression text is printed
# followed by its value.
print(f"{numbers.min() =}")
print(f"{numbers.max() =}")
print(f"{numbers.mean() =}")
print(f"{numbers.median() =}")
# mode() returns a Series rather than a scalar: every value tied for
# "most frequent" is listed.
print(f"{numbers.mode() =}")
# size is an attribute (no parentheses): the total number of elements.
print(f"{numbers.size =}")

numbers.min() =18
numbers.max() =94
numbers.mean() =61.0
numbers.median() =63.0
numbers.mode() =0    18
1    45
2    63
3    85
4    94
dtype: int32
numbers.size =5


In [8]:
# mode() returns a Series, so take its first entry to get a single value.
print(f"{numbers.mode()[0] =}")

numbers.mode()[0] =18


In [6]:
# Dividing the sum by the count reproduces the mean by hand.
print(f"{numbers.sum() =}")
print(f"{numbers.count() =}")
print(f"{numbers.sum() / numbers.count() =}")

numbers.sum() =305
numbers.count() =5
numbers.sum() / numbers.count() =61.0


When run on a DataFrame (multiple Series), they return a single value for each Series, forming a new Series.

In [11]:
# A 5x5 DataFrame of random integers (bounds low=1, high=100).
# No seed is set here, so the values differ on every run.
numbers_df = pd.DataFrame(data=np.random.randint(1, 100, size=(5, 5)))
numbers_df

Unnamed: 0,0,1,2,3,4
0,85,1,82,93,48
1,16,80,15,25,12
2,5,63,39,71,3
3,21,30,13,36,23
4,83,96,58,70,9


In [12]:
numbers_df.min() # minimum of each column; the default is axis="index"

0     5
1     1
2    13
3    25
4     3
dtype: int32

In [13]:
numbers_df.min(axis="columns") # minimum of each row

0     1
1    12
2     3
3    13
4     9
dtype: int32

In [14]:
numbers_df.min().min() # smallest value in the whole DataFrame: the minimum of the per-column minimums

1

In [16]:
# Median of each row: axis="columns" aggregates across the columns.
numbers_df.median(axis="columns")

0    82.0
1    16.0
2    39.0
3    23.0
4    70.0
dtype: float64

In [17]:
numbers_df.isna() # True wherever a value is missing (N/A)

Unnamed: 0,0,1,2,3,4
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False


Count null values in each column:

In [19]:
# Summing the boolean mask column by column counts the True entries,
# i.e. the number of missing values in each column.
numbers_df.isna().sum()

0    0
1    0
2    0
3    0
4    0
dtype: int64

In [20]:
# Put a missing value at label 3.
# NaN is a float, so the integer Series is cast to float64 explicitly first
# instead of relying on implicit upcasting during assignment (deprecated in
# recent pandas releases).
# `np.nan` is the canonical spelling; the `np.NAN` alias was removed in NumPy 2.0.
numbers = numbers.astype("float64")
numbers[3] = np.nan
numbers

0    85.0
1    45.0
2    94.0
3     NaN
4    63.0
dtype: float64

In [22]:
# size counts every element, while count() skips missing (NaN) values.
print(f"{numbers.size =}")
print(f"{numbers.count() =}")

numbers.size =5
numbers.count() =4


In [23]:
# Boolean mask: True at the positions that hold NaN.
numbers.isna()

0    False
1    False
2    False
3     True
4    False
dtype: bool

In [24]:
# Summing the mask counts the True entries, i.e. the number of missing values.
numbers.isna().sum()

1