In [1]:
import pandas as pd
import numpy as np

In [2]:
brics = pd.read_csv("https://raw.githubusercontent.com/MohamedMostafa259/Pandas-Notes/refs/heads/main/Data/brics.csv", index_col=0) # OR: index_col=["col0_name"]
brics

Unnamed: 0,country,capital,area,population
BR,Brazil,Brasilia,8.516,200.4
RU,Russia,Moscow,17.1,143.5
IN,India,New Delhi,3.286,1252.0
CH,China,Beijing,9.597,1357.0
SA,South Africa,Pretoria,1.221,52.98


In [3]:
sales = pd.read_csv("https://raw.githubusercontent.com/MohamedMostafa259/Pandas-Notes/refs/heads/main/Data/sales_sample.csv")
sales # last 3 rows are the same

Unnamed: 0,store,type,department,date,weekly_sales,is_holiday,temperature_c,fuel_price_usd_per_l,unemployment
0,1,A,1,2010-02-05,24924.5,False,5.728,0.679,8.106
1,2,A,1,2010-03-05,21827.9,False,8.056,0.693,8.106
2,3,B,1,2010-04-02,57258.43,True,16.817,0.718,7.808
3,4,B,2,2010-05-07,17413.94,False,22.528,0.749,7.808
4,3,B,2,2010-06-04,17558.09,False,27.05,0.715,7.808
5,6,C,2,2010-06-04,17558.09,True,27.05,0.715,7.808
6,7,C,3,2010-06-04,17558.09,False,27.05,0.715,7.808
7,7,C,3,2010-06-04,17558.09,False,27.05,0.715,7.808
8,7,C,3,2010-06-04,17558.09,False,27.05,0.715,7.808


In [4]:
books = pd.read_csv("https://raw.githubusercontent.com/MohamedMostafa259/Pandas-Notes/refs/heads/main/Data/books.csv")
books.head()

Unnamed: 0,name,author,rating,year,genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,2019,Childrens


In [5]:
salaries = pd.read_csv("https://raw.githubusercontent.com/MohamedMostafa259/Pandas-Notes/refs/heads/main/Data/salaries.csv")
salaries.head()

Unnamed: 0,Working_Year,Designation,Experience,Employment_Status,Employee_Location,Company_Size,Remote_Working_Ratio,Salary_USD
0,2020,Data Scientist,Mid,FT,DE,L,0,76227.0
1,2020,Machine Learning Scientist,Senior,FT,JP,S,0,248257.0
2,2020,Big Data Engineer,Senior,FT,GB,M,50,104100.0
3,2020,Product Data Analyst,Mid,FT,HN,S,0,19097.0
4,2020,Machine Learning Engineer,Senior,FT,US,L,50,143225.0


# Single Summary Statistics

In [6]:
print("mean:", brics["population"].mean())
print("median:", brics["population"].median())
print("mode:", brics["population"].mode())
print("standard deviation:", brics["population"].std())
print("variance:", brics["population"].var())
print("min:", brics["population"].min())
print("max:", brics["population"].max())
print("sum:", brics["population"].sum())
print("count:", brics["population"].count()) # `.count()` gives you the number of non-missing values
print("1st quartile:", brics["population"].quantile(0.25)) # computes the 25th percentile of the "population" col
brics.describe()

# `.first()`: returns the 1st row, `last()`: returns the last row, 
# `.size()`: returns the total number of elements, including any missing values

mean: 601.176
median: 200.4
mode: 0      52.98
1     143.50
2     200.40
3    1252.00
4    1357.00
Name: population, dtype: float64
standard deviation: 645.2614538929162
variance: 416362.34388
min: 52.98
max: 1357.0
sum: 3005.88
count: 5
1st quartile: 143.5


Unnamed: 0,area,population
count,5.0,5.0
mean,7.944,601.176
std,6.200557,645.261454
min,1.221,52.98
25%,3.286,143.5
50%,8.516,200.4
75%,9.597,1252.0
max,17.1,1357.0


# Multiple Summary Statistics

In [7]:
def pct25(col):
	return col.quantile(0.25)
def pct75(col):
	return col.quantile(0.75)
def IQR(col):
	return pct75(col) - pct25(col)

# either agg() or aggregate() allows you to compute aggregated and custom summary statistics 
# they can take function pointers (function name) OR lambda functions 
print(brics["population"].agg(pct25), "\n") # returns a value , equivalent => brics["population"].quantile(0.25)
print(brics[["population", "area"]].agg(pct25), '\n') # returns Series , equivalent => brics[["area", "population"]].quantile(0.25)

# Here is the real power of agg() and aggregate() functions => Multiple Summary Statistics
print(brics["population"].agg([lambda x: x.quantile(0.25), lambda x: x.quantile(0.75)]), '\n') # returns Series
print(brics[["area", "population"]].agg([pct25, pct75, IQR, np.mean])) # returns DataFrame

143.5 

population    143.500
area            3.286
dtype: float64 

<lambda>     143.5
<lambda>    1252.0
Name: population, dtype: float64 

        area  population
pct25  3.286     143.500
pct75  9.597    1252.000
IQR    6.311    1108.500
mean   7.944     601.176


  print(brics[["area", "population"]].agg([pct25, pct75, IQR, np.mean])) # returns DataFrame
  print(brics[["area", "population"]].agg([pct25, pct75, IQR, np.mean])) # returns DataFrame


# Cumulative Summary Statistics

In [8]:
print(brics["population"], '\n')
brics["population"].cumsum() # returns Series
# There are also other functions, like cummax(), cummin(), and cumprod()

BR     200.40
RU     143.50
IN    1252.00
CH    1357.00
SA      52.98
Name: population, dtype: float64 



BR     200.40
RU     343.90
IN    1595.90
CH    2952.90
SA    3005.88
Name: population, dtype: float64

# Grouped Summary Statistics
Think of a `DataFrameGroupBy` object as a collection of smaller DataFrames, each corresponding to a group defined by the `groupby` operation. Each of these smaller DataFrames contains the subset of rows from the original DataFrame that belong to that group.

In [9]:
# What if we want to get weekly_sales mean of each type
# Idea1: verbose way :(
print(sales[sales["type"] == "A"]["weekly_sales"].mean())
print(sales[sales["type"] == "B"]["weekly_sales"].mean())
print(sales[sales["type"] == "C"]["weekly_sales"].mean())

23376.2
30743.486666666664
17558.09


In [10]:
# Idea2: let's group by type!
grouped_type = sales.groupby(["type"]) # Returns a groupby object (DataFrameGroupBy) that contains information about the groups.
									   # grouped_type["weekly_sales"] returns SeriesGroupBy object

for group_name, group_df in grouped_type:
	print(f"Group: {group_name}")
	print(group_df, '\n')

# imagine `DataFrameGroupBy` object like the table below
"""
		Column1     Column2     Column3
Group1  Series11    Series12    Series13    => Series11 + Series12 + Series13 = subset of the original DataFrame contains Group1

Group2  ...         ...         ...         => ...

Group3  ...         ...         ...         => ...
"""

Group: ('A',)
   store type  department        date  weekly_sales  is_holiday  \
0      1    A           1  2010-02-05       24924.5       False   
1      2    A           1  2010-03-05       21827.9       False   

   temperature_c  fuel_price_usd_per_l  unemployment  
0          5.728                 0.679         8.106  
1          8.056                 0.693         8.106   

Group: ('B',)
   store type  department        date  weekly_sales  is_holiday  \
2      3    B           1  2010-04-02      57258.43        True   
3      4    B           2  2010-05-07      17413.94       False   
4      3    B           2  2010-06-04      17558.09       False   

   temperature_c  fuel_price_usd_per_l  unemployment  
2         16.817                 0.718         7.808  
3         22.528                 0.749         7.808  
4         27.050                 0.715         7.808   

Group: ('C',)
   store type  department        date  weekly_sales  is_holiday  \
5      6    C           2  2010

'\n\t\tColumn1     Column2     Column3\nGroup1  Series11    Series12    Series13    => Series11 + Series12 + Series13 = subset of the original DataFrame contains Group1\n\nGroup2  ...         ...         ...         => ...\n\nGroup3  ...         ...         ...         => ...\n'

In [11]:
print(grouped_type["weekly_sales"].mean(), '\n')
print(grouped_type[["weekly_sales", "temperature_c"]].mean(), '\n')
print(sales.groupby(["type", "department"])[["weekly_sales", "temperature_c"]].mean(), '\n')
print(sales.groupby(["type", "department"])[["weekly_sales", "temperature_c"]].agg([max, min, sum]), '\n')

type
A    23376.200000
B    30743.486667
C    17558.090000
Name: weekly_sales, dtype: float64 

      weekly_sales  temperature_c
type                             
A     23376.200000       6.892000
B     30743.486667      22.131667
C     17558.090000      27.050000 

                 weekly_sales  temperature_c
type department                             
A    1              23376.200          6.892
B    1              57258.430         16.817
     2              17486.015         24.789
C    2              17558.090         27.050
     3              17558.090         27.050 

                weekly_sales                     temperature_c                
                         max       min       sum           max     min     sum
type department                                                               
A    1              24924.50  21827.90  46752.40         8.056   5.728  13.784
B    1              57258.43  57258.43  57258.43        16.817  16.817  16.817
     2              

  print(sales.groupby(["type", "department"])[["weekly_sales", "temperature_c"]].agg([max, min, sum]), '\n')
  print(sales.groupby(["type", "department"])[["weekly_sales", "temperature_c"]].agg([max, min, sum]), '\n')
  print(sales.groupby(["type", "department"])[["weekly_sales", "temperature_c"]].agg([max, min, sum]), '\n')
  print(sales.groupby(["type", "department"])[["weekly_sales", "temperature_c"]].agg([max, min, sum]), '\n')


### Access Values From A DataFrameGroupBy Object

In [12]:
group_A = grouped_type.get_group('A') # returns DataFrame
sales_A = group_A["weekly_sales"].sum()
sales_A # sales_A = grouped_type.get_group('A')["weekly_sales"].sum()

  group_A = grouped_type.get_group('A') # returns DataFrame


46752.4

### Explanation of the `as_index` Argument in the `groupby` Method

In [13]:
# multi-level index
print(sales.groupby(["type", "department"])["weekly_sales"].mean(), '\n\n')

# 'type' and 'department' will be treated as regular columns 
# convenient when you want to perform further operations on these columns or when you prefer a flat DataFrame structure
print(sales.groupby(["type", "department"], as_index=False)["weekly_sales"].mean())

type  department
A     1             23376.200
B     1             57258.430
      2             17486.015
C     2             17558.090
      3             17558.090
Name: weekly_sales, dtype: float64 


  type  department  weekly_sales
0    A           1     23376.200
1    B           1     57258.430
2    B           2     17486.015
3    C           2     17558.090
4    C           3     17558.090


### Specifying Aggregation with Columns

In [14]:
# normal use of `.agg()` is, for example, `.agg([np.mean, min, max])`, 
# but we may want to specify as, for instance, categorical data doesn't have a mean value
books.groupby("genre").agg({"rating":["mean", "std"], "year":["median"]})

Unnamed: 0_level_0,rating,rating,year
Unnamed: 0_level_1,mean,std,median
genre,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Childrens,4.78,0.12237,2015.0
Fiction,4.570229,0.281123,2013.0
Non Fiction,4.598324,0.179411,2013.0


In [15]:
# We can create named columns with our desired aggregations by using the dot-agg function and creating named tuples inside it. 
# Each named tuple should include a column name followed by the aggregating function to apply to that column. 
# The name of the tuple becomes the name of the resulting column.
books.groupby("genre").agg(
	mean_rating = ("rating", "mean"),
	std_rating = ("rating", "std"),
	median_year = ("year", "median")
	)

Unnamed: 0_level_0,mean_rating,std_rating,median_year
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Childrens,4.78,0.12237,2015.0
Fiction,4.570229,0.281123,2013.0
Non Fiction,4.598324,0.179411,2013.0


### Adding Summary Statistics into a DataFrame
Sometimes, we might prefer to add summary statistics directly into our DataFrame, rather than creating a summary table (e.g., pivot table)

In [16]:
# Let's say we would like to create a new column containing the standard deviation of Salary_USD, 
# where values are conditional based on the Experience column.

# Note that the `x` parameter in the `transform()` function is a Series
# `transform()` returns a Series having the same indexes as the original object filled with the transformed values
salaries["std_dev"] = salaries.groupby("Experience")["Salary_USD"].transform(lambda x: x.std())
salaries[["Experience", "std_dev"]].value_counts()

Experience  std_dev      
Senior      55086.264071     280
Mid         61014.876754     213
Entry       42390.355189      88
Executive   111783.576779     26
Name: count, dtype: int64

## Pivot Tables
groupby is generally faster and more memory-efficient for simple aggregations and transformations because it performs direct operations on the DataFrame without the additional overhead of creating a pivot table structure.

→ Pivot Tables are sorted DataFrames on the index and columns
-   Result: You can use .loc[] + slicing on them (For more explanation, see ***Slicing DataFrames PART 2*** cells in this notebook)
	-	Edit: I forgot where I wrote these cells!!

In [17]:
# sales.groupby("type")["weekly_sales"].mean()
# 'index' and 'columns' arguments are the variables that you want to group by WHILE 'values' arg is the variables that you want to summarize (aggregate)
# index → displays columns to group by in rows   -   columns → displays columns to group by in columns
sales.pivot_table(index="type", values="weekly_sales") # by default, it calculates the mean value of each group -- Returns DataFrame

Unnamed: 0_level_0,weekly_sales
type,Unnamed: 1_level_1
A,23376.2
B,30743.486667
C,17558.09


In [18]:
# Grouping → using multi-level index
sales.pivot_table(index=["type", "department"], values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum])

  sales.pivot_table(index=["type", "department"], values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum])
  sales.pivot_table(index=["type", "department"], values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum])
  sales.pivot_table(index=["type", "department"], values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum])


Unnamed: 0_level_0,Unnamed: 1_level_0,min,min,max,max,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,temperature_c,weekly_sales,temperature_c,weekly_sales,temperature_c,weekly_sales
type,department,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
A,1,5.728,21827.9,8.056,24924.5,13.784,46752.4
B,1,16.817,57258.43,16.817,57258.43,16.817,57258.43
B,2,22.528,17413.94,27.05,17558.09,49.578,34972.03
C,2,27.05,17558.09,27.05,17558.09,27.05,17558.09
C,3,27.05,17558.09,27.05,17558.09,81.15,52674.27


In [19]:
# Grouping → using index and columns
# To group by two variables, we can pass the 2nd variable name into the 'columns' argument
sales_type_vs_dept_pvt = sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum])
# Equivalent to: sales.groupby(["type", "department"])[["weekly_sales", "temperature_c"]].agg([min, max, sum]).unstack("department")
# `.unstack()` reshapes the DataFrame so that 'department' values become columns, similar to the pivot_table output.
sales_type_vs_dept_pvt

  sales_type_vs_dept_pvt = sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum])
  sales_type_vs_dept_pvt = sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum])
  sales_type_vs_dept_pvt = sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum])


Unnamed: 0_level_0,min,min,min,min,min,min,max,max,max,max,max,max,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales
department,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
type,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3
A,5.728,,,21827.9,,,8.056,,,24924.5,,,13.784,,,46752.4,,
B,16.817,22.528,,57258.43,17413.94,,16.817,27.05,,57258.43,17558.09,,16.817,49.578,,57258.43,34972.03,
C,,27.05,27.05,,17558.09,17558.09,,27.05,27.05,,17558.09,17558.09,,27.05,81.15,,17558.09,52674.27


In [20]:
# 'fill_value' argument is used to fill NaNs by a certain value (fill_value=0 →→→ fill missing values with 0s)
# 'margins=True' creates an additional row and column to calculate the summary statistic function of the entire dataset w/o considering NaNs into calculations
sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum], fill_value=0, margins=True)

  sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum], fill_value=0, margins=True)
  sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum], fill_value=0, margins=True)
  sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum], fill_value=0, margins=True)
  sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum], fill_value=0, margins=True)
  sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum], fill_value=0, margins=True)
  sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum], fill_value=0, margins=True)
  sales.pivot_table(index="type", columns="department", values=["weekly_sales", "temperature_c

Unnamed: 0_level_0,min,min,min,min,min,min,min,min,max,max,max,max,max,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,temperature_c,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales,weekly_sales,temperature_c,temperature_c,...,weekly_sales,weekly_sales,temperature_c,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales,weekly_sales
department,1,2,3,All,1,2,3,All,1,2,...,3,All,1,2,3,All,1,2,3,All
type,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
A,5.728,0.0,0.0,5.728,21827.9,0.0,0.0,21827.9,8.056,0.0,...,0.0,24924.5,13.784,0.0,0.0,13.784,46752.4,0.0,0.0,46752.4
B,16.817,22.528,0.0,16.817,57258.43,17413.94,0.0,17413.94,16.817,27.05,...,0.0,57258.43,16.817,49.578,0.0,66.395,57258.43,34972.03,0.0,92230.46
C,0.0,27.05,27.05,27.05,0.0,17558.09,17558.09,17558.09,0.0,27.05,...,17558.09,17558.09,0.0,27.05,81.15,108.2,0.0,17558.09,52674.27,70232.36
All,5.728,22.528,27.05,5.728,21827.9,17413.94,17558.09,17413.94,16.817,27.05,...,17558.09,57258.43,30.601,76.628,81.15,188.379,104010.83,52530.12,52674.27,209215.22


**The choice between using index and columns versus a multi-level index depends on how you want to organize and display your data ↑↑**

In [21]:
%timeit sales.groupby(["type", "department"])[["weekly_sales", "temperature_c"]].agg([max, min, sum])
%timeit sales.pivot_table(index=["type", "department"], values=["weekly_sales", "temperature_c"], aggfunc=[min, max, sum])



4.22 ms ± 558 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)




16.1 ms ± 2.92 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)




In [22]:
import time

# Create a large dataset
np.random.seed(0) # to ensure reducibility
data = pd.DataFrame({
	'type': np.random.choice(['A', 'B', 'C'], size=1000000),
	'weekly_sales': np.random.rand(1000000) * 1000 # Generates an array of 1,000,000 random numbers between 0 and 1000
})

# Using groupby
start_time = time.time() # records current time
sales_grouped = data.groupby("type")["weekly_sales"].sum()
groupby_time = time.time() - start_time

# Using pivot_table
start_time = time.time() # records current time
sales_pivot = data.pivot_table(values="weekly_sales", index="type", aggfunc="sum")
pivot_time = time.time() - start_time

print(f"Groupby time: {groupby_time:.4f} seconds")
print(f"Pivot table time: {pivot_time:.4f} seconds")

Groupby time: 0.0488 seconds
Pivot table time: 0.0510 seconds


#### .loc[] + slicing on pivot tables (Remember that Pivot Tables are sorted DataFrames on indexes and columns)

In [23]:
sales_type_vs_dept_pvt.loc["A":"B"]

Unnamed: 0_level_0,min,min,min,min,min,min,max,max,max,max,max,max,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales
department,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
type,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3
A,5.728,,,21827.9,,,8.056,,,24924.5,,,13.784,,,46752.4,,
B,16.817,22.528,,57258.43,17413.94,,16.817,27.05,,57258.43,17558.09,,16.817,49.578,,57258.43,34972.03,


##### Calculating Summary Statistics along Different Axes
1.  `sales_type_vs_dept_pvt.mean(axis="index")`
	-   This line calculates the mean along the rows (index) of the pivot table.
	-   Each column in the pivot table will have its mean value calculated.
	-   The result is a Series where each value represents the mean of the values in the corresponding column of the pivot table.

2.  `sales_type_vs_dept_pvt.mean(axis="columns")`
	-   This line calculates the mean along the columns of the pivot table.
	-   Each row in the pivot table will have its mean value calculated.
	-   The result is a Series where each value represents the mean of the values in the corresponding row of the pivot table.

→ along the rows = across the rows <br>
→ along the columns = across the columns

**N.B.** For most DataFrames, setting the axis argument doesn't make any sense, since you'll have different data types in each column. Pivot tables are a special case since every column contains the same data type.

In [24]:
sales_type_vs_dept_pvt

Unnamed: 0_level_0,min,min,min,min,min,min,max,max,max,max,max,max,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales,temperature_c,temperature_c,temperature_c,weekly_sales,weekly_sales,weekly_sales
department,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3,1,2,3
type,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3
A,5.728,,,21827.9,,,8.056,,,24924.5,,,13.784,,,46752.4,,
B,16.817,22.528,,57258.43,17413.94,,16.817,27.05,,57258.43,17558.09,,16.817,49.578,,57258.43,34972.03,
C,,27.05,27.05,,17558.09,17558.09,,27.05,27.05,,17558.09,17558.09,,27.05,81.15,,17558.09,52674.27


In [25]:
# The methods for calculating summary statistics on DataFrames, such as mean(), median(), ..., have an 'axis' argument.
print(sales_type_vs_dept_pvt.mean(), '\n') # `axis="index"` OR `axis=0`, by default 
sales_type_vs_dept_pvt.mean(axis=1) # OR: `axis=1`
# sales_type_vs_dept_pvt should be named: weeklySales_and_temperature_c_by_type_vs_dept_pvt →→ valuesToAggregate_by_indexes_vs_columns_pvt

                    department
min  temperature_c  1                11.2725
                    2                24.7890
                    3                27.0500
     weekly_sales   1             39543.1650
                    2             17486.0150
                    3             17558.0900
max  temperature_c  1                12.4365
                    2                27.0500
                    3                27.0500
     weekly_sales   1             41091.4650
                    2             17558.0900
                    3             17558.0900
sum  temperature_c  1                15.3005
                    2                38.3140
                    3                81.1500
     weekly_sales   1             52005.4150
                    2             26265.0600
                    3             52674.2700
dtype: float64 



type
A    15588.728000
B    20155.746417
C    11723.426667
dtype: float64