<a href="https://colab.research.google.com/github/NandiniBasdwar7/concise/blob/main/day11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#GroupBy and Aggregations
#Groupby()-he DataFrame.groupby() method in Pandas is a powerful tool for analyzing and manipulating data by grouping rows or columns based on one or more criteria. It is often likened to the GROUP BY clause in SQL. This function facilitates the "split-apply-combine" strategy for data analysis
import pandas as pd
import numpy as np

# Sample DataFrame for demonstration
data = {
    'Category': ['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B'],
    'SubCategory': ['X', 'Y', 'X', 'Z', 'Y', 'W', 'Z', 'W'],
    'Value1': [10, 20, 15, 25, 30, 12,22, 28 ],
    'Value2': [1, 2, 1, 3, 2, 4, 3, 4]
}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print("\n")


Original DataFrame:
  Category SubCategory  Value1  Value2
0        A           X      10       1
1        B           Y      20       2
2        A           X      15       1
3        C           Z      25       3
4        B           Y      30       2
5        A           W      12       4
6        C           Z      22       3
7        B           W      28       4




In [3]:
#by parameter-Grouping by a single column: Data is grouped by the unique values in the 'Category' column, and then the sum of 'Value1' is calculated for each category.
grouped_single = df.groupby('Category')['Value1'].sum()
print("Grouped by 'Category':")
print(grouped_single)
print("\n")

Grouped by 'Category':
Category
A    37
B    78
C    47
Name: Value1, dtype: int64




In [19]:
#Grouping by multiple columns

grouped_multiple = df.groupby([ 'SubCategory','Category'])['Value2'].mean()
print("Grouped by 'Category' and 'SubCategory':")
print(grouped_multiple)
print("\n")


Grouped by 'Category' and 'SubCategory':
SubCategory  Category
W            A           4.0
             B           4.0
X            A           1.0
Y            B           2.0
Z            C           3.0
Name: Value2, dtype: float64




In [22]:
import pandas as pd

data = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
df = pd.DataFrame(data)

# Group by rows (default axis=0), then calculate the sum of each group
grouped_rows = df.groupby('col1',axis=0).sum()  # This will group by index
print(grouped_rows)


      col2
col1      
1        4
2        5
3        6


  grouped_rows = df.groupby('col1',axis=0).sum()  # This will group by index


In [23]:
#level-The level parameter is used when working with a MultiIndex (hierarchical index) DataFrame. It allows you to group data based on one or more specific levels of the MultiIndex.
import pandas as pd

# Create a MultiIndex DataFrame
index = pd.MultiIndex.from_arrays([['East', 'East', 'West', 'West'],
                                    ['New York', 'Boston', 'Los Angeles', 'Seattle']],
                                    names=['Region', 'City'])
data = {'Sales': [100, 150, 200, 120]}
df_multiindex = pd.DataFrame(data, index=index)

# Group by the 'Region' level of the MultiIndex (level 0)
grouped_by_region = df_multiindex.groupby(level='Region')['Sales'].sum()
print(grouped_by_region)


Region
East    250
West    320
Name: Sales, dtype: int64


In [25]:
#as_index-This boolean parameter controls whether the group keys should be set as the index of the resulting DataFrame (True by default)
import pandas as pd

data = {'Category': ['A', 'B', 'A', 'C', 'B'],
        'Value': [10, 20, 15, 25, 30]}
df = pd.DataFrame(data)

grouped_data = df.groupby('Category')['Value'].sum()
print(grouped_data)

Category
A    25
B    50
C    25
Name: Value, dtype: int64


In [28]:
import pandas as pd

data = {'Category': ['A', 'B', 'A', 'C', 'B'],
        'Value': [10, 20, 15, 25, 30]}
df = pd.DataFrame(data)

# Using group_keys=True (default) with apply
grouped_with_keys = df.groupby('Category', group_keys=True).apply(lambda x: x['Value'] * 2)
print(grouped_with_keys)

# Using group_keys=False
grouped_without_keys = df.groupby('Category', group_keys=False).apply(lambda x: x['Value'] * 2)
print(grouped_without_keys)

Category   
A         0    20
          2    30
B         1    40
          4    60
C         3    50
Name: Value, dtype: int64
0    20
2    30
1    40
4    60
3    50
Name: Value, dtype: int64


  grouped_with_keys = df.groupby('Category', group_keys=True).apply(lambda x: x['Value'] * 2)
  grouped_without_keys = df.groupby('Category', group_keys=False).apply(lambda x: x['Value'] * 2)


In [29]:
#Basic Aggregation:-A single aggregation function can be applied to a DataFrame or Series.
import pandas as pd

data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
df = pd.DataFrame(data)

# Calculate the sum of each column
result = df.agg('sum')
print(result)

A     6
B    15
dtype: int64


In [32]:
#Applying Multiple Functions:-Multiple functions can be applied at once by passing a list of function names to agg().
import pandas as pd

data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
df = pd.DataFrame(data)
result=df.agg(['sum','mean'])
print(result)

        A     B
sum   6.0  15.0
mean  2.0   5.0


In [34]:
#Different Aggregations for Different Columns:
import pandas as pd

data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
df = pd.DataFrame(data)
result=df.agg({'A':'sum',"B":'mean'})
print(result)

A    6.0
B    5.0
dtype: float64


In [37]:
#Custom Aggregation Functions
import pandas as pd

data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
df = pd.DataFrame(data)
def custom_sum(x):
    return x.max()-x.min()
result=df.agg({'A':custom_sum,"B":'mean'})
print(result)

A    2.0
B    5.0
dtype: float64
