# Slicing and Dicing Dataframes

You have seen how to do indexing of dataframes using ```df.iloc``` and ```df.loc```. Now, let's see how to subset dataframes based on certain conditions. 


In [2]:
# loading libraries and reading the data
import numpy as np
import pandas as pd

# Read the market fact table from its CSV file and preview the first rows
market_fact_path = "global_sales_data/market_fact.csv"
df = pd.read_csv(market_fact_path)
df.head()

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
0,Ord_5446,Prod_16,SHP_7609,Cust_1818,136.81,0.01,23,-30.51,3.6,0.56
1,Ord_5406,Prod_13,SHP_7549,Cust_1818,42.27,0.01,13,4.56,0.93,0.54
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.69,0.0,26,1148.9,2.5,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.89,0.09,43,729.34,14.3,0.37
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.15,0.08,35,1219.87,26.3,0.38


### Subsetting Rows Based on Conditions

Often, you want to select rows which satisfy some given conditions. For example, select all the orders where ```Sales > 3000```, or all the orders where ```2000 < Sales < 3000``` and ```Profit > 100```.

Arguably, the best way to do these operations is using ```df.loc[]```, since ```df.iloc[]``` would require you to remember the integer column indices, which is tedious.

Let's see some examples.

In [11]:
# Goal: pick out every row whose Sales value exceeds 3000.
# Step 1: compare the Sales column against the threshold. The comparison is
# element-wise and yields a boolean Series (True where Sales > 3000).
is_high_sales = df.Sales > 3000
is_high_sales

0       False
1       False
2        True
3       False
4        True
5       False
6       False
7        True
8       False
9       False
10       True
11      False
12      False
13       True
14      False
15      False
16      False
17       True
18      False
19      False
20      False
21      False
22       True
23      False
24      False
25      False
26      False
27      False
28       True
29      False
        ...  
8369    False
8370    False
8371     True
8372    False
8373    False
8374    False
8375    False
8376    False
8377    False
8378    False
8379    False
8380    False
8381    False
8382    False
8383     True
8384    False
8385    False
8386    False
8387    False
8388    False
8389    False
8390    False
8391    False
8392    False
8393    False
8394    False
8395    False
8396    False
8397     True
8398    False
Name: Sales, Length: 8399, dtype: bool

In [12]:
# Step 2: hand the boolean mask to df.loc; only the rows flagged True come back
sales_above_3000 = df.Sales > 3000
df.loc[sales_above_3000]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.10,48,1137.91,0.99,0.55
10,Ord_4743,Prod_2,SHP_6615,Cust_1641,4072.0100,0.01,43,1675.98,0.99,0.56
13,Ord_2207,Prod_11,SHP_3093,Cust_839,3364.2480,0.10,15,-693.23,61.76,0.78
17,Ord_4471,Prod_15,SHP_6228,Cust_1521,13255.9300,0.02,25,4089.27,26.00,0.60
22,Ord_996,Prod_5,SHP_1377,Cust_371,3202.2500,0.09,44,991.26,19.99,0.43
28,Ord_2573,Prod_4,SHP_3527,Cust_931,3594.7435,0.05,38,1016.97,2.50,0.55
40,Ord_5035,Prod_15,SHP_7024,Cust_1710,4917.6900,0.02,42,126.31,30.00,0.71
57,Ord_4546,Prod_1,SHP_6327,Cust_1474,5208.7800,0.05,34,1547.78,7.07,0.59


In [13]:
# Bracket notation, df['Sales'], is interchangeable with df.Sales.
# Passing ':' as the column selector states explicitly that
# every column should be returned.
sales_mask = df['Sales'] > 3000
df.loc[sales_mask, :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.10,48,1137.91,0.99,0.55
10,Ord_4743,Prod_2,SHP_6615,Cust_1641,4072.0100,0.01,43,1675.98,0.99,0.56
13,Ord_2207,Prod_11,SHP_3093,Cust_839,3364.2480,0.10,15,-693.23,61.76,0.78
17,Ord_4471,Prod_15,SHP_6228,Cust_1521,13255.9300,0.02,25,4089.27,26.00,0.60
22,Ord_996,Prod_5,SHP_1377,Cust_371,3202.2500,0.09,44,991.26,19.99,0.43
28,Ord_2573,Prod_4,SHP_3527,Cust_931,3594.7435,0.05,38,1016.97,2.50,0.55
40,Ord_5035,Prod_15,SHP_7024,Cust_1710,4917.6900,0.02,42,126.31,30.00,0.71
57,Ord_4546,Prod_1,SHP_6327,Cust_1474,5208.7800,0.05,34,1547.78,7.07,0.59


In [14]:
# Several conditions are joined with '&' (element-wise AND). When written
# inline, each comparison needs its own parentheses because '&' binds more
# tightly than '>' and '<'.
# Example: all orders having 2000 < Sales < 3000 and Profit > 100
sales_over_2000 = df.Sales > 2000
sales_under_3000 = df.Sales < 3000
profit_over_100 = df.Profit > 100
df.loc[sales_over_2000 & sales_under_3000 & profit_over_100, :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.8900,0.09,43,729.34,14.30,0.37
81,Ord_5205,Prod_4,SHP_7274,Cust_1749,2546.5235,0.09,26,210.00,7.69,0.59
109,Ord_139,Prod_17,SHP_186,Cust_45,2671.2100,0.06,14,636.18,15.59,0.36
110,Ord_239,Prod_4,SHP_332,Cust_45,2157.3085,0.00,38,519.25,5.31,0.57
141,Ord_1673,Prod_17,SHP_2314,Cust_498,2027.5500,0.04,14,537.40,13.99,0.37
146,Ord_1649,Prod_4,SHP_2278,Cust_498,2209.5155,0.06,41,458.62,5.31,0.57
176,Ord_2273,Prod_8,SHP_3101,Cust_878,2899.9800,0.10,19,666.01,5.50,0.49
184,Ord_5230,Prod_4,SHP_7304,Cust_1753,2197.4115,0.00,20,305.96,7.69,0.58
186,Ord_5159,Prod_5,SHP_7210,Cust_1753,2213.9200,0.03,20,768.34,13.99,0.65
230,Ord_657,Prod_4,SHP_899,Cust_209,2343.0760,0.06,24,311.64,8.99,0.58


In [15]:
# Element-wise OR is written '|' (the Python keyword 'or' doesn't work with pandas)
# Example: all orders having Sales > 2000 OR Profit > 100
sales_over_2000 = df.Sales > 2000
profit_over_100 = df.Profit > 100
df.loc[sales_over_2000 | profit_over_100, :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.8900,0.09,43,729.34,14.30,0.37
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.10,48,1137.91,0.99,0.55
10,Ord_4743,Prod_2,SHP_6615,Cust_1641,4072.0100,0.01,43,1675.98,0.99,0.56
13,Ord_2207,Prod_11,SHP_3093,Cust_839,3364.2480,0.10,15,-693.23,61.76,0.78
16,Ord_2282,Prod_9,SHP_3122,Cust_839,443.4600,0.06,30,193.12,1.39,0.38
17,Ord_4471,Prod_15,SHP_6228,Cust_1521,13255.9300,0.02,25,4089.27,26.00,0.60
22,Ord_996,Prod_5,SHP_1377,Cust_371,3202.2500,0.09,44,991.26,19.99,0.43
27,Ord_2405,Prod_9,SHP_3300,Cust_931,1062.6900,0.01,28,401.80,6.66,0.40


In [16]:
# Row filter: orders having 2000 < Sales < 3000 and Profit > 100
# Column filter: keep only the Cust_id, Sales and Profit columns
row_filter = (df.Sales > 2000) & (df.Sales < 3000) & (df.Profit > 100)
wanted_columns = ['Cust_id', 'Sales', 'Profit']
df.loc[row_filter, wanted_columns]

Unnamed: 0,Cust_id,Sales,Profit
3,Cust_1818,2337.8900,729.34
81,Cust_1749,2546.5235,210.00
109,Cust_45,2671.2100,636.18
110,Cust_45,2157.3085,519.25
141,Cust_498,2027.5500,537.40
146,Cust_498,2209.5155,458.62
176,Cust_878,2899.9800,666.01
184,Cust_1753,2197.4115,305.96
186,Cust_1753,2213.9200,768.34
230,Cust_209,2343.0760,311.64


In [17]:
# You can use the == and != operators; each produces a boolean mask
# just like > and < do.
# A notebook cell renders only its final expression, so the == result is
# shown explicitly with display() (available without an import inside
# Jupyter/IPython); otherwise it would be computed and silently discarded.
# Note: == on a float column is an exact match.
display(df.loc[(df.Sales == 4233.15), :])
df.loc[(df.Sales != 1000), :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
0,Ord_5446,Prod_16,SHP_7609,Cust_1818,136.8100,0.01,23,-30.51,3.60,0.56
1,Ord_5406,Prod_13,SHP_7549,Cust_1818,42.2700,0.01,13,4.56,0.93,0.54
2,Ord_5446,Prod_4,SHP_7610,Cust_1818,4701.6900,0.00,26,1148.90,2.50,0.59
3,Ord_5456,Prod_6,SHP_7625,Cust_1818,2337.8900,0.09,43,729.34,14.30,0.37
4,Ord_5485,Prod_17,SHP_7664,Cust_1818,4233.1500,0.08,35,1219.87,26.30,0.38
5,Ord_5446,Prod_6,SHP_7608,Cust_1818,164.0200,0.03,23,-47.64,6.15,0.37
6,Ord_31,Prod_12,SHP_41,Cust_26,14.7600,0.01,5,1.32,0.50,0.36
7,Ord_4725,Prod_4,SHP_6593,Cust_1641,3410.1575,0.10,48,1137.91,0.99,0.55
8,Ord_4725,Prod_13,SHP_6593,Cust_1641,162.0000,0.01,33,45.84,0.71,0.52
9,Ord_4725,Prod_6,SHP_6593,Cust_1641,57.2200,0.07,8,-27.72,6.60,0.37


In [18]:
# Sometimes the rows you need are those whose column value appears in an iterable.
# Example: a colleague hands you a list of customer ids from a certain region.

customers_in_bangalore = ['Cust_1798', 'Cust_1519', 'Cust_637', 'Cust_851']

# isin() checks each Cust_id for membership in the list and returns a
# boolean Series, which df.loc then uses to keep only the matching orders
from_bangalore = df['Cust_id'].isin(customers_in_bangalore)
df.loc[from_bangalore, :]

Unnamed: 0,Ord_id,Prod_id,Ship_id,Cust_id,Sales,Discount,Order_Quantity,Profit,Shipping_Cost,Product_Base_Margin
8385,Ord_1833,Prod_3,SHP_2527,Cust_637,611.16,0.04,46,100.22,4.98,0.4
8386,Ord_2324,Prod_7,SHP_3189,Cust_851,121.87,0.07,39,11.32,1.35,0.4
8387,Ord_2220,Prod_3,SHP_3019,Cust_851,41.06,0.04,4,-16.39,6.28,0.35
8388,Ord_4424,Prod_1,SHP_6165,Cust_1519,994.04,0.03,10,-335.06,35.0,
8389,Ord_4444,Prod_13,SHP_6192,Cust_1519,159.41,0.0,44,34.68,0.98,0.52
8390,Ord_5435,Prod_16,SHP_7594,Cust_1798,316.99,0.04,47,-276.54,8.37,0.58
8391,Ord_5435,Prod_4,SHP_7594,Cust_1798,1991.8985,0.07,20,88.36,7.69,0.58
8392,Ord_5384,Prod_9,SHP_7519,Cust_1798,181.5,0.08,43,-6.24,2.5,0.37
8393,Ord_5348,Prod_8,SHP_7470,Cust_1798,356.72,0.07,9,12.61,1.99,0.44
8394,Ord_5353,Prod_4,SHP_7479,Cust_1798,2841.4395,0.08,28,374.63,7.69,0.59
