# index columns

In [1]:
import pandas as pd
import numpy as np

# Demo inventory table. Columns that contain gaps are given an explicit dtype,
# and missing entries use one marker per column type (pd.NA for the string
# columns, np.nan for the float column).
stock_data = {
    'item_no': pd.Series([1, 2, 2, 4, 5, 6, 7, 8, 9, 10], dtype='Int64'),
    'cost_class': pd.Series(
        ['1st', '2nd', '3rd', '4th', '4th', '3rd', '2nd', pd.NA, '1st', '3rd'],
        dtype='string',
    ),
    'cost': pd.Series(
        [10.99, np.nan, 2.99, np.nan, 2.99, 2.45, 5.99, 5.99, 3.00, np.nan],
        dtype='float64',
    ),
    'stock_code': pd.Series(
        ['a', 'a', 'c', 'b', 'a', 'b', pd.NA, pd.NA, 'a', 'c'],
        dtype='string',
    ),
    'priority_code': pd.Series(
        [pd.NA, pd.NA, 'a', 'b', pd.NA, 'a', 'e', pd.NA, 'a', 'd'],
        dtype='string',
    ),
    # no gaps in this column, so the inferred integer dtype is kept
    'tax_rate': pd.Series([0, 0, 20, 20, 20, 0, 20, 20, 5, 20]),
}
stock = pd.DataFrame(stock_data)

stock

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
0,1,1st,10.99,a,,0
1,2,2nd,,a,,0
2,2,3rd,2.99,c,a,20
3,4,4th,,b,b,20
4,5,4th,2.99,a,,20
5,6,3rd,2.45,b,a,0
6,7,2nd,5.99,,e,20
7,8,,5.99,,,20
8,9,1st,3.0,a,a,5
9,10,3rd,,c,d,20


In [2]:
# include="all" summarises every column, not just the numeric ones: string
# columns report count/unique/top/freq, numeric columns report the
# mean/std/quantile rows, and statistics that do not apply are left empty.
stock.describe(include="all")

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
count,10.0,9,7.0,8,6,10.0
unique,,4,,3,4,
top,,3rd,,a,a,
freq,,3,,4,3,
mean,5.4,,4.914286,,,12.5
std,3.134042,,3.065169,,,9.78945
min,1.0,,2.45,,,0.0
25%,2.5,,2.99,,,1.25
50%,5.5,,3.0,,,20.0
75%,7.75,,5.99,,,20.0


In [4]:
# .axes lists both axes of the frame: the row index first, then the column labels
stock.axes

[RangeIndex(start=0, stop=10, step=1),
 Index(['item_no', 'cost_class', 'cost', 'stock_code', 'priority_code',
        'tax_rate'],
       dtype='object')]

In [5]:
# .loc slices by index label and includes the end label, so 0:5 returns six rows (0 through 5)
stock.loc[0:5, :]

Unnamed: 0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
0,1,1st,10.99,a,,0
1,2,2nd,,a,,0
2,2,3rd,2.99,c,a,20
3,4,4th,,b,b,20
4,5,4th,2.99,a,,20
5,6,3rd,2.45,b,a,0


## setting a new index

In [6]:
# the frame still has its default RangeIndex (0 up to, but not including, 10)
stock.index

RangeIndex(start=0, stop=10, step=1)

In [7]:
# Relabel the rows by assigning a new index object directly: the labels now
# run from 13 to 22 and the index is named "index".
stock.index = pd.RangeIndex(start=13, stop=23, name="index")
stock

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13,1,1st,10.99,a,,0
14,2,2nd,,a,,0
15,2,3rd,2.99,c,a,20
16,4,4th,,b,b,20
17,5,4th,2.99,a,,20
18,6,3rd,2.45,b,a,0
19,7,2nd,5.99,,e,20
20,8,,5.99,,,20
21,9,1st,3.0,a,a,5
22,10,3rd,,c,d,20


In [9]:
# label-based slice on the new index: rows labelled 13, 14 and 15 (end label included)
stock.loc[13:15, :]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13,1,1st,10.99,a,,0
14,2,2nd,,a,,0
15,2,3rd,2.99,c,a,20


In [13]:
# Use a column as the index: attach one single-letter code per row, then
# promote that column to the row index. assign() and set_index() each return a
# new frame, so the result is bound back to `stock`; this keeps the step
# chainable and avoids mutating the frame with inplace=True.
stock = (
    stock
    .assign(item_code=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'])
    .set_index('item_code')
)

In [14]:
# with item_code as the index, .loc selects rows by their letter labels
stock.loc[['f', 'i'], :]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
f,6,3rd,2.45,b,a,0
i,9,1st,3.0,a,a,5


In [15]:
# .iloc selects by integer position (0-based), ignoring the index labels:
# positions 5 and 8 are the rows labelled 'f' and 'i'
stock.iloc[[5, 8]]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
f,6,3rd,2.45,b,a,0
i,9,1st,3.0,a,a,5


In [20]:
# Item descriptions for only some of the index labels (a, c, e, i);
# the dict keys become the Series index.
new_series = pd.Series({
    'a': 'pen',
    'c': 'pencil',
    'e': 'calculator',
    'i': 'ruler',
})

In [21]:
# display the Series: four values keyed by the labels a, c, e and i
new_series

a           pen
c        pencil
e    calculator
i         ruler
dtype: object

In [22]:
# Add a Series as a new column. The assignment aligns on index labels, so only
# rows a, c, e and i receive a value; every other row is left missing.
stock['item_type'] = new_series

In [23]:
# show the frame with the new item_type column
stock

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a,1,1st,10.99,a,,0,pen
b,2,2nd,,a,,0,
c,2,3rd,2.99,c,a,20,pencil
d,4,4th,,b,b,20,
e,5,4th,2.99,a,,20,calculator
f,6,3rd,2.45,b,a,0,
g,7,2nd,5.99,,e,20,
h,8,,5.99,,,20,
i,9,1st,3.0,a,a,5,ruler
j,10,3rd,,c,d,20,


## more on filtering

In [26]:
# Negate a condition with the tilde (~) operator: keep every row whose
# item_type is not 'pen' or 'pencil'. Rows with a missing item_type are kept too.
stock.loc[~stock['item_type'].isin(['pen', 'pencil'])]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
b,2,2nd,,a,,0,
d,4,4th,,b,b,20,
e,5,4th,2.99,a,,20,calculator
f,6,3rd,2.45,b,a,0,
g,7,2nd,5.99,,e,20,
h,8,,5.99,,,20,
i,9,1st,3.0,a,a,5,ruler
j,10,3rd,,c,d,20,


In [27]:
# rows where one particular column (cost) has a missing value
stock.loc[stock['cost'].isna(), :]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
b,2,2nd,,a,,0,
d,4,4th,,b,b,20,
j,10,3rd,,c,d,20,


In [28]:
# element-wise missing-value check across all columns: a boolean frame of the
# same shape as stock, True wherever a value is missing
stock.isna()

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
a,False,False,False,False,True,False,False
b,False,False,True,False,True,False,True
c,False,False,False,False,False,False,False
d,False,False,True,False,False,False,True
e,False,False,False,False,True,False,False
f,False,False,False,False,False,False,True
g,False,False,False,True,False,False,True
h,False,True,False,True,True,False,True
i,False,False,False,False,False,False,False
j,False,False,True,False,False,False,True


In [29]:
# Row-level mask built with any(): axis='columns' reduces across the columns,
# giving True for each row that has at least one missing value
stock.isna().any(axis='columns')

item_code
a     True
b     True
c    False
d     True
e     True
f     True
g     True
h     True
i    False
j     True
dtype: bool

In [31]:
# Keep only the fully populated rows: build the "row has a missing value"
# mask, then negate it to select the rows without any NA.
row_has_na = stock.isna().any(axis='columns')
stock.loc[~row_has_na]

Unnamed: 0_level_0,item_no,cost_class,cost,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
c,2,3rd,2.99,c,a,20,pencil
i,9,1st,3.0,a,a,5,ruler


## bonus selection techniques

In [35]:
# boolean array with one entry per column: True where the column name starts with 'c'
stock.columns.str.startswith('c')

array([False,  True,  True, False, False, False, False])

In [36]:
# the column labels, in the same order as the boolean mask in the previous cell
stock.columns

Index(['item_no', 'cost_class', 'cost', 'stock_code', 'priority_code',
       'tax_rate', 'item_type'],
      dtype='object')

In [38]:
# select all rows, and only the columns whose name starts with 'c'
stock.loc[:, stock.columns.str.startswith('c')]

Unnamed: 0_level_0,cost_class,cost
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1st,10.99
b,2nd,
c,3rd,2.99
d,4th,
e,4th,2.99
f,3rd,2.45
g,2nd,5.99
h,,5.99
i,1st,3.0
j,3rd,


In [40]:
# Column selection with a regular expression: r'^.o' matches names whose second
# character is 'o' (any first character), which here is cost_class and cost.
stock.loc[:, stock.columns.str.contains(r'^.o')]

Unnamed: 0_level_0,cost_class,cost
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1st,10.99
b,2nd,
c,3rd,2.99
d,4th,
e,4th,2.99
f,3rd,2.45
g,2nd,5.99
h,,5.99
i,1st,3.0
j,3rd,


## dropping columns

In [42]:
# Drop the two cost columns and rebind `stock` to the result, since later
# cells work with the trimmed frame. Rebinding replaces inplace=True, and
# errors='ignore' makes the cell a no-op instead of raising KeyError if it is
# run again after the columns are already gone.
stock = stock.drop(columns=['cost', 'cost_class'], errors='ignore')

In [43]:
# confirm the result: cost and cost_class no longer appear among the columns
stock

Unnamed: 0_level_0,item_no,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a,1,a,,0,pen
b,2,a,,0,
c,2,c,a,20,pencil
d,4,b,b,20,
e,5,a,,20,calculator
f,6,b,a,0,
g,7,,e,20,
h,8,,,20,
i,9,a,a,5,ruler
j,10,c,d,20,


In [44]:
# Drop columns chosen by a name pattern: collect the labels that start with 'i'
# (item_no, item_type) and pass them to drop(). The result is only displayed,
# not assigned, so `stock` itself keeps those columns.
item_cols = stock.columns[stock.columns.str.startswith('i')]
stock.drop(columns=item_cols)

Unnamed: 0_level_0,stock_code,priority_code,tax_rate
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,a,,0
b,a,,0
c,c,a,20
d,b,b,20
e,a,,20
f,b,a,0
g,,e,20
h,,,20
i,a,a,5
j,c,d,20


In [47]:
# Remove the rows labelled 'a' and 'i' by naming them on the row axis.
# The result is a new frame; `stock` itself is not modified.
stock.drop(index=['a', 'i'])

Unnamed: 0_level_0,item_no,stock_code,priority_code,tax_rate,item_type
item_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
b,2,a,,0,
c,2,c,a,20,pencil
d,4,b,b,20,
e,5,a,,20,calculator
f,6,b,a,0,
g,7,,e,20,
h,8,,,20,
j,10,c,d,20,
