# Data Indexing and Selection

In [59]:
import pandas as pd
import numpy as np

## Data Selection in Series

### Series as Dictionary

In [6]:
series = pd.Series([0.25, 0.15, 0.50, 0.75], index=['a', 'b', 'c', 'd'])
series

a    0.25
b    0.15
c    0.50
d    0.75
dtype: float64

In [7]:
series['a']

0.25

In [10]:
'a' in series

True

In [11]:
series.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [12]:
list(series.items())

[('a', 0.25), ('b', 0.15), ('c', 0.5), ('d', 0.75)]

In [19]:
series[0] = 1

In [20]:
series

a    1.00
b    0.15
c    0.50
d    0.75
dtype: float64

### Series as One-Dimensional Array

In [21]:
type(series)

pandas.core.series.Series

In [22]:
series

a    1.00
b    0.15
c    0.50
d    0.75
dtype: float64

In [46]:
series[0] # direct indexing

0.5

In [48]:
series[0] = 0.5 # direct assignment

In [25]:
series

a    0.50
b    0.15
c    0.50
d    0.75
dtype: float64

In [49]:
series[:2] # slicing using imoplicit index

a    0.50
b    0.15
dtype: float64

In [50]:
# while slicing with explicit indexes the "stop" index mentioned in the slicing syntax is included in the result

series['a':'c'] # slicing with explicit index

a    0.50
b    0.15
c    0.50
dtype: float64

In [51]:
series[(series<0.9) & (series > 0.3)] # masking using boolean mask

a    0.50
c    0.50
d    0.75
dtype: float64

In [52]:
series[[1,3]] # fancy indexing using implicit index

b    0.15
d    0.75
dtype: float64

In [53]:
series[['a', 'd']] # fancy indexing using explicit index

a    0.50
d    0.75
dtype: float64

In [80]:
# works for numpy
test = np.arange(0,9)
print(ass)

ass[np.array([[0,2],[5,9]])]

[0 1 2 3 4 5 6 7 8 9]


array([[0, 2],
       [5, 9]])

In [90]:
# fancy indexing does not work directly with series when the dimension of index array does not match the dimension of the sries

%xmode minimal
series[np.array([[0,2],[1,0]])]

Exception reporting mode: Minimal


ValueError: Wrong number of dimensions. values.ndim > ndim [2 > 1]

In [92]:
# to make the fancy indexing work we need to access the underlying numpy array using the values attribute of the series object

series.values[np.array([[0,2],[1,0]])]

array([[0.5 , 0.5 ],
       [0.15, 0.5 ]])

In [102]:
# after we have accessed the underlying numpy array using the value attribute we can all the valid operationi on that array without modifying the actual
# series as the numpy array operations creats view of the original array via copy

series.values[:, np.newaxis].T

array([[0.5 , 0.15, 0.5 , 0.75]])

In [101]:
series

a    0.50
b    0.15
c    0.50
d    0.75
dtype: float64

### Indexers: loc and iloc

In [103]:
# if the explicit indexed of the series is numbers then they might create confusion with the implicit indexes

new_series = pd.Series(["Koushik", "Raymond", "Francis", "Thomas"], index=[2,1,0,5])
new_series

2    Koushik
1    Raymond
0    Francis
5     Thomas
dtype: object

In [104]:
new_series[0]

'Francis'

In [106]:
new_series[1:3]

1    Raymond
0    Francis
dtype: object

In [109]:
# the loc indexer attribute allows indexing and slicing that always references the explicit index

new_series.loc[2:0]

2    Koushik
1    Raymond
0    Francis
dtype: object

In [110]:
new_series.loc[0]

'Francis'

In [113]:
new_series.loc[:2]

2    Koushik
dtype: object

In [114]:
# The iloc attribute allows indexing and slicing that always references the implicit Python-style index

new_series.iloc[0]

'Koushik'

In [115]:
new_series.iloc[0:2]

2    Koushik
1    Raymond
dtype: object

In [122]:
new_series.iloc[0:6]

2    Koushik
1    Raymond
0    Francis
5     Thomas
dtype: object

In [154]:
# all actions like this, that treats the series as a 1-D array are valid but not always useful like this Transpose operation, 
# this transpose is although useful for the the 2-D counterpart of Series, i.e. DataFrame
new_series.T

2    Koushik
1    Raymond
0    Francis
5     Thomas
dtype: object

In [155]:
new_series.sum()

'KoushikRaymondFrancisThomas'

In [160]:
list(title(new_series.values()))

NameError: name 'title' is not defined

## Data Selection in DataFrames

### DataFrame as Dictionary

In [142]:
area = pd.Series({'California': 423967, 'Texas': 695662, 'Florida': 170312, 'New York': 141297, 'Pennsylvania': 119280})
pop = pd.Series({'California': 39538223, 'Texas': 29145505, 'Florida': 21538187, 'New York': 20201249, 'Pennsylvania': 13002700})

data = pd.DataFrame({"area":area, "pop":pop})
data

Unnamed: 0,area,pop
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187
New York,141297,20201249
Pennsylvania,119280,13002700


In [143]:
data["area"]

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [144]:
data.keys()

Index(['area', 'pop'], dtype='object')

In [145]:
data.index

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [146]:
data["area"].keys() # I don't know how it worked

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [147]:
data.area

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [148]:
# the dot-notation for accessing the values of a dataframe will work only if the keys are strings and they don't match any of the DataFrame 's own
# predefined functions of attributes

data.pop

<bound method DataFrame.pop of                 area       pop
California    423967  39538223
Texas         695662  29145505
Florida       170312  21538187
New York      141297  20201249
Pennsylvania  119280  13002700>

In [150]:
data["density"] = data["pop"] / data["area"]
data

Unnamed: 0,area,pop,density
California,423967,39538223,93.257784
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


### DataFrame as Two-Dimensional Array

In [162]:
data

Unnamed: 0,area,pop,density
California,423967,39538223,93.257784
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [151]:
data.values

array([[4.23967000e+05, 3.95382230e+07, 9.32577842e+01],
       [6.95662000e+05, 2.91455050e+07, 4.18960717e+01],
       [1.70312000e+05, 2.15381870e+07, 1.26463121e+02],
       [1.41297000e+05, 2.02012490e+07, 1.42970120e+02],
       [1.19280000e+05, 1.30027000e+07, 1.09009893e+02]])

In [161]:
data.T

Unnamed: 0,California,Texas,Florida,New York,Pennsylvania
area,423967.0,695662.0,170312.0,141297.0,119280.0
pop,39538220.0,29145500.0,21538190.0,20201250.0,13002700.0
density,93.25778,41.89607,126.4631,142.9701,109.0099


In [163]:
data[0]

KeyError: 0

In [164]:
data.values[0]

array([4.23967000e+05, 3.95382230e+07, 9.32577842e+01])

In [166]:
data

Unnamed: 0,area,pop,density
California,423967,39538223,93.257784
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [165]:
data["area"]

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [168]:
data["Texas":"New York"]

Unnamed: 0,area,pop,density
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012


In [171]:
# when a single index is passed to a DataFrame object it is treated as a column index

data[1]

KeyError: 1

In [172]:
# when a sice is passed whether it will be interpreted as column index or not depends on whethe the slice passed consists of integers or not
data[1:3]

Unnamed: 0,area,pop,density
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121


In [None]:
# to save us this confusion the loc and iloc indexer attributes are available 

### Indexers: loc and iloc

In [176]:
# loc refernces the array using the tradintional python-style indexing
data.iloc[0]

area       4.239670e+05
pop        3.953822e+07
density    9.325778e+01
Name: California, dtype: float64

In [173]:
data.iloc[1:3]

Unnamed: 0,area,pop,density
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121


In [175]:
data.loc["Texas":"New York"]

Unnamed: 0,area,pop,density
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012


In [177]:
data.loc[data.density>100]

Unnamed: 0,area,pop,density
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [187]:
data.loc[data["pop"] < (2.5 * (10 ** 7)), ["pop", "density"]]

Unnamed: 0,pop,density
Florida,21538187,126.463121
New York,20201249,142.97012
Pennsylvania,13002700,109.009893


In [188]:
# Any of these indexing conventions may also be used to set or modify values

data

Unnamed: 0,area,pop,density
California,423967,39538223,93.257784
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [189]:
data.iloc[0, 2] = 90

In [190]:
data

Unnamed: 0,area,pop,density
California,423967,39538223,90.0
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [194]:
data.loc["California", "density"] = data.loc["California", "pop"] / data.loc["California", "area"]

In [195]:
data

Unnamed: 0,area,pop,density
California,423967,39538223,93.257784
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


In [199]:
data["density"]["California"]

93.25778421433743

In [200]:
data.density.California

93.25778421433743