In [4]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__

'0.23.4'

In [5]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])

In [6]:
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [9]:
# Series wraps both a sequence of values and a sequence of indices, 
# which we can access with the values and index attributes

In [11]:
# numpy arr
data.values

array([ 0.25,  0.5 ,  0.75,  1.  ])

In [10]:
# array-like object of type pd.Index
data.index

RangeIndex(start=0, stop=4, step=1)

In [13]:
data[1]

0.5

In [15]:
data[1:3]

1    0.50
2    0.75
dtype: float64

### Series as generalized NumPy array

In [17]:
# while the NumPy array has an implicitly defined integer index used to access the values, 
# the Pandas Series has an explicitly defined index associated with the values

In [18]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [24]:
# can even use noncontiguous or nonsequential indices:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

### Series as specialized dictionary

In [26]:
# State populations keyed by state name.
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
# Sort by label explicitly: whether the Series constructor sorts dict keys or
# keeps their insertion order depends on the Python/pandas versions, and the
# label slice in the next cell relies on alphabetical order.
population = pd.Series(population_dict).sort_index()
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [27]:
# slicing:
population['California':'Illinois']

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

### Constructing Series objects

In [31]:
# from list
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [32]:
# scalar
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [34]:
# dict
pd.Series({2:'a', 1:'b', 3:'c'})

1    b
2    a
3    c
dtype: object

In [35]:
# an explicitly passed index selects (and orders) which dict keys are kept:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

### DataFrame as a generalized NumPy array

In [37]:
# State areas keyed by state name (same keys as population_dict above).
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)

# Columns are listed in the order they are displayed and rows are sorted by
# label explicitly, so the frame does not depend on version-specific dict
# ordering in the constructor.
states = pd.DataFrame({'area': area, 'population': population}).sort_index()
states

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [38]:
states.index, states.columns

(Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object'),
 Index(['area', 'population'], dtype='object'))

### DataFrame as specialized dictionary

In [43]:
# potential point of confusion here: in a two-dimensional NumPy array, 
# data[0] will return the first row. For a DataFrame, data['col0'] 
# will return the first column.
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

### Constructing DataFrame objects

In [47]:
# single Series object:
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


In [48]:
# A list of row dictionaries: each dict is one row, its keys are the columns.
data = [dict(a=n, b=n + n) for n in (0, 1, 2)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [49]:
# keys missing from a row dictionary are left as missing values in that row:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [50]:
# dictionary of Series objects; keys are given in the displayed column order
# and rows are sorted by label so the result does not depend on dict ordering
pd.DataFrame({'area': area, 'population': population}).sort_index()

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [51]:
# a two-dimensional NumPy array:
# NOTE(review): no seed is set in the cells above, so np.random.rand presumably
# yields different values on every run and the table below will not reproduce.
pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.827588,0.176924
b,0.28433,0.406273
c,0.720172,0.037241


In [53]:
# Structured array with an 8-byte integer field 'A' and an 8-byte float
# field 'B'; each field becomes one DataFrame column.
A = np.zeros(
    shape=(3,),
    dtype=np.dtype({'names': ['A', 'B'], 'formats': ['i8', 'f8']}),
)
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


# The Pandas Index Object

In [55]:
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [58]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [60]:
# Index as immutable array: item assignment is rejected with a TypeError.
# The expected error is caught and printed so that the demonstration does not
# abort a "Restart & Run All".
try:
    ind[1] = 0
except TypeError as err:
    print("TypeError:", err)

TypeError: Index does not support mutable operations

In [61]:
# Index as ordered set
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

In [62]:
# intersection
# (method form: using `&` on Index objects as a set operation was deprecated
# in later pandas releases)
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [63]:
# union
# (method form: using `|` on Index objects as a set operation was deprecated
# in later pandas releases)
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [64]:
# symmetric difference
# NOTE(review): the set-style operators on Index (&, |, ^) appear to be
# deprecated in newer pandas releases — confirm against the pandas version in
# use; the method form is shown in the next cell.
indA ^ indB

Int64Index([1, 2, 9, 11], dtype='int64')

In [67]:
# may also be accessed via object methods—for example:
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

# Data Indexing and Selection

### Data Selection in Series

In [70]:
# like dict:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data['b']

0.5

In [71]:
'b' in data

True

In [72]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [73]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [75]:
# and extend a dictionary by assigning to a new key:
data['qwe'] = 1.25
data

a      0.25
b      0.50
c      0.75
d      1.00
qwe    1.25
dtype: float64

In [76]:
# like one-dimensional array:

In [94]:
# slicing by explicit index: label-based, and the end label 'c' is included
print(data['b':'c'])
# slicing by implicit integer index: position-based, and the end position 3 is excluded
print(data[1:3])

b    0.50
c    0.75
dtype: float64
b    0.50
c    0.75
dtype: float64


In [81]:
# masking
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [92]:
# fancy indexing 
# if no index - data.reindex(['a', 'e'])
data.loc[['a', 'qwe']]

a      0.25
qwe    1.25
dtype: float64

# Note: when slicing with an explicit index (e.g., data['a':'c']), the final index is included in the slice, whereas when slicing with an implicit index (e.g., data[0:2]), the final index is excluded from the slice.

In [106]:
# It looks like it is better not to do this (integer labels make [] indexing ambiguous):


In [108]:
data2 = pd.Series([0.25, 0.5, 0.75, 1.0], index=[1, 2, 3, 4])
# slicing uses the implicit (positional) indices:
data2[1:3]

2    0.50
3    0.75
dtype: float64

In [109]:
# here the explicit indices (labels) are used:
data2[[1,2]]

1    0.25
2    0.50
dtype: float64

### Indexers: loc, iloc, and ix

Because of this potential confusion in the case of integer indexes, Pandas provides some special indexer attributes that explicitly expose certain indexing schemes

In [113]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])

In [114]:
# explicit index:
data.loc[1]

'a'

In [115]:
data.loc[1:3]

1    a
3    b
dtype: object

In [116]:
# implicit:
data.iloc[1:3]

3    b
5    c
dtype: object

In [118]:
# accessing the legacy .ix indexer emits the deprecation warning shown below,
# which points to .loc (labels) and .iloc (positions) as the replacements
data.ix

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


<pandas.core.indexing._IXIndexer at 0x11591d598>

## Data Selection in DataFrame

In [120]:
# Per-state area and population, keyed by state name.
area = pd.Series({'California': 423967, 'Texas': 695662, 
                  'New York': 141297, 'Florida': 170312, 'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860, 'Illinois': 12882135})
# Rows are sorted by label explicitly: the positional (iloc) and label-slice
# (loc) examples further down assume alphabetical row order, which otherwise
# depends on how the Python/pandas version in use orders dict keys.
data = pd.DataFrame({'area':area, 'pop':pop}).sort_index()
data

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [128]:
# as a dict:

In [121]:
data.area is data['area']

True

In [126]:
# if the column names conflict with methods of the DataFrame, 
# this attribute-style access is not possible:
data.pop is data['pop']

False

In [127]:
# use data['pop'] = z rather than data.pop = z:
data['density'] = data['pop'] / data['area']
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [129]:
# as an array:

In [130]:
data.values

array([[  4.23967000e+05,   3.83325210e+07,   9.04139261e+01],
       [  1.70312000e+05,   1.95528600e+07,   1.14806121e+02],
       [  1.49995000e+05,   1.28821350e+07,   8.58837628e+01],
       [  1.41297000e+05,   1.96511270e+07,   1.39076746e+02],
       [  6.95662000e+05,   2.64481930e+07,   3.80187404e+01]])

In [131]:
# we can do many familiar array-like observations
data.T

Unnamed: 0,California,Florida,Illinois,New York,Texas
area,423967.0,170312.0,149995.0,141297.0,695662.0
pop,38332520.0,19552860.0,12882140.0,19651130.0,26448190.0
density,90.41393,114.8061,85.88376,139.0767,38.01874


In [132]:
data.values[0]

array([  4.23967000e+05,   3.83325210e+07,   9.04139261e+01])

In [133]:
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [135]:
# implicit
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [136]:
# explicit
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [140]:
# combine loc with masking and fancy indexing
data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
Florida,19552860,114.806121
New York,19651127,139.076746


In [141]:
# the indexers can also be used to modify values (here row 0, column 2, i.e. California's density):
data.iloc[0, 2] = 90
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [142]:
# + just:
# data['Florida':'Illinois']
# data[1:3]
data[data.density > 100]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
New York,141297,19651127,139.076746


# Operating on Data in Pandas

In [144]:
# Seeded generator so that the random examples that follow are reproducible.
rng = np.random.RandomState(seed=42)
# Four random integers drawn from [0, 10).
ser = pd.Series(data=rng.randint(low=0, high=10, size=4))

In [145]:
df = pd.DataFrame(rng.randint(0, 10, (3, 4)), columns=['A', 'B', 'C', 'D'])

In [146]:
ser

0    6
1    3
2    7
3    4
dtype: int64

In [147]:
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [148]:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [149]:
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


### UFuncs: Index Alignment
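For binary operations on two Series or DataFrame objects, Pandas aligns the indices while performing the operation. The resulting array contains the union of indices of the two input arrays, which we could determine using standard Python set arithmetic on these indices: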

In [152]:
# Two named Series whose labels only partly overlap, used to demonstrate
# index alignment in the cells that follow.
area = pd.Series(
    data={'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    data={'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)

In [153]:
population / area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [159]:
# NaN, or “Not a Number,” which is how Pandas marks missing data

In [157]:
# union of indexes
# (only the method form is evaluated: the equivalent set-style spelling,
# area.index | population.index, was deprecated in later pandas releases, and
# the earlier duplicate expression's result was discarded anyway)
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [158]:
# intersection of indexes (method form; using `&` on Index objects as a set
# operation was deprecated in later pandas releases)
area.index.intersection(population.index)

Index(['California', 'Texas'], dtype='object')

In [160]:
# Labels 1 and 2 are shared; label 0 exists only in A and label 3 only in B,
# so those two positions have no partner when the Series are added.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [161]:
# they can be added in a way that leaves no NaN in the result:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

A similar type of alignment takes place for both columns and indices when you are performing operations on DataFrames:

In [163]:
# Two frames of different shape whose shared columns appear in a different
# order, to show alignment on both rows and columns.
A = pd.DataFrame(rng.randint(0, 20, size=(2, 2)), columns=['A', 'B'])
B = pd.DataFrame(rng.randint(0, 10, size=(3, 3)), columns=['B', 'A', 'C'])
display(A, B)

Unnamed: 0,A,B
0,19,2
1,4,18


Unnamed: 0,B,A,C
0,6,4,8
1,6,1,3
2,8,1,9


In [164]:
A + B

Unnamed: 0,A,B,C
0,23.0,8.0,
1,5.0,24.0,
2,,,


As with Series, we can use the add method and pass any desired fill_value to be used in place of missing entries. Here we’ll fill with the mean of all values in A (which we compute by first stacking the rows of A):

In [176]:
A.stack()

0  A    19
   B     2
1  A     4
   B    18
dtype: int64

In [177]:
A.stack().mean()

10.75

In [179]:
# mean over every entry of A (stack() first flattens the frame into one Series)
fill = A.stack().mean()
# entries missing from one of the operands are replaced by `fill` before adding
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,23.0,8.0,18.75
1,5.0,24.0,13.75
2,11.75,18.75,19.75


In [182]:
%%html
<img src="df_ops.png" width="500px" height="100px">

### Ufuncs: Operations Between DataFrame and Series

In [190]:
# plain NumPy: subtracting the first row from the 2-D array is applied to every row
A = rng.randint(10, size=(3, 4))
display(A)
A - A[0]

array([[7, 6, 8, 7],
       [4, 1, 4, 7],
       [9, 8, 8, 0]])

array([[ 0,  0,  0,  0],
       [-3, -5, -4,  0],
       [ 2,  2,  0, -7]])

In [195]:
# in pandas the same row-wise rule:
df = pd.DataFrame(A, columns=list('QRST'))
display(df)
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,7,6,8,7
1,4,1,4,7
2,9,8,8,0


Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-3,-5,-4,0
2,2,2,0,-7


In [201]:
# for col-wise:
df.subtract(df['R'], axis=0)

Unnamed: 0,Q,R,S,T
0,1,0,2,1
1,3,0,3,6
2,1,0,0,-8


# Handling Missing Data

In [208]:
# null, NaN, or NA
# two strategies: using a mask that globally indicates missing values, 
# or choosing a sentinel value that indicates a missing entry.

# Pandas chose to use sentinels for missing data, 
# and further chose to use two already-existing Python null values: 
# the special floating-point NaN value, and the Python None object.

In [211]:
# None: Pythonic missing data
# Because None is a Python object, it cannot be used in any arbitrary NumPy/Pandas array,
# but only in arrays with data type 'object' (i.e., arrays of Python objects)
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [213]:
# object dtype carries overhead: time the same sum over an object array and an int array
for dtype in ['object', 'int']: 
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum() 
    print()

dtype = object
75 ms ± 795 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype = int
904 µs ± 5.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

