#  Python Data Science Handbook

 ## <font color='green'> The Pandas Series Object </font>

In [3]:
import numpy as np
import pandas as pd
from IPython.display import display

In [2]:
#Series as generalized NumPy array
pd_s = pd.Series([0.25, 0.5, 0.75, 1.0],index=['a','b','c','d'])
pd_s

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [3]:
print pd_s.values
print pd_s.index
type(pd_s.values)

[ 0.25  0.5   0.75  1.  ]
Index([u'a', u'b', u'c', u'd'], dtype='object')


numpy.ndarray

In [4]:
#Series as specialized dictionary
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
population = pd.Series(population_dict)
#Print the sorted keys.
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [5]:
print population['California'], '\n'
print population['California':'New York']


38332521 

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
dtype: int64


In [6]:
#Constructing Series objects

#a list or NumPy array
s1 = pd.Series([2,3,4])
print s1
# a scalar
s2 = pd.Series(5,index=[1,2,3])
print s2
# a dictionary
s3 = pd.Series({2:'a', 1:'b', 3:'c'})
print s3
s4 = pd.Series({2:'a', 1:'b', 3:'c'},index=[3,2])
print s4

0    2
1    3
2    4
dtype: int64
1    5
2    5
3    5
dtype: int64
1    b
2    a
3    c
dtype: object
3    c
2    a
dtype: object


## The Pandas DataFrame Object



### DataFrame as a generalized NumPy array
- a Series is an analog of a one-dimensional array with flexible indices
- a DataFrame is an analog of a two-dimensional array with both flexible row indices and flexible column names
- DataFrame as a sequence of aligned Series objects. Here, by "aligned" we mean that they share the same index.

In [7]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)
display(area)

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64

In [8]:
states = pd.DataFrame({'area':area,'population':population})
display(states)
display(states.index)
display(states.columns)

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


Index([u'California', u'Florida', u'Illinois', u'New York', u'Texas'], dtype='object')

Index([u'area', u'population'], dtype='object')

- we can also think of a DataFrame as a specialization of a dictionary. 
- a DataFrame maps a column name to a Series of column data.

-  Notice the potential point of confusion here: 
    - <font color='red'>a two-dimesnional NumPy array, data[0] will return the first row. 
    - a DataFrame, data['col0'] will return the first column. 
    - Because of this, it is probably better to think about DataFrames as generalized dictionaries rather than generalized arrays </font>

In [9]:
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [10]:
#From a list of dicts
pd_list = pd.DataFrame({'a':i,'b':2*i} for i in range(3))
pd_list

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [11]:
#From a two-dimensional NumPy array
pd.DataFrame(np.random.rand(3,2),columns=['foo','bar'],index=[0,1,2])

Unnamed: 0,foo,bar
0,0.697839,0.338371
1,0.193964,0.051988
2,0.321311,0.592212


## The Pandas Index Object


- an immutable array or as an ordered set (technically a multi-set, as Index objects may contain repeated values).

In [12]:
pd_ind = pd.Index([1,3,5,7,9])

display(pd_ind[1])
display(pd_ind.size, pd_ind.shape, pd_ind.ndim, pd_ind.dtype)


3

5

(5L,)

1

dtype('int64')

In [13]:
#Index does not support mutable operations
pd_ind[0] = 1

TypeError: Index does not support mutable operations

### Index as ordered set

In [14]:
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

In [15]:
display(
indA & indB,  # intersection
indA | indB,  # union
indA ^ indB)  # symmetric difference

Int64Index([3, 5, 7], dtype='int64')

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

Int64Index([1, 2, 9, 11], dtype='int64')

## <font color='green'> Data Indexing and Selection </font>


- indexing (e.g., arr[2, 1]), slicing (e.g., arr[:, 1:5])
- masking (e.g., arr[arr > 0]), fancy indexing (e.g., arr[0, [1, 5]])
- combinations thereof (e.g., arr[:, [1, 5]]).

### Indexers: loc, iloc, and ix

In [18]:
data = pd.Series(['a','b','c'], index=[1,2,3])
data

1    a
2    b
3    c
dtype: object

In [22]:
display(
    data[1],   #explicit index when indexing
    data[1:3]) #implicit index when slicing

'a'

2    b
3    c
dtype: object

- Because of this potential confusion in the case of integer indexes, Pandas provides some special indexer attributes that explicitly expose certain indexing schemes. 
-  prevent subtle bugs due to the mixed indexing/slicing convention.

In [24]:
display(
    data.loc[1],
    data.loc[1:3])

'a'

1    a
2    b
3    c
dtype: object

In [25]:
display(
    data.iloc[1], #the implicit Python-style index
    data.iloc[1:3])

'b'

2    b
3    c
dtype: object

### Data Selection in DataFrame


In [26]:
#DataFrame is o dictionary of related Series
display(states)

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [28]:
states['area'] is states.area

True

In [31]:
states['density'] = states['population']/states['area']
display(states)

Unnamed: 0,area,population,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [34]:
#View as tow-dim array
states.values[0]

array([  4.23967000e+05,   3.83325210e+07,   9.04139261e+01])

In [33]:
#Transpose: swap cols and rows
states.T

Unnamed: 0,California,Florida,Illinois,New York,Texas
area,423967.0,170312.0,149995.0,141297.0,695662.0
population,38332520.0,19552860.0,12882140.0,19651130.0,26448190.0
density,90.41393,114.8061,85.88376,139.0767,38.01874


In [35]:
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [39]:
states.iloc[:3,:2]

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [41]:
states.loc[:'Florida',:'population']

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860


In [42]:
states.ix[:3,:'population']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  """Entry point for launching an IPython kernel.


Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [43]:
states.loc[states.density > 100, ['population','density']] 
#combine masking and fancy indexing as in the following:

Unnamed: 0,population,density
Florida,19552860,114.806121
New York,19651127,139.076746


### Additional indexing conventions


In [46]:
display(
states['Florida':'Illinois'],
states[1:3])

Unnamed: 0,area,population,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


Unnamed: 0,area,population,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [50]:
states[states.density > 100]

Unnamed: 0,area,population,density
Florida,170312,19552860,114.806121
New York,141297,19651127,139.076746


## <font color='green'> Operating on Data in Pandas </font>


In [54]:
df = pd.DataFrame(np.random.randint(0,10,(3,4)),columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,7,3,3,0
1,2,2,1,6
2,2,9,4,1


In [55]:
#a NumPy ufunc on objects, the result will be another Pandas object with the indices preserved:
np.exp(df)

Unnamed: 0,A,B,C,D
0,1096.633158,20.085537,20.085537,1.0
1,7.389056,7.389056,2.718282,403.428793
2,7.389056,8103.083928,54.59815,2.718282


In [57]:
#UFuncs: Index Alignment
A = pd.Series([2,4,6],index=[0,1,2])
B = pd.Series([1,3,5],index=[1,2,3])
A+B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [58]:
A.add(B,fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [60]:
A = pd.DataFrame(np.random.randint(0,10,(2,2)))
B = pd.DataFrame(np.random.randint(0,10,(3,3)))
A+B

Unnamed: 0,0,1,2
0,14.0,11.0,
1,8.0,10.0,
2,,,


In [61]:
fill = A.stack().mean()
A.add(B,fill_value=fill)

Unnamed: 0,0,1,2
0,14.0,11.0,4.25
1,8.0,10.0,9.25
2,8.25,6.25,9.25


In [65]:
A.stack()

0  0    6
   1    3
1  0    2
   1    6
dtype: int32

In [66]:
A

Unnamed: 0,0,1
0,6,3
1,2,6


```python
Python Operator Pandas Method(s)
+	add()
-	sub(), subtract()
*	mul(), multiply()
/	truediv(), div(), divide()
//	floordiv()
%	mod()
**	pow()
```

## <font color='green'> Handling Missing Data </font>


#### Missing Data in Pandas

In [4]:
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    %timeit np.arange(1E6, dtype=dtype).sum()

('dtype =', 'object')
10 loops, best of 3: 47.9 ms per loop
('dtype =', 'int')
100 loops, best of 3: 2.38 ms per loop



#### Typeclass	Conversion When Storing NAs	NA Sentinel Value
- floating	-> No change ->	np.nan
- object	-> No change -> None or np.nan
- integer	-> Cast to float64 -> np.nan
- boolean	->Cast to object -> None or np.nan

Keep in mind that in Pandas, string data is always stored with an object dtype.

### Operating on Null Values

- <font color='green'>isnull()</font>: Generate a boolean mask indicating missing values
- <font color='green'>notnull()</font>: Opposite of isnull()
- <font color='green'>dropna()</font>: Return a filtered version of the data
- <font color='green'>fillna()</font>: Return a copy of the data with missing values filled or imputed


### Pandas MultiIndex

In [34]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956, 18976457, 19378102, 20851820, 25145561]

#pd.MultiIndex.from_tuples(index)
index = pd.MultiIndex.from_tuples(index)
pop = pd.Series(populations,index=index)
pop = pop.reindex(index)
pop

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [26]:
pop[:,2010]

California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [39]:
#pd.MultiIndex.from_arrays(arrays)
arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
idx = pd.MultiIndex.from_arrays(arrays)
test = pd.Series([0,0,0,0],index=idx)
test

1  red     0
   blue    0
2  red     0
   blue    0
dtype: int64

### MultiIndex as extra dimension

In [27]:
pop_df = pop.unstack()
pop_df

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561


In [28]:
pop_s = pop_df.stack()
pop_s

California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64