#  Python Data Science Handbook

 ## <font color='green'> The Pandas Series Object </font>

In [39]:
import numpy as np
import pandas as pd
from IPython.display import display

In [8]:
# A Series is a one-dimensional array of values paired with an explicit index.
pd_s = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))
pd_s

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [9]:
# The values are a plain NumPy array; the index is a separate pd.Index object.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(pd_s.values)
print(pd_s.index)
type(pd_s.values)

[ 0.25  0.5   0.75  1.  ]
Index([u'a', u'b', u'c', u'd'], dtype='object')


numpy.ndarray

In [20]:
# Series as specialized dictionary: keys become index labels, values the data.
population_dict = {'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135}
# Older pandas sorted dict keys on construction; newer releases keep insertion
# order. Sort explicitly so the index is alphabetical either way, which the
# label slice 'California':'New York' further down relies on.
population = pd.Series(population_dict).sort_index()
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [21]:
# Dictionary-style lookup of a single key.
print(population['California'])
print('')  # blank separator line; works on both Python 2 and Python 3
# Unlike a dict, a Series also supports slicing by label; the stop label is included.
print(population['California':'New York'])


38332521 

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
dtype: int64


In [27]:
# Constructing Series objects

# From a list or NumPy array: a default integer index 0..n-1 is created.
s1 = pd.Series([2, 3, 4])
print(s1)
# From a scalar: the value is repeated for every index label.
s2 = pd.Series(5, index=[1, 2, 3])
print(s2)
# From a dictionary: the keys become the index.
s3 = pd.Series({2: 'a', 1: 'b', 3: 'c'})
print(s3)
# An explicit index selects which dictionary keys are kept, and in what order.
s4 = pd.Series({2: 'a', 1: 'b', 3: 'c'}, index=[3, 2])
print(s4)

0    2
1    3
2    4
dtype: int64
1    5
2    5
3    5
dtype: int64
1    b
2    a
3    c
dtype: object
3    c
2    a
dtype: object


## The Pandas DataFrame Object



### DataFrame as a generalized NumPy array
- a Series is an analog of a one-dimensional array with flexible indices
- a DataFrame is an analog of a two-dimensional array with both flexible row indices and flexible column names
- DataFrame as a sequence of aligned Series objects. Here, by "aligned" we mean that they share the same index.

In [41]:
# Area of each state, keyed by state name.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
area = pd.Series(data=area_dict)
display(area)

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64

In [44]:
# Combine the two Series into one table; each dict key becomes a column name.
states = pd.DataFrame(dict(area=area, population=population))
# Show the table, then its row labels and its column labels.
display(states, states.index, states.columns)

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


Index([u'California', u'Florida', u'Illinois', u'New York', u'Texas'], dtype='object')

Index([u'area', u'population'], dtype='object')

- we can also think of a DataFrame as a specialization of a dictionary. 
- a DataFrame maps a column name to a Series of column data.

-  Notice the potential point of confusion here: 
    - <font color='red'>In a two-dimensional NumPy array, data[0] returns the first row.</font>
    - <font color='red'>In a DataFrame, data['col0'] returns the first column.</font>
    - <font color='red'>Because of this, it is probably better to think of DataFrames as generalized dictionaries rather than generalized arrays.</font>

In [45]:
# Dictionary-style access: the column name 'area' maps to that column's Series.
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [47]:
# From a list of dicts: each dict is one row, its keys are the column names.
pd_list = pd.DataFrame([{'a': n, 'b': 2 * n} for n in range(3)])
pd_list

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [49]:
# From a two-dimensional NumPy array
# Draw from a locally seeded generator so the table is the same on every run
# and the global NumPy random state is left untouched.
random_values = np.random.RandomState(0).rand(3, 2)
random_frame = pd.DataFrame(random_values, columns=['foo', 'bar'], index=[0, 1, 2])
random_frame

Unnamed: 0,foo,bar
0,0.488388,0.54816
1,0.400854,0.257152
2,0.253658,0.680443


## The Pandas Index Object


- The Index object can be thought of either as an immutable array or as an ordered set (technically a multi-set, since Index objects may contain repeated values).

In [60]:
# Build an index of odd integers, then inspect it the way one would an array.
pd_ind = pd.Index(data=[1, 3, 5, 7, 9])

# Positional lookup followed by the array-style attributes, shown one per output.
display(pd_ind[1], pd_ind.size, pd_ind.shape, pd_ind.ndim, pd_ind.dtype)


3

5

(5,)

1

dtype('int64')

In [55]:
# Index does not support mutable operations: item assignment raises TypeError.
# The error is the point of this cell, so catch it and show the message
# instead of letting it stop a top-to-bottom run of the notebook.
try:
    pd_ind[0] = 1
except TypeError as err:
    print('TypeError: {}'.format(err))

TypeError: Index does not support mutable operations

### Index as ordered set

In [62]:
# Two small integer indexes that have the values 3, 5 and 7 in common.
indA = pd.Index(data=[1, 3, 5, 7, 9])
indB = pd.Index(data=[2, 3, 5, 7, 11])

In [68]:
# The &, | and ^ operators no longer perform set operations on Index objects in
# newer pandas, so call the explicit set methods instead.
display(
    indA.intersection(indB),          # intersection
    indA.union(indB),                 # union
    indA.symmetric_difference(indB),  # symmetric difference
)

Int64Index([3, 5, 7], dtype='int64')

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

Int64Index([1, 2, 9, 11], dtype='int64')

## <font color='green'> Data Indexing and Selection </font>
