In [None]:
"""
learning pandas (http://pandas.pydata.org/)
start date: 02/27/2019

learn to use: 
    Series
    DataFrame
"""

In [None]:
"""
Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a DataFrame. 
DataFrames are essentially multidimensional arrays with attached row and column labels, 
and often with heterogeneous types and/or missing data.

Pandas implements a number of powerful data operations familiar to
users of both database frameworks and spreadsheet programs
"""

In [None]:
"""
NumPy’s ndarray data structure provides essential features for the type of
clean, well-organized data typically seen in numerical computing tasks. 

NumPy falls short in the following situations:
(1) we need more flexibility (attaching labels to data, working with missing data, etc.)
(2) we attempt operations that do not map well to element-wise broadcasting (groupings, pivots, etc.)
(3) the data is less structured
"""

In [1]:
import pandas as pd

In [2]:
# IPython-only help syntax (not plain Python): show the docstring of the pandas package.
pd?

In [4]:
import numpy as np
import pandas as pd

In [6]:
# A pandas Series: a one-dimensional sequence of values with an attached index.
# No index is given, so pandas supplies the default integer one (0, 1, 2, ...).
x = pd.Series(data=[0, 2, 4, 5.5])
print(x)

0    0.0
1    2.0
2    4.0
3    5.5
dtype: float64


In [7]:
"""
the Series wraps both a sequence of values and a
sequence of indices, which we can access with the values and index attributes. 
"""
x.values   ## the values of a Series are exposed as a NumPy array

array([0. , 2. , 4. , 5.5])

In [8]:
x.index  ## the index of a Series is an array-like pd.Index object (a RangeIndex for the default index)

RangeIndex(start=0, stop=4, step=1)

In [9]:
# IPython-only help syntax (not plain Python): show the docstring of pd.Index.
pd.Index?

In [15]:
# Look up a single value by its index label (here the default integer labels).
x.loc[1]

2.0

In [12]:
# Integer slices are positional and half-open: position 3 is not included.
x.iloc[1:3]

1    2.0
2    4.0
dtype: float64

In [16]:
# Index labels do not have to be integers: each value here is keyed by a string.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list("abcd"))
print(data)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64


In [17]:
# Dictionary-style access: fetch the value stored under the label 'a'.
data['a']

0.25

In [18]:
# Positional access: take the second element regardless of its label.
# .iloc is used because `data` has string labels here, and a plain integer key
# (data[1]) depends on the deprecated integer-as-position fallback of
# Series.__getitem__.
data.iloc[1]

0.5

In [19]:
# Integer index labels may be non-contiguous and in any order.
data = pd.Series(index=[2, 5, 3, 7], data=[0.25, 0.5, 0.75, 1.0])
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [23]:
# Show the Series, then its shape tuple and element count, one per line.
print(data, data.shape, data.size, sep="\n")

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64
(4,)
4


In [25]:
# 5 is an index label here (not a position): fetch the value stored under it.
data.loc[5]

0.5

In [27]:
## create pandas Series object from dictionary
## The dictionary keys become the index labels, in the dictionary's insertion
## order (they are NOT sorted, as the printed result shows).
pop_dict = { 'California': 38332521,
              'Texas': 26448193,
              'New York': 19651127,
              'Florida': 19552860,
              'Illinois': 12882135 }
pop_series = pd.Series(pop_dict)
print(pop_series)

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64


In [28]:
# Dictionary-style access: fetch the population stored under the 'California' label.
pop_series['California']

38332521

In [29]:
# Label-based slices are inclusive at both ends: 'California' and 'Florida' are both returned.
pop_series.loc['California':'Florida']

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
dtype: int64

In [32]:
# By contrast, integer slices are positional and half-open: position 3 is excluded.
x.iloc[1:3]

1    2.0
2    4.0
dtype: float64

In [37]:
# Constructing Series objects
# Passing an existing Series to the constructor yields a Series with the same
# index labels and values.
pd.Series(data)

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [39]:
# Passing an existing Series together with an explicit index aligns on labels:
# labels 3 and 5 keep their values from `data`, while label 8 does not exist
# in `data` and is filled with NaN.
ind = [3, 5, 8]
pd.Series(data, index = ind)

3    0.75
5    0.50
8     NaN
dtype: float64

In [40]:
# A scalar value is repeated for every label in the given index.
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [43]:
# Deliberate failure demo: a list of 4 values cannot be paired with 3 index
# labels, so the constructor raises ValueError. The error is caught and printed
# so the notebook still runs top to bottom (Restart & Run All).
try:
    mismatched = pd.Series([5, 6, 7, 8], index=[100, 200, 300])
except ValueError as exc:
    mismatched = None  # no Series could be built
    print(f"{type(exc).__name__}: {exc}")

ValueError: Length of passed values is 4, index implies 3

In [44]:
# With matching lengths, each list value is paired with the index label at the same position.
pd.Series([5, 6, 7], index=[100, 200, 300])

100    5
200    6
300    7
dtype: int64

In [45]:
# From a dictionary: keys become the index labels, kept in the order written (2, 1, 3).
pd.Series({2:'a', 1:'b', 3:'c'})

2    a
1    b
3    c
dtype: object

In [46]:
# An explicit index selects which dictionary keys to keep and in what order;
# key 1 is not listed, so it is dropped from the result.
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

3    c
2    a
dtype: object

In [None]:
# The Pandas DataFrame Object