In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 8)
pd.set_option('display.max_rows', 8)

# Basics of Pandas

The ```pandas``` package provides a comprehensive set of data structures for working with and manipulating data and performing various statistical and financial analyses. Two primary data structures we will use is ```Series``` and ```DataFrame```.

## The Series
The ```Series``` is the primary building block of pandas and represents a onedimensional labeled array based on the ```NumPy ndarray```.

A ```Series``` have index labeling which makes it more usable than ```NumPy ndarray```.

A ```Series``` can hold zero or more instances of any single data type.

However a Series can only associate single value with any given index label, so it has limitations, which ```DataFrame``` is solving it.

## The DataFrame
A ```DataFrame``` can be thought of as a dictionary-like container of one or more ```Series``` objects, as a spreadsheet, or probably the best description for those new to pandas is to compare a ```DataFrame``` to a relational database table.

A ```DataFrame``` and also by automatically aligning values in each column along the index labels of the ```DataFrame```.

A ```DataFrame``` also introduces the concept of an axis, which you will often see in the pandas documentation and in many of its methods. A DataFrame has two axes, horizontal and vertical.

In [2]:
# Creating Series using Lists
lst = list("Enes Kemal")
s = pd.Series(lst)
s

0    E
1    n
2    e
3    s
    ..
6    e
7    m
8    a
9    l
dtype: object

In [3]:
# Creating Series using Dictionary
dic = {'a':1, 'b':2, 'c':3, 'd':4, 'e':5} 
s = pd.Series(dic)
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [4]:
# Creating Series using NumPy random 
s = pd.Series(np.random.randn(100))
s

0     0.580846
1     0.200344
2     0.771276
3     0.453910
        ...   
96   -0.194114
97    0.102594
98   -0.781175
99    1.840601
dtype: float64

In [5]:
# Access elements using [] operator:
s[2]

0.77127636689613832

In [7]:
s[[2, 5, 20]] # Access specific locations with list parsed...

2     0.771276
5    -1.508865
20    1.054933
dtype: float64

In [8]:
# Slicing is possible like we are doing with lists
s[3:8]

3    0.453910
4    0.254239
5   -1.508865
6    0.235773
7    0.418655
dtype: float64

In [10]:
# Examining the series data with .head(), .tail()
s.head()

0    0.580846
1    0.200344
2    0.771276
3    0.453910
4    0.254239
dtype: float64

In [11]:
s.tail()

95   -0.399591
96   -0.194114
97    0.102594
98   -0.781175
99    1.840601
dtype: float64

In [17]:
# Index of Series can be retrieved using .index
print(list(s.index))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [18]:
# Also retrieving values are with .values 
s.values

array([  5.80845904e-01,   2.00343780e-01,   7.71276367e-01,
         4.53910024e-01,   2.54238927e-01,  -1.50886468e+00,
         2.35772835e-01,   4.18654649e-01,  -4.59828116e-02,
         5.84937530e-01,   1.05807683e+00,  -9.33295106e-01,
         2.78707566e-02,  -1.91722767e-01,   4.24611236e-02,
         2.86256314e-01,  -1.12825610e+00,   1.56128270e+00,
        -5.68958388e-01,   2.22818835e-01,   1.05493322e+00,
        -9.21252812e-01,   1.44401169e+00,  -1.08115787e+00,
        -4.20986060e-01,   1.25430911e+00,  -5.74595601e-01,
        -3.18762789e-01,  -1.02393628e+00,  -1.37117281e+00,
         9.93082282e-01,  -6.93460046e-01,  -1.80251671e-01,
         1.69393186e+00,  -2.40737287e-02,  -1.08514455e+00,
         1.21562818e-01,  -9.04255838e-01,   5.84426887e-01,
        -5.32303386e-01,   7.23991120e-01,   1.35164233e-01,
         6.38176837e-02,  -8.70822921e-01,  -2.81311290e+00,
         5.14716903e-01,  -3.13978422e-01,   7.83715621e-01,
         8.61068717e-01,

In [20]:
# Creating series with index and values are passed
s2 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2

a    1
b    2
c    3
d    4
dtype: int64

In [22]:
# Returning the length of Series
len(s)

100

In [23]:
# Return dimensionality of Series
s.shape

(100,)

In [26]:
# .count is also returning the count of elements in Series, but NaN is not counted
s = pd.Series([10, 0, 1, 1, 2, 3, 4, 5, 6, np.nan])
print(len(s))
print(s.count())

10
9


In [27]:
# Returning all the unique values using .unique()
s.unique()

array([ 10.,   0.,   1.,   2.,   3.,   4.,   5.,   6.,  nan])

In [28]:
# Count of each value:
s.value_counts()

1.0     2
10.0    1
6.0     1
5.0     1
4.0     1
3.0     1
2.0     1
0.0     1
dtype: int64

### Creating a DataFrame

In [29]:
# Create DF by passing series
df1 = pd.DataFrame([pd.Series(np.arange(10, 15)),
                    pd.Series(np.arange(15, 20))])
df1

    0   1   2   3   4
0  10  11  12  13  14
1  15  16  17  18  19

In [31]:
df1.shape

(2, 5)

In [32]:
# Another way to create DF is by using numpy array
df = pd.DataFrame(np.array([[10, 11], [20, 21]]),
                  columns=['a', 'b'])

df

    a   b
0  10  11
1  20  21

In [33]:
df.columns

Index(['a', 'b'], dtype='object')

In [35]:
# We can rename the columns after data frame created 
df.columns = ['c1', 'c2']
df

   c1  c2
0  10  11
1  20  21

In [36]:
# We can also add index while creating
df = pd.DataFrame(np.array([[0, 1], [2, 3]]), 
                  columns=['c1', 'c2'],
                  index=['r1', 'r2'])
df

    c1  c2
r1   0   1
r2   2   3

In [37]:
# We'll show the index
df.index

Index(['r1', 'r2'], dtype='object')

In [38]:
df.values

array([[0, 1],
       [2, 3]])

In [39]:
# Pandas we'll fill the gaps with NaN
s1 = pd.Series(np.arange(1, 6, 1))
s2 = pd.Series(np.arange(6, 11, 1))
s3 = pd.Series(np.arange(12, 14), index=[1, 2])
pd.DataFrame({'c1': s1, 'c2': s2, 'c3': s3})

   c1  c2    c3
0   1   6   NaN
1   2   7  12.0
2   3   8  13.0
3   4   9   NaN
4   5  10   NaN