In [1]:
import pandas as pd
import numpy as np

## The Pandas Series Object

In [2]:
# Build a Series from a plain list; no index is given, so the labels are 0..3.
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:
# .values exposes the Series' data as a NumPy array (index labels are dropped).
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [4]:
# With no explicit index, the Series carries a RangeIndex.
data.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Look up the element labelled 1 (labels and positions coincide for the default index).
data[1]

0.5

In [6]:
# A full slice returns every element together with its index label.
data[:]

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

### Series as Generalized NumPy array

In [7]:
# Same values as before, but with explicit string labels instead of the default integers.
# Note: this rebinds `data`, replacing the integer-indexed Series defined earlier.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index = ['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [8]:
# Access a single element by its string label.
data['b']

0.5

### Series as specialized dictionary

In [9]:
# Mapping of state name -> population count; converted to a Series in the next cell.
population_dict = {'California': 38332521, 
                   'Texas': 26448193, 
                   'New York': 19651127, 
                   'Florida': 19552860,
                   'Illinois': 12882135}

population_dict

{'California': 38332521,
 'Texas': 26448193,
 'New York': 19651127,
 'Florida': 19552860,
 'Illinois': 12882135}

In [10]:
# A dict converts directly to a Series: keys become the index, values the data.
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [11]:
# Dict-style lookup by key. Only the last expression of a cell is displayed, so
# a single lookup is kept here; population['California'] works the same way and
# would return 38332521.
population['New York']

19651127

In [12]:
# Series support dict-style item assignment. This overwrites the Illinois entry
# in place, so later cells that read `population` see the new value.
population['Illinois'] = 124354243

In [13]:
# Re-display the Series to confirm the Illinois entry now holds the reassigned value.
population

California     38332521
Texas          26448193
New York       19651127
Florida        19552860
Illinois      124354243
dtype: int64

### More Series examples 


In [14]:
# A scalar is repeated for every label in the supplied index.
pd.Series(5, index=[100, 200])

100    5
200    5
dtype: int64

## The Pandas DataFrame Object

In [15]:
# Mapping of state name -> area (units are not stated in this notebook).
# Laid out one entry per line, like population_dict above.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}

In [16]:
# Wrap the area mapping in a Series indexed by state name.
area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [45]:
# Combine the two Series into one DataFrame: each Series becomes a column and
# the state names form the shared row index.
states = pd.DataFrame({'population': population,
                       'area': area})

# First population value by position. The index holds state names, so a bare
# integer key in [] relies on a deprecated positional fallback; .iloc makes the
# positional intent explicit and returns the same value.
states['population'].iloc[0]

  states['population'][0]


38332521

In [18]:
# Row labels of the DataFrame: the state names.
states.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [19]:
# Column labels of the DataFrame: the keys of the dict it was built from.
states.columns

Index(['population', 'area'], dtype='object')

## Data Selection in Series

In [20]:
# Rebuild the string-labelled Series used by the selection examples below.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index = ['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [21]:
# slicing by explicit index: with labels, the end point 'c' is included
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [22]:
# slicing by implicit integer index: positions 0, 1 and 2 are returned;
# unlike the label slice above, the stop position (3) is excluded
data[0:3]

a    0.25
b    0.50
c    0.75
dtype: float64

In [23]:
# masking: keep only the entries for which both comparisons are True
data[(data>0.3) & (data<0.8)]

b    0.50
c    0.75
dtype: float64

In [24]:
# Assigning to a label that does not exist yet adds a new entry to the Series.
data['e'] = 1.25
# Passing a list of labels selects exactly those entries, in the listed order.
data[['a', 'b', 'e']]

a    0.25
b    0.50
e    1.25
dtype: float64

## Data Selection in DataFrame

In [25]:
# Area per state, indexed by state name (rebinds `area` with the same values as before).
area = pd.Series({'California': 423967, 
                  'Texas': 695662,
                  'New York': 141297, 
                  'Florida': 170312,
                  'Illinois': 149995})

# Population per state, built from literal values, so Illinois carries
# 12882135 here rather than the value assigned to `population` earlier.
pop = pd.Series({'California': 38332521, 
                 'Texas': 26448193,
                 'New York': 19651127, 
                 'Florida': 19552860,
                 'Illinois': 12882135})

## Missing Data in Pandas

In [26]:
# Mixing an int, NaN, a string and None in a single Series.
data1 = pd.Series([1, np.nan, 'hello', None])
data1  # displayed with dtype object, and None is kept as-is

0        1
1      NaN
2    hello
3     None
dtype: object

### Dropping Null Values

In [27]:
# Small frame with one NaN in row 0 and one in row 2; only row 1 is complete.
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5],
                   [np.nan, 4, 6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [28]:
# True wherever the value is missing; both np.nan and None count as null.
data1.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [29]:
# Use the boolean notnull() mask to keep only the non-missing entries.
data1[data1.notnull()]

0        1
2    hello
dtype: object

In [30]:
# Drop every row that contains at least one null value. 'index' is the
# documented name for the row axis ('rows' is only an undocumented alias).
df.dropna(axis='index')

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [31]:
# The dropna call above was not assigned back, so df still holds all three rows.
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [32]:
# Add a column labelled 3 that is entirely NaN. This modifies df in place.
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [33]:
# how='all' drops only the columns in which every value is null:
# here that is the all-NaN column 3; columns with some NaN are kept.
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


### Filling null values

In [34]:
# Numeric data with gaps: None is shown as NaN and the Series is float64.
data3 = pd.Series([1, np.nan, 2, None, 3], index=list('abcde'))
data3

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [35]:
# Replace each missing entry with 0.
data3.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

## Combining Datasets: Concatenation

In [36]:
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Parameters
    ----------
    cols : iterable
        Column labels (a string such as ``'ABC'`` yields one column per character).
    ind : iterable
        Row labels; iterated once per column and also used as the index.

    Returns
    -------
    pandas.DataFrame
        Frame in which the cell at (row, col) is ``str(col) + str(row)``,
        e.g. ``'A0'``.
    """
    columns = {}
    for col in cols:
        columns[col] = [f"{col!s}{row!s}" for row in ind]
    return pd.DataFrame(columns, index=ind)

# example DataFrame: columns A, B, C over rows 0..2
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [37]:
# Two frames with the same columns but disjoint row labels (1-2 and 3-4).
df1 = make_df('AB', range(1, 3))
df2 = make_df('AB', range(3, 5))
print(df1)
print(df2)

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4


In [38]:
# Rich display of df1 (same content as the printed form above).
df1

Unnamed: 0,A,B
1,A1,B1
2,A2,B2


In [39]:
# Rich display of df2 (same content as the printed form above).
df2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4


In [40]:
# By default concat stacks the frames row-wise, keeping each frame's own index labels.
pd.concat([df1, df2])

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [41]:
# Two frames with the same row labels (0-1) but different columns.
df3 = make_df('AB', range(0, 2))
df4 = make_df('CD', range(0, 2))
print(df3)
print(df4)

    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1


In [42]:
# axis=1 places the frames side by side, matching rows on the shared index.
pd.concat([df3, df4], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


### Concatenation with joins

In [43]:
# Frames that share only columns B and C; concatenating them keeps every
# column and fills the positions a frame lacks with NaN.
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5)
print(df6)
print(pd.concat([df5, df6]))

    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


In [44]:
# Show the two input frames again, one after the other.
print(df5)
print(df6)

    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
